duckguard 2.0.0-py3-none-any.whl → 2.3.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two published versions.
Files changed (71)
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/history/storage.py
@@ -0,0 +1,479 @@
+ """Historical result storage implementation.
+
+ Provides persistent storage for validation results in SQLite,
+ enabling trend analysis and historical comparison.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import sqlite3
+ import uuid
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from duckguard.history.schema import CREATE_TABLES_SQL, QUERIES, SCHEMA_VERSION
+
+ if TYPE_CHECKING:
+     from duckguard.rules.executor import ExecutionResult
+
+
+ @dataclass
+ class StoredRun:
+     """Represents a stored validation run.
+
+     Attributes:
+         run_id: Unique identifier for this run
+         source: Data source that was validated
+         ruleset_name: Name of the ruleset used (if any)
+         started_at: When validation started
+         finished_at: When validation finished
+         quality_score: Overall quality score (0-100)
+         total_checks: Total number of checks executed
+         passed_count: Number of checks that passed
+         failed_count: Number of checks that failed
+         warning_count: Number of warnings
+         passed: Whether the validation passed overall
+         metadata: Additional metadata (e.g., Airflow context)
+     """
+
+     run_id: str
+     source: str
+     ruleset_name: str | None
+     started_at: datetime
+     finished_at: datetime | None
+     quality_score: float
+     total_checks: int
+     passed_count: int
+     failed_count: int
+     warning_count: int
+     passed: bool
+     metadata: dict[str, Any] | None = None
+
+
+ @dataclass
+ class StoredCheckResult:
+     """Represents a stored check result.
+
+     Attributes:
+         id: Database ID
+         run_id: Associated run ID
+         check_type: Type of check (e.g., NOT_NULL, UNIQUE)
+         column_name: Column that was checked (None for table-level)
+         passed: Whether the check passed
+         severity: Check severity (error, warning, info)
+         actual_value: The actual value found
+         expected_value: The expected value
+         message: Human-readable result message
+         details: Additional details
+     """
+
+     id: int
+     run_id: str
+     check_type: str
+     column_name: str | None
+     passed: bool
+     severity: str
+     actual_value: str | None
+     expected_value: str | None
+     message: str | None
+     details: dict[str, Any] | None = None
+
+
+ @dataclass
+ class TrendDataPoint:
+     """A single data point in a quality trend.
+
+     Attributes:
+         date: The date of this data point
+         avg_score: Average quality score for the day
+         min_score: Minimum quality score for the day
+         max_score: Maximum quality score for the day
+         run_count: Number of runs on this day
+         passed_count: Number of passing runs
+         failed_count: Number of failing runs
+     """
+
+     date: str
+     avg_score: float
+     min_score: float
+     max_score: float
+     run_count: int
+     passed_count: int
+     failed_count: int
+
+
+ class HistoryStorage:
+     """Storage for historical validation results.
+
+     Stores validation results in a SQLite database for trend analysis
+     and historical comparison.
+
+     Usage:
+         from duckguard.history import HistoryStorage
+         from duckguard import connect, load_rules, execute_rules
+
+         # Run validation
+         result = execute_rules(load_rules("rules.yaml"), connect("data.csv"))
+
+         # Store result
+         storage = HistoryStorage()
+         run_id = storage.store(result)
+
+         # Query history
+         runs = storage.get_runs("data.csv", limit=10)
+         trend = storage.get_trend("data.csv", days=30)
+
+     Attributes:
+         db_path: Path to the SQLite database file
+     """
+
+     DEFAULT_DB_PATH = Path.home() / ".duckguard" / "history.db"
+
+     def __init__(self, db_path: str | Path | None = None):
+         """Initialize history storage.
+
+         Args:
+             db_path: Path to SQLite database. Defaults to ~/.duckguard/history.db
+         """
+         self.db_path = Path(db_path) if db_path else self.DEFAULT_DB_PATH
+         self.db_path.parent.mkdir(parents=True, exist_ok=True)
+
+         self._conn: sqlite3.Connection | None = None
+         self._init_db()
+
+     def _init_db(self) -> None:
+         """Initialize database schema."""
+         conn = self._get_connection()
+         conn.executescript(CREATE_TABLES_SQL)
+
+         # Set schema version
+         conn.execute(
+             "INSERT OR REPLACE INTO schema_info (key, value) VALUES (?, ?)",
+             ("schema_version", str(SCHEMA_VERSION)),
+         )
+         conn.commit()
+
+     def _get_connection(self) -> sqlite3.Connection:
+         """Get database connection."""
+         if self._conn is None:
+             self._conn = sqlite3.connect(
+                 str(self.db_path),
+                 detect_types=sqlite3.PARSE_DECLTYPES,
+             )
+             self._conn.row_factory = sqlite3.Row
+         return self._conn
+
+     def store(
+         self,
+         result: ExecutionResult,
+         *,
+         metadata: dict[str, Any] | None = None,
+     ) -> str:
+         """Store an execution result.
+
+         Args:
+             result: ExecutionResult to store
+             metadata: Additional metadata (e.g., Airflow context, environment)
+
+         Returns:
+             The generated run_id
+         """
+         conn = self._get_connection()
+         run_id = str(uuid.uuid4())
+
+         # Insert run record
+         conn.execute(
+             QUERIES["insert_run"],
+             (
+                 run_id,
+                 result.source,
+                 result.ruleset.name if result.ruleset else None,
+                 result.started_at.isoformat(),
+                 result.finished_at.isoformat() if result.finished_at else None,
+                 result.quality_score,
+                 result.total_checks,
+                 result.passed_count,
+                 result.failed_count,
+                 result.warning_count,
+                 1 if result.passed else 0,
+                 json.dumps(metadata) if metadata else None,
+             ),
+         )
+
+         # Insert check results
+         for check_result in result.results:
+             cursor = conn.execute(
+                 QUERIES["insert_check_result"],
+                 (
+                     run_id,
+                     check_result.check.type.value,
+                     check_result.column,
+                     1 if check_result.passed else 0,
+                     check_result.severity.value,
+                     str(check_result.actual_value) if check_result.actual_value is not None else None,
+                     str(check_result.expected_value) if check_result.expected_value is not None else None,
+                     check_result.message,
+                     json.dumps(check_result.details) if check_result.details else None,
+                 ),
+             )
+             check_id = cursor.lastrowid
+
+             # Insert failed row samples if available (limited to 10)
+             if check_result.details and check_result.details.get("failed_rows"):
+                 failed_rows = check_result.details["failed_rows"][:10]
+                 for i, row_data in enumerate(failed_rows):
+                     if isinstance(row_data, dict):
+                         conn.execute(
+                             QUERIES["insert_failed_row"],
+                             (
+                                 run_id,
+                                 check_id,
+                                 row_data.get("row_index", i),
+                                 check_result.column or "",
+                                 str(row_data.get("value")),
+                                 str(check_result.expected_value),
+                                 row_data.get("reason", ""),
+                                 json.dumps(row_data.get("context")) if row_data.get("context") else None,
+                             ),
+                         )
+                     elif isinstance(row_data, int):
+                         # Just row index
+                         conn.execute(
+                             QUERIES["insert_failed_row"],
+                             (
+                                 run_id,
+                                 check_id,
+                                 row_data,
+                                 check_result.column or "",
+                                 None,
+                                 str(check_result.expected_value),
+                                 "",
+                                 None,
+                             ),
+                         )
+
+         # Update quality trends
+         self._update_trends(result)
+
+         conn.commit()
+         return run_id
+
+     def get_runs(
+         self,
+         source: str | None = None,
+         *,
+         limit: int = 100,
+         start_date: datetime | None = None,
+         end_date: datetime | None = None,
+     ) -> list[StoredRun]:
+         """Get validation runs.
+
+         Args:
+             source: Filter by data source path. If None, returns all sources.
+             limit: Maximum runs to return
+             start_date: Filter by start date
+             end_date: Filter by end date
+
+         Returns:
+             List of StoredRun objects, most recent first
+         """
+         conn = self._get_connection()
+
+         if source is None:
+             cursor = conn.execute(QUERIES["get_all_runs"], (limit,))
+         elif start_date and end_date:
+             cursor = conn.execute(
+                 QUERIES["get_runs_in_period"],
+                 (source, start_date.isoformat(), end_date.isoformat()),
+             )
+         else:
+             cursor = conn.execute(
+                 QUERIES["get_runs_for_source"],
+                 (source, limit),
+             )
+
+         return [self._row_to_stored_run(row) for row in cursor.fetchall()]
+
+     def get_run(self, run_id: str) -> StoredRun | None:
+         """Get a specific run by ID.
+
+         Args:
+             run_id: The run ID to retrieve
+
+         Returns:
+             StoredRun or None if not found
+         """
+         conn = self._get_connection()
+         cursor = conn.execute(QUERIES["get_run_by_id"], (run_id,))
+         row = cursor.fetchone()
+         return self._row_to_stored_run(row) if row else None
+
+     def get_latest_run(self, source: str) -> StoredRun | None:
+         """Get the most recent run for a source.
+
+         Args:
+             source: Data source path
+
+         Returns:
+             StoredRun or None if no runs exist
+         """
+         conn = self._get_connection()
+         cursor = conn.execute(QUERIES["get_latest_run"], (source,))
+         row = cursor.fetchone()
+         return self._row_to_stored_run(row) if row else None
+
+     def get_check_results(self, run_id: str) -> list[StoredCheckResult]:
+         """Get check results for a specific run.
+
+         Args:
+             run_id: The run ID
+
+         Returns:
+             List of StoredCheckResult objects
+         """
+         conn = self._get_connection()
+         cursor = conn.execute(QUERIES["get_check_results_for_run"], (run_id,))
+
+         results = []
+         for row in cursor.fetchall():
+             results.append(
+                 StoredCheckResult(
+                     id=row["id"],
+                     run_id=row["run_id"],
+                     check_type=row["check_type"],
+                     column_name=row["column_name"],
+                     passed=bool(row["passed"]),
+                     severity=row["severity"],
+                     actual_value=row["actual_value"],
+                     expected_value=row["expected_value"],
+                     message=row["message"],
+                     details=json.loads(row["details"]) if row["details"] else None,
+                 )
+             )
+         return results
+
+     def get_trend(
+         self,
+         source: str,
+         days: int = 30,
+     ) -> list[TrendDataPoint]:
+         """Get quality score trend for a source.
+
+         Args:
+             source: Data source path
+             days: Number of days to look back
+
+         Returns:
+             List of TrendDataPoint objects, ordered by date
+         """
+         conn = self._get_connection()
+         from datetime import timedelta
+
+         start_date = datetime.now() - timedelta(days=days)
+
+         cursor = conn.execute(
+             QUERIES["get_quality_trend"],
+             (source, start_date.strftime("%Y-%m-%d")),
+         )
+
+         return [
+             TrendDataPoint(
+                 date=row["date"],
+                 avg_score=row["avg_quality_score"],
+                 min_score=row["min_quality_score"],
+                 max_score=row["max_quality_score"],
+                 run_count=row["run_count"],
+                 passed_count=row["passed_count"],
+                 failed_count=row["failed_count"],
+             )
+             for row in cursor.fetchall()
+         ]
+
+     def get_sources(self) -> list[str]:
+         """Get all unique sources in the history.
+
+         Returns:
+             List of source paths
+         """
+         conn = self._get_connection()
+         cursor = conn.execute(QUERIES["get_unique_sources"])
+         return [row["source"] for row in cursor.fetchall()]
+
+     def cleanup(self, days: int = 90) -> int:
+         """Delete runs older than specified days.
+
+         Args:
+             days: Delete runs older than this many days
+
+         Returns:
+             Number of runs deleted
+         """
+         conn = self._get_connection()
+         from datetime import timedelta
+
+         cutoff = datetime.now() - timedelta(days=days)
+
+         # Get count before deletion
+         cursor = conn.execute(
+             "SELECT COUNT(*) FROM runs WHERE started_at < ?",
+             (cutoff.isoformat(),),
+         )
+         count = cursor.fetchone()[0]
+
+         # Delete old records (cascading will handle related tables)
+         conn.execute(QUERIES["delete_old_runs"], (cutoff.isoformat(),))
+         conn.commit()
+
+         return count
+
+     def _update_trends(self, result: ExecutionResult) -> None:
+         """Update quality trend aggregation."""
+         conn = self._get_connection()
+         today = datetime.now().strftime("%Y-%m-%d")
+
+         conn.execute(
+             QUERIES["upsert_trend"],
+             (
+                 result.source,
+                 today,
+                 result.quality_score,
+                 result.quality_score,
+                 result.quality_score,
+                 1 if result.passed else 0,
+                 0 if result.passed else 1,
+             ),
+         )
+
+     def _row_to_stored_run(self, row: sqlite3.Row) -> StoredRun:
+         """Convert database row to StoredRun."""
+         return StoredRun(
+             run_id=row["run_id"],
+             source=row["source"],
+             ruleset_name=row["ruleset_name"],
+             started_at=datetime.fromisoformat(row["started_at"]),
+             finished_at=datetime.fromisoformat(row["finished_at"]) if row["finished_at"] else None,
+             quality_score=row["quality_score"],
+             total_checks=row["total_checks"],
+             passed_count=row["passed_count"],
+             failed_count=row["failed_count"],
+             warning_count=row["warning_count"],
+             passed=bool(row["passed"]),
+             metadata=json.loads(row["metadata"]) if row["metadata"] else None,
+         )
+
+     def close(self) -> None:
+         """Close database connection."""
+         if self._conn:
+             self._conn.close()
+             self._conn = None
+
+     def __enter__(self) -> HistoryStorage:
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         """Context manager exit."""
+         self.close()
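
For orientation, here is a minimal end-to-end sketch of the new history API, assembled from the HistoryStorage docstring and the method signatures above. The rules.yaml and data.csv paths are placeholders, the "environment" metadata key is illustrative, and connect/load_rules/execute_rules are assumed to be importable from the package root as the docstring shows:

    from duckguard import connect, load_rules, execute_rules
    from duckguard.history import HistoryStorage

    # Run a validation and persist the result. HistoryStorage is a
    # context manager, so the SQLite connection is closed on exit.
    with HistoryStorage() as storage:  # defaults to ~/.duckguard/history.db
        result = execute_rules(load_rules("rules.yaml"), connect("data.csv"))
        run_id = storage.store(result, metadata={"environment": "ci"})  # hypothetical metadata

        # Inspect the stored run and its per-check results
        run = storage.get_run(run_id)
        checks = storage.get_check_results(run_id)

        # 30-day quality trend and 90-day retention cleanup
        for point in storage.get_trend("data.csv", days=30):
            print(point.date, point.avg_score, point.run_count)
        deleted = storage.cleanup(days=90)

Note that store() commits once per run, and cleanup() relies on cascading deletes to clear related check results, as the module's own comments indicate.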