duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71):
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,301 @@
1
+ """Database schema for historical result storage.
2
+
3
+ Defines the SQLite schema for storing validation results over time,
4
+ enabling trend analysis and historical comparison.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
# Schema version for migrations.
# NOTE(review): presumably compared with a value kept in the `schema_info`
# table by the storage layer -- confirm against the caller.
SCHEMA_VERSION = 2

# SQL script that creates every table and index used by the history store.
#
# * Every statement uses IF NOT EXISTS, so the script can be executed again
#   on an already-initialised database without raising.
# * The foreign keys that point at `runs` (and at `check_results`) declare
#   ON DELETE CASCADE, so removing a run also removes its check results and
#   failed-row samples instead of orphaning them or being rejected with a
#   constraint error.  SQLite enforces foreign keys -- and therefore the
#   cascade -- only on connections that ran `PRAGMA foreign_keys = ON`, and
#   because of IF NOT EXISTS the cascade applies only to tables created by
#   this version of the script, not to pre-existing ones.
# * The script contains several statements, so it must be run with an API
#   that accepts multi-statement SQL (e.g. `sqlite3.Connection.executescript`).
CREATE_TABLES_SQL = """
-- Validation runs table: stores metadata for each validation execution
CREATE TABLE IF NOT EXISTS runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT UNIQUE NOT NULL,
    source TEXT NOT NULL,
    ruleset_name TEXT,
    started_at TEXT NOT NULL,
    finished_at TEXT,
    quality_score REAL NOT NULL,
    total_checks INTEGER NOT NULL,
    passed_count INTEGER NOT NULL,
    failed_count INTEGER NOT NULL,
    warning_count INTEGER NOT NULL,
    passed INTEGER NOT NULL,
    metadata TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Individual check results table
CREATE TABLE IF NOT EXISTS check_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    check_type TEXT NOT NULL,
    column_name TEXT,
    passed INTEGER NOT NULL,
    severity TEXT NOT NULL,
    actual_value TEXT,
    expected_value TEXT,
    message TEXT,
    details TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (run_id) REFERENCES runs(run_id) ON DELETE CASCADE
);

-- Sample of failed rows (limited to avoid large storage)
CREATE TABLE IF NOT EXISTS failed_rows_sample (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    check_id INTEGER,
    row_index INTEGER NOT NULL,
    column_name TEXT NOT NULL,
    value TEXT,
    expected TEXT,
    reason TEXT,
    context TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (run_id) REFERENCES runs(run_id) ON DELETE CASCADE,
    FOREIGN KEY (check_id) REFERENCES check_results(id) ON DELETE CASCADE
);

-- Quality score trends (aggregated daily for efficient queries)
CREATE TABLE IF NOT EXISTS quality_trends (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    date TEXT NOT NULL,
    avg_quality_score REAL NOT NULL,
    min_quality_score REAL NOT NULL,
    max_quality_score REAL NOT NULL,
    run_count INTEGER NOT NULL,
    passed_count INTEGER NOT NULL,
    failed_count INTEGER NOT NULL,
    UNIQUE(source, date)
);

-- Schema metadata table
CREATE TABLE IF NOT EXISTS schema_info (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_runs_source ON runs(source);
CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
CREATE INDEX IF NOT EXISTS idx_runs_source_started ON runs(source, started_at);
CREATE INDEX IF NOT EXISTS idx_check_results_run_id ON check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_failed_rows_run_id ON failed_rows_sample(run_id);
CREATE INDEX IF NOT EXISTS idx_quality_trends_source_date ON quality_trends(source, date);

-- Schema snapshots: Store schema state at points in time
CREATE TABLE IF NOT EXISTS schema_snapshots (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    snapshot_id TEXT UNIQUE NOT NULL,
    captured_at TEXT NOT NULL,
    schema_json TEXT NOT NULL,
    column_count INTEGER NOT NULL,
    row_count INTEGER,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Schema changes: Track schema evolution over time
CREATE TABLE IF NOT EXISTS schema_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    detected_at TEXT NOT NULL,
    previous_snapshot_id TEXT,
    current_snapshot_id TEXT NOT NULL,
    change_type TEXT NOT NULL,
    column_name TEXT,
    previous_value TEXT,
    current_value TEXT,
    is_breaking INTEGER NOT NULL,
    severity TEXT NOT NULL,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (previous_snapshot_id) REFERENCES schema_snapshots(snapshot_id),
    FOREIGN KEY (current_snapshot_id) REFERENCES schema_snapshots(snapshot_id)
);

-- Baselines: Store learned baselines for anomaly detection
CREATE TABLE IF NOT EXISTS baselines (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    column_name TEXT NOT NULL,
    metric TEXT NOT NULL,
    baseline_value TEXT NOT NULL,
    sample_size INTEGER,
    created_at TEXT NOT NULL,
    updated_at TEXT,
    UNIQUE(source, column_name, metric)
);

-- Additional indexes for new tables
CREATE INDEX IF NOT EXISTS idx_schema_snapshots_source ON schema_snapshots(source);
CREATE INDEX IF NOT EXISTS idx_schema_snapshots_captured_at ON schema_snapshots(captured_at);
CREATE INDEX IF NOT EXISTS idx_schema_changes_source ON schema_changes(source);
CREATE INDEX IF NOT EXISTS idx_schema_changes_detected_at ON schema_changes(detected_at);
CREATE INDEX IF NOT EXISTS idx_baselines_source ON baselines(source);
CREATE INDEX IF NOT EXISTS idx_baselines_source_column ON baselines(source, column_name);
"""
143
+
# Pre-built queries for common operations.
#
# Maps a query name to a SQL string.  All values are bound through
# positional `?` placeholders; the order of the parameters is the order in
# which the placeholders appear in each statement (for INSERTs, the order of
# the column list).
#
# Queries that return "newest first" order by the timestamp column and then
# by `id DESC`, so rows that share the same timestamp are still returned in a
# deterministic order (most recently inserted first).
QUERIES = {
    "insert_run": """
        INSERT INTO runs (
            run_id, source, ruleset_name, started_at, finished_at,
            quality_score, total_checks, passed_count, failed_count,
            warning_count, passed, metadata
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    "insert_check_result": """
        INSERT INTO check_results (
            run_id, check_type, column_name, passed, severity,
            actual_value, expected_value, message, details
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    "insert_failed_row": """
        INSERT INTO failed_rows_sample (
            run_id, check_id, row_index, column_name, value, expected, reason, context
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """,
    # Parameters: source, limit.
    "get_runs_for_source": """
        SELECT * FROM runs
        WHERE source = ?
        ORDER BY started_at DESC, id DESC
        LIMIT ?
    """,
    # Parameters: source, period start (inclusive), period end (inclusive).
    "get_runs_in_period": """
        SELECT * FROM runs
        WHERE source = ?
        AND started_at >= ?
        AND started_at <= ?
        ORDER BY started_at DESC, id DESC
    """,
    # Parameters: limit.
    "get_all_runs": """
        SELECT * FROM runs
        ORDER BY started_at DESC, id DESC
        LIMIT ?
    """,
    # Parameters: source, earliest date (inclusive).  Oldest row first.
    "get_quality_trend": """
        SELECT date, avg_quality_score, min_quality_score, max_quality_score,
            run_count, passed_count, failed_count
        FROM quality_trends
        WHERE source = ?
        AND date >= ?
        ORDER BY date
    """,
    "get_latest_run": """
        SELECT * FROM runs
        WHERE source = ?
        ORDER BY started_at DESC, id DESC
        LIMIT 1
    """,
    "get_check_results_for_run": """
        SELECT * FROM check_results
        WHERE run_id = ?
        ORDER BY id
    """,
    "get_failed_rows_for_run": """
        SELECT * FROM failed_rows_sample
        WHERE run_id = ?
        ORDER BY id
    """,
    # Parameters: source, date, avg score, min score, max score,
    # passed_count, failed_count (run_count is the literal 1 for a new row).
    # On a (source, date) conflict the new score is folded into the running
    # average; unqualified column names in the SET clause refer to the
    # existing row and `excluded.` to the values that were being inserted.
    "upsert_trend": """
        INSERT INTO quality_trends (
            source, date, avg_quality_score, min_quality_score,
            max_quality_score, run_count, passed_count, failed_count
        ) VALUES (?, ?, ?, ?, ?, 1, ?, ?)
        ON CONFLICT(source, date) DO UPDATE SET
            avg_quality_score = (
                (avg_quality_score * run_count + excluded.avg_quality_score)
                / (run_count + 1)
            ),
            min_quality_score = MIN(min_quality_score, excluded.min_quality_score),
            max_quality_score = MAX(max_quality_score, excluded.max_quality_score),
            run_count = run_count + 1,
            passed_count = passed_count + excluded.passed_count,
            failed_count = failed_count + excluded.failed_count
    """,
    "get_unique_sources": """
        SELECT DISTINCT source FROM runs
        ORDER BY source
    """,
    # Parameters: cutoff timestamp (exclusive).
    # NOTE(review): this statement touches only `runs`; rows in
    # check_results / failed_rows_sample for the deleted runs must be removed
    # by the caller or by the schema's foreign-key actions -- verify.
    "delete_old_runs": """
        DELETE FROM runs
        WHERE started_at < ?
    """,
    "get_run_by_id": """
        SELECT * FROM runs
        WHERE run_id = ?
    """,
    # Schema snapshot queries
    "insert_schema_snapshot": """
        INSERT INTO schema_snapshots (
            source, snapshot_id, captured_at, schema_json, column_count, row_count
        ) VALUES (?, ?, ?, ?, ?, ?)
    """,
    # Parameters: source, limit.
    "get_schema_snapshots": """
        SELECT * FROM schema_snapshots
        WHERE source = ?
        ORDER BY captured_at DESC, id DESC
        LIMIT ?
    """,
    "get_latest_schema_snapshot": """
        SELECT * FROM schema_snapshots
        WHERE source = ?
        ORDER BY captured_at DESC, id DESC
        LIMIT 1
    """,
    "get_schema_snapshot_by_id": """
        SELECT * FROM schema_snapshots
        WHERE snapshot_id = ?
    """,
    # Schema change queries
    "insert_schema_change": """
        INSERT INTO schema_changes (
            source, detected_at, previous_snapshot_id, current_snapshot_id,
            change_type, column_name, previous_value, current_value,
            is_breaking, severity
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    # Parameters: source, limit.
    "get_schema_changes": """
        SELECT * FROM schema_changes
        WHERE source = ?
        ORDER BY detected_at DESC, id DESC
        LIMIT ?
    """,
    # Parameters: source, earliest detection time (inclusive).
    "get_schema_changes_since": """
        SELECT * FROM schema_changes
        WHERE source = ?
        AND detected_at >= ?
        ORDER BY detected_at DESC, id DESC
    """,
    # Baseline queries
    # On a (source, column_name, metric) conflict the stored value, sample
    # size and updated_at are overwritten; created_at keeps its first value.
    "upsert_baseline": """
        INSERT INTO baselines (
            source, column_name, metric, baseline_value, sample_size, created_at, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source, column_name, metric) DO UPDATE SET
            baseline_value = excluded.baseline_value,
            sample_size = excluded.sample_size,
            updated_at = excluded.updated_at
    """,
    "get_baseline": """
        SELECT * FROM baselines
        WHERE source = ?
        AND column_name = ?
        AND metric = ?
    """,
    "get_baselines_for_source": """
        SELECT * FROM baselines
        WHERE source = ?
        ORDER BY column_name, metric
    """,
    "delete_baselines_for_source": """
        DELETE FROM baselines
        WHERE source = ?
    """,
}