duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Database schema for historical result storage.
|
|
2
|
+
|
|
3
|
+
Defines the SQLite schema for storing validation results over time,
|
|
4
|
+
enabling trend analysis and historical comparison.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
# Version number of the history database layout; intended for migration logic.
SCHEMA_VERSION: int = 2
|
|
11
|
+
|
|
12
|
+
# SQL script that creates every table and index of the history database.
#
# Every statement uses IF NOT EXISTS, so running the script against a database
# that already contains these objects leaves them untouched.  The script holds
# many statements, so it must be run through an API that accepts a
# multi-statement script (e.g. sqlite3's Connection.executescript).
#
# Conventions visible in the DDL:
#   * Timestamps are TEXT columns; created_at defaults to datetime('now').
#   * Flag columns (passed, is_breaking) are INTEGER.
#     NOTE(review): presumably 0/1 values - confirm against the writer.
#   * Child tables reference runs(run_id) / schema_snapshots(snapshot_id), both
#     of which are declared UNIQUE.  No ON DELETE action is declared, so child
#     rows are never removed implicitly when a parent row is deleted.
#
# Index policy: an index is only declared when no existing index already
# starts with the same columns.  SQLite backs every UNIQUE constraint with an
# automatic index, and a multi-column index serves lookups on its leading
# columns, so the following indexes are intentionally NOT created because they
# would only add write and storage overhead:
#   * runs(source)                     - prefix of idx_runs_source_started
#   * quality_trends(source, date)     - same as UNIQUE(source, date)
#   * baselines(source)                - prefix of UNIQUE(source, column_name, metric)
#   * baselines(source, column_name)   - prefix of UNIQUE(source, column_name, metric)
CREATE_TABLES_SQL = """
-- Validation runs table: stores metadata for each validation execution
CREATE TABLE IF NOT EXISTS runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT UNIQUE NOT NULL,
    source TEXT NOT NULL,
    ruleset_name TEXT,
    started_at TEXT NOT NULL,
    finished_at TEXT,
    quality_score REAL NOT NULL,
    total_checks INTEGER NOT NULL,
    passed_count INTEGER NOT NULL,
    failed_count INTEGER NOT NULL,
    warning_count INTEGER NOT NULL,
    passed INTEGER NOT NULL,
    metadata TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Individual check results table
CREATE TABLE IF NOT EXISTS check_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    check_type TEXT NOT NULL,
    column_name TEXT,
    passed INTEGER NOT NULL,
    severity TEXT NOT NULL,
    actual_value TEXT,
    expected_value TEXT,
    message TEXT,
    details TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (run_id) REFERENCES runs(run_id)
);

-- Sample of failed rows (limited to avoid large storage)
CREATE TABLE IF NOT EXISTS failed_rows_sample (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    check_id INTEGER,
    row_index INTEGER NOT NULL,
    column_name TEXT NOT NULL,
    value TEXT,
    expected TEXT,
    reason TEXT,
    context TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (run_id) REFERENCES runs(run_id),
    FOREIGN KEY (check_id) REFERENCES check_results(id)
);

-- Quality score trends (aggregated daily for efficient queries)
-- UNIQUE(source, date) also provides the index used for (source, date) lookups.
CREATE TABLE IF NOT EXISTS quality_trends (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    date TEXT NOT NULL,
    avg_quality_score REAL NOT NULL,
    min_quality_score REAL NOT NULL,
    max_quality_score REAL NOT NULL,
    run_count INTEGER NOT NULL,
    passed_count INTEGER NOT NULL,
    failed_count INTEGER NOT NULL,
    UNIQUE(source, date)
);

-- Schema metadata table
CREATE TABLE IF NOT EXISTS schema_info (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

-- Indexes for common query patterns
-- (lookups by source alone are served by idx_runs_source_started)
CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
CREATE INDEX IF NOT EXISTS idx_runs_source_started ON runs(source, started_at);
CREATE INDEX IF NOT EXISTS idx_check_results_run_id ON check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_failed_rows_run_id ON failed_rows_sample(run_id);

-- Schema snapshots: Store schema state at points in time
CREATE TABLE IF NOT EXISTS schema_snapshots (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    snapshot_id TEXT UNIQUE NOT NULL,
    captured_at TEXT NOT NULL,
    schema_json TEXT NOT NULL,
    column_count INTEGER NOT NULL,
    row_count INTEGER,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Schema changes: Track schema evolution over time
CREATE TABLE IF NOT EXISTS schema_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    detected_at TEXT NOT NULL,
    previous_snapshot_id TEXT,
    current_snapshot_id TEXT NOT NULL,
    change_type TEXT NOT NULL,
    column_name TEXT,
    previous_value TEXT,
    current_value TEXT,
    is_breaking INTEGER NOT NULL,
    severity TEXT NOT NULL,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (previous_snapshot_id) REFERENCES schema_snapshots(snapshot_id),
    FOREIGN KEY (current_snapshot_id) REFERENCES schema_snapshots(snapshot_id)
);

-- Baselines: Store learned baselines for anomaly detection
-- UNIQUE(source, column_name, metric) also provides the index used for
-- lookups by source and by (source, column_name).
CREATE TABLE IF NOT EXISTS baselines (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    column_name TEXT NOT NULL,
    metric TEXT NOT NULL,
    baseline_value TEXT NOT NULL,
    sample_size INTEGER,
    created_at TEXT NOT NULL,
    updated_at TEXT,
    UNIQUE(source, column_name, metric)
);

-- Additional indexes for new tables
CREATE INDEX IF NOT EXISTS idx_schema_snapshots_source ON schema_snapshots(source);
CREATE INDEX IF NOT EXISTS idx_schema_snapshots_captured_at ON schema_snapshots(captured_at);
CREATE INDEX IF NOT EXISTS idx_schema_changes_source ON schema_changes(source);
CREATE INDEX IF NOT EXISTS idx_schema_changes_detected_at ON schema_changes(detected_at);
"""
|
|
143
|
+
|
|
144
|
+
# Pre-built, parameterized SQL statements keyed by operation name.
#
# All values are bound through positional "?" placeholders; the comment above
# each entry lists the parameters in binding order.
#
# Ordering: every query that sorts by a timestamp (newest first) also sorts by
# "id DESC".  Timestamps are TEXT and may be equal for several rows; without
# the secondary key the order of such rows - and therefore which row "latest"
# or a LIMIT cut-off returns - would be unspecified.  With it, the row with the
# higher id wins among rows with identical timestamps.
QUERIES = {
    # Params: run_id, source, ruleset_name, started_at, finished_at,
    #         quality_score, total_checks, passed_count, failed_count,
    #         warning_count, passed, metadata
    "insert_run": """
        INSERT INTO runs (
            run_id, source, ruleset_name, started_at, finished_at,
            quality_score, total_checks, passed_count, failed_count,
            warning_count, passed, metadata
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    # Params: run_id, check_type, column_name, passed, severity,
    #         actual_value, expected_value, message, details
    "insert_check_result": """
        INSERT INTO check_results (
            run_id, check_type, column_name, passed, severity,
            actual_value, expected_value, message, details
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    # Params: run_id, check_id, row_index, column_name, value, expected,
    #         reason, context
    "insert_failed_row": """
        INSERT INTO failed_rows_sample (
            run_id, check_id, row_index, column_name, value, expected, reason, context
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """,
    # Params: source, limit.  Newest runs first.
    "get_runs_for_source": """
        SELECT * FROM runs
        WHERE source = ?
        ORDER BY started_at DESC, id DESC
        LIMIT ?
    """,
    # Params: source, period start, period end (both bounds inclusive).
    "get_runs_in_period": """
        SELECT * FROM runs
        WHERE source = ?
          AND started_at >= ?
          AND started_at <= ?
        ORDER BY started_at DESC, id DESC
    """,
    # Params: limit.  Newest runs first, across all sources.
    "get_all_runs": """
        SELECT * FROM runs
        ORDER BY started_at DESC, id DESC
        LIMIT ?
    """,
    # Params: source, earliest date (inclusive).  Oldest date first.
    "get_quality_trend": """
        SELECT date, avg_quality_score, min_quality_score, max_quality_score,
               run_count, passed_count, failed_count
        FROM quality_trends
        WHERE source = ?
          AND date >= ?
        ORDER BY date
    """,
    # Params: source.  Returns at most one row.
    "get_latest_run": """
        SELECT * FROM runs
        WHERE source = ?
        ORDER BY started_at DESC, id DESC
        LIMIT 1
    """,
    # Params: run_id.  Rows in insertion (id) order.
    "get_check_results_for_run": """
        SELECT * FROM check_results
        WHERE run_id = ?
        ORDER BY id
    """,
    # Params: run_id.  Rows in insertion (id) order.
    "get_failed_rows_for_run": """
        SELECT * FROM failed_rows_sample
        WHERE run_id = ?
        ORDER BY id
    """,
    # Params: source, date, avg_quality_score, min_quality_score,
    #         max_quality_score, passed_count, failed_count
    # (run_count is not bound: a new row starts at 1.)
    # On an existing (source, date) row the incoming values are folded in:
    # the average is re-weighted by the stored run_count, min/max are
    # widened, and the counters are incremented.  Right-hand sides refer to
    # the stored row's values from before this update.
    "upsert_trend": """
        INSERT INTO quality_trends (
            source, date, avg_quality_score, min_quality_score,
            max_quality_score, run_count, passed_count, failed_count
        ) VALUES (?, ?, ?, ?, ?, 1, ?, ?)
        ON CONFLICT(source, date) DO UPDATE SET
            avg_quality_score = (
                (avg_quality_score * run_count + excluded.avg_quality_score)
                / (run_count + 1)
            ),
            min_quality_score = MIN(min_quality_score, excluded.min_quality_score),
            max_quality_score = MAX(max_quality_score, excluded.max_quality_score),
            run_count = run_count + 1,
            passed_count = passed_count + excluded.passed_count,
            failed_count = failed_count + excluded.failed_count
    """,
    # No params.
    "get_unique_sources": """
        SELECT DISTINCT source FROM runs
        ORDER BY source
    """,
    # Params: cutoff; runs with started_at strictly before it are deleted.
    # NOTE(review): this removes rows from runs only.  The foreign keys on
    # check_results / failed_rows_sample declare no ON DELETE action, so
    # dependent rows are not removed by this statement - confirm the caller
    # cleans them up.
    "delete_old_runs": """
        DELETE FROM runs
        WHERE started_at < ?
    """,
    # Params: run_id.
    "get_run_by_id": """
        SELECT * FROM runs
        WHERE run_id = ?
    """,
    # Schema snapshot queries
    # Params: source, snapshot_id, captured_at, schema_json, column_count,
    #         row_count
    "insert_schema_snapshot": """
        INSERT INTO schema_snapshots (
            source, snapshot_id, captured_at, schema_json, column_count, row_count
        ) VALUES (?, ?, ?, ?, ?, ?)
    """,
    # Params: source, limit.  Newest snapshots first.
    "get_schema_snapshots": """
        SELECT * FROM schema_snapshots
        WHERE source = ?
        ORDER BY captured_at DESC, id DESC
        LIMIT ?
    """,
    # Params: source.  Returns at most one row.
    "get_latest_schema_snapshot": """
        SELECT * FROM schema_snapshots
        WHERE source = ?
        ORDER BY captured_at DESC, id DESC
        LIMIT 1
    """,
    # Params: snapshot_id.
    "get_schema_snapshot_by_id": """
        SELECT * FROM schema_snapshots
        WHERE snapshot_id = ?
    """,
    # Schema change queries
    # Params: source, detected_at, previous_snapshot_id, current_snapshot_id,
    #         change_type, column_name, previous_value, current_value,
    #         is_breaking, severity
    "insert_schema_change": """
        INSERT INTO schema_changes (
            source, detected_at, previous_snapshot_id, current_snapshot_id,
            change_type, column_name, previous_value, current_value,
            is_breaking, severity
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    # Params: source, limit.  Newest changes first.
    "get_schema_changes": """
        SELECT * FROM schema_changes
        WHERE source = ?
        ORDER BY detected_at DESC, id DESC
        LIMIT ?
    """,
    # Params: source, earliest detected_at (inclusive).  Newest changes first.
    "get_schema_changes_since": """
        SELECT * FROM schema_changes
        WHERE source = ?
          AND detected_at >= ?
        ORDER BY detected_at DESC, id DESC
    """,
    # Baseline queries
    # Params: source, column_name, metric, baseline_value, sample_size,
    #         created_at, updated_at
    # On an existing (source, column_name, metric) row only baseline_value,
    # sample_size and updated_at are overwritten; created_at is kept.
    "upsert_baseline": """
        INSERT INTO baselines (
            source, column_name, metric, baseline_value, sample_size, created_at, updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source, column_name, metric) DO UPDATE SET
            baseline_value = excluded.baseline_value,
            sample_size = excluded.sample_size,
            updated_at = excluded.updated_at
    """,
    # Params: source, column_name, metric.
    "get_baseline": """
        SELECT * FROM baselines
        WHERE source = ?
          AND column_name = ?
          AND metric = ?
    """,
    # Params: source.
    "get_baselines_for_source": """
        SELECT * FROM baselines
        WHERE source = ?
        ORDER BY column_name, metric
    """,
    # Params: source.
    "delete_baselines_for_source": """
        DELETE FROM baselines
        WHERE source = ?
    """,
}
|