duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/result.py
CHANGED
|
@@ -17,6 +17,30 @@ class CheckStatus(Enum):
|
|
|
17
17
|
ERROR = "error"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
+
@dataclass
class FailedRow:
    """One row-level validation failure.

    Attributes:
        row_index: 1-based row number in the source data.
        column: Name of the column whose value failed validation.
        value: The value that failed.
        expected: Description of what was expected (e.g., "not null", "between 1-100").
        reason: Human-readable explanation of the failure; empty by default.
        context: Optional additional data from the row, keyed by name.
    """

    row_index: int
    column: str
    value: Any
    expected: str
    reason: str = ""
    context: dict[str, Any] = field(default_factory=dict)

    def __repr__(self) -> str:
        # Compact form: expected, reason and context are not included.
        parts = (
            f"row={self.row_index}",
            f"column='{self.column}'",
            f"value={self.value!r}",
        )
        return "FailedRow(" + ", ".join(parts) + ")"
|
|
42
|
+
|
|
43
|
+
|
|
20
44
|
@dataclass
|
|
21
45
|
class CheckResult:
|
|
22
46
|
"""Result of a single validation check."""
|
|
@@ -46,13 +70,27 @@ class CheckResult:
|
|
|
46
70
|
|
|
47
71
|
@dataclass
|
|
48
72
|
class ValidationResult:
|
|
49
|
-
"""Result of a validation operation that can be used in assertions.
|
|
73
|
+
"""Result of a validation operation that can be used in assertions.
|
|
74
|
+
|
|
75
|
+
Enhanced with row-level error capture for debugging failed checks.
|
|
76
|
+
|
|
77
|
+
Attributes:
|
|
78
|
+
passed: Whether the validation passed
|
|
79
|
+
actual_value: The actual value found (e.g., count of failures)
|
|
80
|
+
expected_value: What was expected
|
|
81
|
+
message: Human-readable summary
|
|
82
|
+
details: Additional metadata
|
|
83
|
+
failed_rows: List of individual rows that failed validation
|
|
84
|
+
total_failures: Total number of failing rows (failed_rows may hold only a sample of them)
|
|
85
|
+
"""
|
|
50
86
|
|
|
51
87
|
passed: bool
|
|
52
88
|
actual_value: Any
|
|
53
89
|
expected_value: Any | None = None
|
|
54
90
|
message: str = ""
|
|
55
91
|
details: dict[str, Any] = field(default_factory=dict)
|
|
92
|
+
failed_rows: list[FailedRow] = field(default_factory=list)
|
|
93
|
+
total_failures: int = 0
|
|
56
94
|
|
|
57
95
|
def __bool__(self) -> bool:
|
|
58
96
|
"""Allow using ValidationResult in boolean context for assertions."""
|
|
@@ -60,8 +98,61 @@ class ValidationResult:
|
|
|
60
98
|
|
|
61
99
|
def __repr__(self) -> str:
|
|
62
100
|
status = "PASSED" if self.passed else "FAILED"
|
|
101
|
+
if self.failed_rows:
|
|
102
|
+
return f"ValidationResult({status}, actual={self.actual_value}, failed_rows={len(self.failed_rows)})"
|
|
63
103
|
return f"ValidationResult({status}, actual={self.actual_value})"
|
|
64
104
|
|
|
105
|
+
def get_failed_values(self) -> list[Any]:
    """Return the ``value`` of every captured failing row, in stored order."""
    values: list[Any] = []
    for failed in self.failed_rows:
        values.append(failed.value)
    return values
|
|
108
|
+
|
|
109
|
+
def get_failed_row_indices(self) -> list[int]:
    """Return the ``row_index`` of every captured failing row, in stored order."""
    indices: list[int] = []
    for failed in self.failed_rows:
        indices.append(failed.row_index)
    return indices
|
|
112
|
+
|
|
113
|
+
def to_dataframe(self):
    """Convert the captured failed rows to a pandas DataFrame.

    Every record carries the core columns ``row_index``, ``column``,
    ``value``, ``expected`` and ``reason``, followed by one column per key
    of the row's ``context`` mapping. A context key that collides with a
    core column name is ignored so the core columns always hold the
    failed-row fields themselves.

    Returns:
        pandas.DataFrame with one record per captured failed row, or an
        empty frame with the five core columns when nothing was captured.

    Raises:
        ImportError: If pandas is not installed.
    """
    # Guard only the import: an error raised while building the frame must
    # not be reported as a missing dependency.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for to_dataframe(). Install with: pip install pandas"
        ) from exc

    core_columns = ["row_index", "column", "value", "expected", "reason"]
    if not self.failed_rows:
        return pd.DataFrame(columns=core_columns)

    records = []
    for row in self.failed_rows:
        record = {
            "row_index": row.row_index,
            "column": row.column,
            "value": row.value,
            "expected": row.expected,
            "reason": row.reason,
        }
        # setdefault keeps the core fields when the context uses the same key.
        for key, extra in row.context.items():
            record.setdefault(key, extra)
        records.append(record)
    return pd.DataFrame(records)
|
|
141
|
+
|
|
142
|
+
def summary(self) -> str:
    """Get a summary of the validation result with sample failures.

    The summary starts with ``message``. When failing rows were captured,
    up to five of them are listed, followed by a line stating how many
    failures are not listed.

    Returns:
        The summary as a newline-joined string.
    """
    lines = [self.message]

    if self.failed_rows:
        shown = self.failed_rows[:5]
        # total_failures defaults to 0, so never report a total smaller
        # than the number of rows actually captured.
        total = max(self.total_failures, len(self.failed_rows))
        lines.append(f"\nSample of {len(self.failed_rows)} failing rows (total: {total}):")
        for row in shown:
            lines.append(f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}")

        # Count against the rows printed above, not a fixed 5: fewer than
        # five rows may have been captured.
        remaining = total - len(shown)
        if remaining > 0:
            lines.append(f" ... and {remaining} more failures")

    return "\n".join(lines)
|
|
155
|
+
|
|
65
156
|
|
|
66
157
|
@dataclass
|
|
67
158
|
class ProfileResult:
|
|
@@ -117,3 +208,239 @@ class ScanResult:
|
|
|
117
208
|
if self.checks_run == 0:
|
|
118
209
|
return 100.0
|
|
119
210
|
return (self.checks_passed / self.checks_run) * 100
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# =========================================================================
|
|
214
|
+
# Distribution Drift Results
|
|
215
|
+
# =========================================================================
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
@dataclass
class DriftResult:
    """Outcome of a distribution drift test between two columns.

    Attributes:
        is_drifted: Whether significant drift was detected.
        p_value: Statistical p-value from the test.
        statistic: Test statistic value.
        threshold: P-value threshold used for detection.
        method: Statistical method used (e.g., "ks_test").
        message: Human-readable summary.
        details: Additional metadata.
    """

    is_drifted: bool
    p_value: float
    statistic: float
    threshold: float = 0.05
    method: str = "ks_test"
    message: str = ""
    details: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Truthy when no drift was detected (data is stable)."""
        return not self.is_drifted

    def __repr__(self) -> str:
        label = "STABLE"
        if self.is_drifted:
            label = "DRIFT DETECTED"
        return f"DriftResult({label}, p_value={self.p_value:.4f}, threshold={self.threshold})"

    def summary(self) -> str:
        """Return a one-line, human-readable description of the outcome."""
        headline = "DRIFT DETECTED" if self.is_drifted else "No significant drift"
        figures = ", ".join(
            [
                f"p-value: {self.p_value:.4f}",
                f"threshold: {self.threshold}",
                f"method: {self.method}",
            ]
        )
        return f"{headline} ({figures})"
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# =========================================================================
|
|
255
|
+
# Reconciliation Results
|
|
256
|
+
# =========================================================================
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@dataclass
class ReconciliationMismatch:
    """A single row-level mismatch found during reconciliation.

    Attributes:
        key_values: Dictionary of key column values that identify the row.
        column: Column name where the mismatch occurred.
        source_value: Value in the source dataset.
        target_value: Value in the target dataset.
        mismatch_type: Type of mismatch ("value_diff", "missing_in_target", "extra_in_target").
    """

    key_values: dict[str, Any]
    column: str
    source_value: Any = None
    target_value: Any = None
    mismatch_type: str = "value_diff"

    def __repr__(self) -> str:
        # Render the identifying keys as "name=value" pairs.
        pairs = []
        for name, val in self.key_values.items():
            pairs.append(f"{name}={val}")
        ident = ", ".join(pairs)
        comparison = f"{self.column}: {self.source_value} vs {self.target_value}"
        return f"ReconciliationMismatch({ident}, {comparison})"
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@dataclass
class ReconciliationResult:
    """Result of reconciling two datasets.

    Attributes:
        passed: Whether reconciliation passed (datasets match).
        source_row_count: Number of rows in source dataset.
        target_row_count: Number of rows in target dataset.
        missing_in_target: Rows in source but not in target.
        extra_in_target: Rows in target but not in source.
        value_mismatches: Count of value mismatches by column.
        match_percentage: Percentage of rows that match.
        key_columns: Columns used as keys for matching.
        compared_columns: Columns compared for values.
        mismatches: Sample of actual mismatches.
        details: Additional metadata.
    """

    passed: bool
    source_row_count: int
    target_row_count: int
    missing_in_target: int = 0
    extra_in_target: int = 0
    value_mismatches: dict[str, int] = field(default_factory=dict)
    match_percentage: float = 100.0
    key_columns: list[str] = field(default_factory=list)
    compared_columns: list[str] = field(default_factory=list)
    mismatches: list[ReconciliationMismatch] = field(default_factory=list)
    details: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Allow using ReconciliationResult in boolean context."""
        return self.passed

    def __repr__(self) -> str:
        status = "MATCHED" if self.passed else "MISMATCHED"
        return (
            f"ReconciliationResult({status}, match={self.match_percentage:.1f}%, "
            f"missing={self.missing_in_target}, extra={self.extra_in_target})"
        )

    @property
    def total_mismatches(self) -> int:
        """Missing rows plus extra rows plus all per-column value mismatches."""
        return self.missing_in_target + self.extra_in_target + sum(self.value_mismatches.values())

    def summary(self) -> str:
        """Get a human-readable summary.

        Lists the row counts, any missing/extra row counts, the per-column
        mismatch counts and up to five of the stored sample mismatches.

        Returns:
            The summary as a newline-joined string.
        """
        lines = [
            f"Reconciliation: {'PASSED' if self.passed else 'FAILED'} ({self.match_percentage:.1f}% match)",
            f"Source rows: {self.source_row_count}, Target rows: {self.target_row_count}",
        ]

        if self.missing_in_target > 0:
            lines.append(f"Missing in target: {self.missing_in_target} rows")
        if self.extra_in_target > 0:
            lines.append(f"Extra in target: {self.extra_in_target} rows")
        if self.value_mismatches:
            lines.append("Column mismatches:")
            for col, count in self.value_mismatches.items():
                lines.append(f" {col}: {count} differences")

        if self.mismatches:
            # Report the number of mismatches printed below, not the number
            # stored: at most five are listed.
            shown = self.mismatches[:5]
            lines.append(f"\nSample mismatches ({len(shown)} shown):")
            for m in shown:
                keys = ", ".join(f"{k}={v}" for k, v in m.key_values.items())
                lines.append(f" [{keys}] {m.column}: {m.source_value!r} vs {m.target_value!r}")

        return "\n".join(lines)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
# =========================================================================
|
|
351
|
+
# Group By Results
|
|
352
|
+
# =========================================================================
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
@dataclass
class GroupResult:
    """Validation outcome for one group of a group-by validation.

    Attributes:
        group_key: Dictionary of group column values.
        row_count: Number of rows in this group.
        passed: Whether all checks passed for this group.
        check_results: List of individual check results.
        stats: Group-level statistics.
    """

    group_key: dict[str, Any]
    row_count: int
    passed: bool = True
    check_results: list[ValidationResult] = field(default_factory=list)
    stats: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Truthy when the group passed."""
        return self.passed

    def __repr__(self) -> str:
        outcome = "FAILED"
        if self.passed:
            outcome = "PASSED"
        # key_string yields the same "name=value" rendering of the group key.
        return f"GroupResult({self.key_string}, rows={self.row_count}, {outcome})"

    @property
    def key_string(self) -> str:
        """Group key rendered as comma-separated ``name=value`` pairs."""
        pairs = [f"{name}={val}" for name, val in self.group_key.items()]
        return ", ".join(pairs)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
@dataclass
class GroupByResult:
    """Result of group-by validation across all groups.

    Attributes:
        passed: Whether all groups passed validation.
        total_groups: Total number of groups.
        passed_groups: Number of groups that passed.
        failed_groups: Number of groups that failed.
        group_results: Individual results per group.
        group_columns: Columns used for grouping.
        details: Additional metadata.
    """

    passed: bool
    total_groups: int
    passed_groups: int = 0
    failed_groups: int = 0
    group_results: list[GroupResult] = field(default_factory=list)
    group_columns: list[str] = field(default_factory=list)
    details: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Allow using GroupByResult in boolean context."""
        return self.passed

    def __repr__(self) -> str:
        status = "PASSED" if self.passed else "FAILED"
        return (
            f"GroupByResult({status}, groups={self.total_groups}, "
            f"passed={self.passed_groups}, failed={self.failed_groups})"
        )

    @property
    def pass_rate(self) -> float:
        """Pass rate as a percentage; 100.0 when there are no groups."""
        if self.total_groups == 0:
            return 100.0
        return (self.passed_groups / self.total_groups) * 100

    def get_failed_groups(self) -> list[GroupResult]:
        """Get list of groups that failed validation."""
        return [g for g in self.group_results if not g.passed]

    def summary(self) -> str:
        """Get a human-readable summary.

        Lists the overall outcome and counts, then up to five failed groups
        with the messages of their failing checks. When more than five
        groups failed, a final line states how many are not listed.

        Returns:
            The summary as a newline-joined string.
        """
        lines = [
            f"Group By Validation: {'PASSED' if self.passed else 'FAILED'}",
            f"Groups: {self.total_groups} total, {self.passed_groups} passed, {self.failed_groups} failed ({self.pass_rate:.1f}%)",
            f"Grouped by: {', '.join(self.group_columns)}",
        ]

        failed = self.get_failed_groups()
        if failed:
            shown = failed[:5]
            lines.append(f"\nFailed groups ({len(failed)}):")
            for g in shown:
                lines.append(f" [{g.key_string}]: {g.row_count} rows")
                for cr in g.check_results:
                    if not cr.passed:
                        lines.append(f" - {cr.message}")

            # Make the truncation explicit instead of silently dropping
            # the remaining failed groups from the listing.
            hidden = len(failed) - len(shown)
            if hidden > 0:
                lines.append(f" ... and {hidden} more failed groups")

        return "\n".join(lines)
|
duckguard/core/scoring.py
CHANGED
|
@@ -14,7 +14,7 @@ from __future__ import annotations
|
|
|
14
14
|
from dataclasses import dataclass, field
|
|
15
15
|
from datetime import datetime
|
|
16
16
|
from enum import Enum
|
|
17
|
-
from typing import
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
18
|
|
|
19
19
|
if TYPE_CHECKING:
|
|
20
20
|
from duckguard.core.dataset import Dataset
|
|
@@ -302,7 +302,6 @@ class QualityScorer:
|
|
|
302
302
|
# Check for reasonable ranges on numeric columns
|
|
303
303
|
if numeric_stats.get("mean") is not None:
|
|
304
304
|
min_val = stats.get("min_value")
|
|
305
|
-
max_val = stats.get("max_value")
|
|
306
305
|
|
|
307
306
|
# Check for negative values in likely positive-only columns
|
|
308
307
|
is_likely_positive = any(
|