duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/result.py CHANGED
@@ -17,6 +17,30 @@ class CheckStatus(Enum):
17
17
  ERROR = "error"
18
18
 
19
19
 
20
@dataclass
class FailedRow:
    """One row of source data that did not satisfy a validation check.

    Attributes:
        row_index: 1-based row number in the source data.
        column: Name of the column whose value failed validation.
        value: The offending value as it was found.
        expected: Description of the expectation (e.g., "not null", "between 1-100").
        reason: Human-readable explanation of the failure (optional).
        context: Extra data from the same row, kept for debugging (optional).
    """

    row_index: int
    column: str
    value: Any
    expected: str
    reason: str = ""
    context: dict[str, Any] = field(default_factory=dict)

    def __repr__(self) -> str:
        # Compact form: expected, reason and context are intentionally left out.
        location = f"row={self.row_index}, column='{self.column}'"
        return f"FailedRow({location}, value={self.value!r})"
42
+
43
+
20
44
  @dataclass
21
45
  class CheckResult:
22
46
  """Result of a single validation check."""
@@ -46,13 +70,27 @@ class CheckResult:
46
70
 
47
71
  @dataclass
48
72
  class ValidationResult:
49
- """Result of a validation operation that can be used in assertions."""
73
+ """Result of a validation operation that can be used in assertions.
74
+
75
+ Enhanced with row-level error capture for debugging failed checks.
76
+
77
+ Attributes:
78
+ passed: Whether the validation passed
79
+ actual_value: The actual value found (e.g., count of failures)
80
+ expected_value: What was expected
81
+ message: Human-readable summary
82
+ details: Additional metadata
83
+ failed_rows: List of individual rows that failed validation
84
+ total_failures: Total number of rows that failed validation (failed_rows holds only a sample of them)
85
+ """
50
86
 
51
87
  passed: bool
52
88
  actual_value: Any
53
89
  expected_value: Any | None = None
54
90
  message: str = ""
55
91
  details: dict[str, Any] = field(default_factory=dict)
92
+ failed_rows: list[FailedRow] = field(default_factory=list)
93
+ total_failures: int = 0
56
94
 
57
95
  def __bool__(self) -> bool:
58
96
  """Allow using ValidationResult in boolean context for assertions."""
@@ -60,8 +98,61 @@ class ValidationResult:
60
98
 
61
99
  def __repr__(self) -> str:
62
100
  status = "PASSED" if self.passed else "FAILED"
101
+ if self.failed_rows:
102
+ return f"ValidationResult({status}, actual={self.actual_value}, failed_rows={len(self.failed_rows)})"
63
103
  return f"ValidationResult({status}, actual={self.actual_value})"
64
104
 
105
def get_failed_values(self) -> list[Any]:
    """Return the ``value`` of every captured failed row, in capture order."""
    return [failure.value for failure in self.failed_rows]
108
+
109
def get_failed_row_indices(self) -> list[int]:
    """Return the ``row_index`` of every captured failed row, in capture order."""
    return [failure.row_index for failure in self.failed_rows]
112
+
113
def to_dataframe(self):
    """Convert the captured failed rows to a pandas DataFrame.

    Each failed row becomes one record with the columns ``row_index``,
    ``column``, ``value``, ``expected`` and ``reason``, followed by one
    column per key found in the row's ``context``. A context key that
    collides with one of the five core columns is ignored, so extra row
    data can never overwrite the failure details.

    Returns:
        pandas.DataFrame with failed row details (an empty frame with the
        five core columns when there are no failed rows).

    Raises:
        ImportError: If pandas is not installed.
    """
    # Guard only the import so unrelated errors are never reported as
    # "pandas is missing"; chain the cause to keep the original traceback.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for to_dataframe(). Install with: pip install pandas"
        ) from exc

    core_columns = ["row_index", "column", "value", "expected", "reason"]
    if not self.failed_rows:
        return pd.DataFrame(columns=core_columns)

    records = []
    for row in self.failed_rows:
        record = {name: getattr(row, name) for name in core_columns}
        for key, extra in row.context.items():
            # setdefault keeps the core fields authoritative on key clashes.
            record.setdefault(key, extra)
        records.append(record)
    return pd.DataFrame(records)
141
+
142
def summary(self) -> str:
    """Get a summary of the validation result with sample failures.

    The summary starts with ``message``. When failed rows were captured,
    up to five of them are listed, followed by a note stating how many
    further failures are not listed.

    Returns:
        Multi-line, human-readable summary string.
    """
    lines = [self.message]

    listed = self.failed_rows[:5]
    if listed:
        # total_failures defaults to 0; never report a total smaller than
        # the number of rows that were actually captured.
        total = max(self.total_failures, len(self.failed_rows))
        lines.append(f"\nSample of {len(listed)} failing rows (total: {total}):")
        lines.extend(
            f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}"
            for row in listed
        )

        # Count against the rows really listed, not a hard-coded 5.
        remaining = total - len(listed)
        if remaining > 0:
            lines.append(f" ... and {remaining} more failures")

    return "\n".join(lines)
155
+
65
156
 
66
157
  @dataclass
67
158
  class ProfileResult:
@@ -117,3 +208,239 @@ class ScanResult:
117
208
  if self.checks_run == 0:
118
209
  return 100.0
119
210
  return (self.checks_passed / self.checks_run) * 100
211
+
212
+
213
+ # =========================================================================
214
+ # Distribution Drift Results
215
+ # =========================================================================
216
+
217
+
218
@dataclass
class DriftResult:
    """Outcome of a distribution drift comparison between two columns.

    Attributes:
        is_drifted: True when significant drift was detected.
        p_value: P-value produced by the statistical test.
        statistic: Value of the test statistic.
        threshold: P-value threshold used for detection.
        method: Name of the statistical method (e.g., "ks_test").
        message: Human-readable summary.
        details: Additional metadata.
    """

    is_drifted: bool
    p_value: float
    statistic: float
    threshold: float = 0.05
    method: str = "ks_test"
    message: str = ""
    details: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Truthy when the data is stable, i.e. NO drift was detected."""
        return not self.is_drifted

    def __repr__(self) -> str:
        label = "DRIFT DETECTED" if self.is_drifted else "STABLE"
        return f"DriftResult({label}, p_value={self.p_value:.4f}, threshold={self.threshold})"

    def summary(self) -> str:
        """Return a one-line, human-readable description of the outcome."""
        if self.is_drifted:
            headline = "DRIFT DETECTED"
        else:
            headline = "No significant drift"
        figures = f"p-value: {self.p_value:.4f}, threshold: {self.threshold}, method: {self.method}"
        return f"{headline} ({figures})"
252
+
253
+
254
+ # =========================================================================
255
+ # Reconciliation Results
256
+ # =========================================================================
257
+
258
+
259
@dataclass
class ReconciliationMismatch:
    """A single row-level difference found during reconciliation.

    Attributes:
        key_values: Key column values that identify the row.
        column: Name of the column where the mismatch occurred.
        source_value: Value in the source dataset.
        target_value: Value in the target dataset.
        mismatch_type: One of "value_diff", "missing_in_target", "extra_in_target".
    """

    key_values: dict[str, Any]
    column: str
    source_value: Any = None
    target_value: Any = None
    mismatch_type: str = "value_diff"

    def __repr__(self) -> str:
        key_text = ", ".join(f"{name}={val}" for name, val in self.key_values.items())
        comparison = f"{self.column}: {self.source_value} vs {self.target_value}"
        return f"ReconciliationMismatch({key_text}, {comparison})"
280
+
281
+
282
@dataclass
class ReconciliationResult:
    """Result of reconciling two datasets.

    Attributes:
        passed: Whether reconciliation passed (datasets match)
        source_row_count: Number of rows in source dataset
        target_row_count: Number of rows in target dataset
        missing_in_target: Rows in source but not in target
        extra_in_target: Rows in target but not in source
        value_mismatches: Count of value mismatches by column
        match_percentage: Percentage of rows that match
        key_columns: Columns used as keys for matching
        compared_columns: Columns compared for values
        mismatches: Sample of actual mismatches
        details: Additional metadata
    """

    passed: bool
    source_row_count: int
    target_row_count: int
    missing_in_target: int = 0
    extra_in_target: int = 0
    value_mismatches: dict[str, int] = field(default_factory=dict)
    match_percentage: float = 100.0
    key_columns: list[str] = field(default_factory=list)
    compared_columns: list[str] = field(default_factory=list)
    mismatches: list[ReconciliationMismatch] = field(default_factory=list)
    details: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Allow using ReconciliationResult in boolean context."""
        return self.passed

    def __repr__(self) -> str:
        status = "MATCHED" if self.passed else "MISMATCHED"
        return (
            f"ReconciliationResult({status}, match={self.match_percentage:.1f}%, "
            f"missing={self.missing_in_target}, extra={self.extra_in_target})"
        )

    @property
    def total_mismatches(self) -> int:
        """Missing rows + extra rows + the sum of all per-column value mismatches."""
        return self.missing_in_target + self.extra_in_target + sum(self.value_mismatches.values())

    def summary(self) -> str:
        """Get a human-readable, multi-line summary.

        Lists the overall status and row counts, then any missing/extra row
        counts, per-column mismatch counts, and up to five sample mismatches.
        """
        lines = [
            f"Reconciliation: {'PASSED' if self.passed else 'FAILED'} ({self.match_percentage:.1f}% match)",
            f"Source rows: {self.source_row_count}, Target rows: {self.target_row_count}",
        ]

        if self.missing_in_target > 0:
            lines.append(f"Missing in target: {self.missing_in_target} rows")
        if self.extra_in_target > 0:
            lines.append(f"Extra in target: {self.extra_in_target} rows")
        if self.value_mismatches:
            lines.append("Column mismatches:")
            lines.extend(
                f" {col}: {count} differences"
                for col, count in self.value_mismatches.items()
            )

        # Only the first five samples are printed, so the header must report
        # that number rather than the size of the whole sample list.
        shown = self.mismatches[:5]
        if shown:
            lines.append(f"\nSample mismatches ({len(shown)} shown):")
            for m in shown:
                keys = ", ".join(f"{k}={v}" for k, v in m.key_values.items())
                lines.append(f" [{keys}] {m.column}: {m.source_value!r} vs {m.target_value!r}")

        return "\n".join(lines)
348
+
349
+
350
+ # =========================================================================
351
+ # Group By Results
352
+ # =========================================================================
353
+
354
+
355
@dataclass
class GroupResult:
    """Validation outcome for one group of a group-by validation.

    Attributes:
        group_key: Mapping of group column name to this group's value.
        row_count: Number of rows in this group.
        passed: Whether all checks passed for this group.
        check_results: Individual check results for this group.
        stats: Group-level statistics.
    """

    group_key: dict[str, Any]
    row_count: int
    passed: bool = True
    check_results: list[ValidationResult] = field(default_factory=list)
    stats: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Truthiness mirrors ``passed``."""
        return self.passed

    def __repr__(self) -> str:
        outcome = "PASSED" if self.passed else "FAILED"
        return f"GroupResult({self.key_string}, rows={self.row_count}, {outcome})"

    @property
    def key_string(self) -> str:
        """The group key rendered as comma-separated ``column=value`` pairs."""
        pairs = [f"{name}={val}" for name, val in self.group_key.items()]
        return ", ".join(pairs)
386
+
387
+
388
@dataclass
class GroupByResult:
    """Aggregate outcome of a group-by validation over every group.

    Attributes:
        passed: Whether all groups passed validation.
        total_groups: Total number of groups.
        passed_groups: Number of groups that passed.
        failed_groups: Number of groups that failed.
        group_results: Individual results per group.
        group_columns: Columns used for grouping.
        details: Additional metadata.
    """

    passed: bool
    total_groups: int
    passed_groups: int = 0
    failed_groups: int = 0
    group_results: list[GroupResult] = field(default_factory=list)
    group_columns: list[str] = field(default_factory=list)
    details: dict[str, Any] = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Truthiness mirrors ``passed``."""
        return self.passed

    def __repr__(self) -> str:
        outcome = "PASSED" if self.passed else "FAILED"
        counts = f"groups={self.total_groups}, passed={self.passed_groups}, failed={self.failed_groups}"
        return f"GroupByResult({outcome}, {counts})"

    @property
    def pass_rate(self) -> float:
        """Share of passing groups as a percentage; 100.0 when there are no groups."""
        if self.total_groups != 0:
            return (self.passed_groups / self.total_groups) * 100
        return 100.0

    def get_failed_groups(self) -> list[GroupResult]:
        """Return the group results whose ``passed`` flag is false."""
        return [group for group in self.group_results if not group.passed]

    def summary(self) -> str:
        """Build a multi-line report listing up to five failed groups and their failed checks."""
        verdict = "PASSED" if self.passed else "FAILED"
        report = [
            f"Group By Validation: {verdict}",
            f"Groups: {self.total_groups} total, {self.passed_groups} passed, {self.failed_groups} failed ({self.pass_rate:.1f}%)",
            f"Grouped by: {', '.join(self.group_columns)}",
        ]

        failing = self.get_failed_groups()
        if not failing:
            return "\n".join(report)

        report.append(f"\nFailed groups ({len(failing)}):")
        for group in failing[:5]:
            report.append(f" [{group.key_string}]: {group.row_count} rows")
            report.extend(
                f" - {check.message}"
                for check in group.check_results
                if not check.passed
            )
        return "\n".join(report)
duckguard/core/scoring.py CHANGED
@@ -14,7 +14,7 @@ from __future__ import annotations
14
14
  from dataclasses import dataclass, field
15
15
  from datetime import datetime
16
16
  from enum import Enum
17
- from typing import Any, TYPE_CHECKING
17
+ from typing import TYPE_CHECKING
18
18
 
19
19
  if TYPE_CHECKING:
20
20
  from duckguard.core.dataset import Dataset
@@ -302,7 +302,6 @@ class QualityScorer:
302
302
  # Check for reasonable ranges on numeric columns
303
303
  if numeric_stats.get("mean") is not None:
304
304
  min_val = stats.get("min_value")
305
- max_val = stats.get("max_value")
306
305
 
307
306
  # Check for negative values in likely positive-only columns
308
307
  is_likely_positive = any(