duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/__init__.py CHANGED
@@ -3,7 +3,7 @@ DuckGuard - Data quality that just works.
3
3
 
4
4
  A Python-native data quality tool built on DuckDB for speed.
5
5
  Features YAML-based rules, semantic type detection, data contracts,
6
- and anomaly detection.
6
+ anomaly detection, notifications, and dbt integration.
7
7
 
8
8
  Quick Start:
9
9
  # Python API
@@ -12,61 +12,80 @@ Quick Start:
12
12
  assert orders.row_count > 0
13
13
  assert orders.customer_id.null_percent == 0
14
14
 
15
+ # With row-level error capture
16
+ result = orders.quantity.between(1, 100)
17
+ if not result:
18
+ print(result.summary()) # See which rows failed
19
+
20
+ # Notifications
21
+ from duckguard.notifications import SlackNotifier
22
+ slack = SlackNotifier(webhook_url="...")
23
+ slack.send_failure_alert(result)
24
+
15
25
  # CLI
16
26
  $ duckguard check data.csv
17
27
  $ duckguard discover data.csv --output duckguard.yaml
18
28
  $ duckguard contract generate data.csv
19
29
 
20
- Documentation: https://github.com/duckguard/duckguard
30
+ Documentation: https://github.com/XDataHubAI/duckguard
21
31
  """
22
32
 
23
33
  # Core classes
24
- from duckguard.core.dataset import Dataset
34
+ # Anomaly detection
35
+ from duckguard.anomaly import (
36
+ AnomalyDetector,
37
+ AnomalyResult,
38
+ detect_anomalies,
39
+ )
40
+
41
+ # Connectors
42
+ from duckguard.connectors import connect
43
+
44
+ # Data contracts
45
+ from duckguard.contracts import (
46
+ DataContract,
47
+ diff_contracts,
48
+ generate_contract,
49
+ load_contract,
50
+ validate_contract,
51
+ )
25
52
  from duckguard.core.column import Column
53
+ from duckguard.core.dataset import Dataset
26
54
  from duckguard.core.engine import DuckGuardEngine
27
- from duckguard.core.result import ValidationResult, CheckResult
55
+ from duckguard.core.result import CheckResult, FailedRow, ValidationResult
28
56
  from duckguard.core.scoring import QualityScore, QualityScorer, score
29
57
 
30
- # Connectors
31
- from duckguard.connectors import connect
58
+ # Error classes
59
+ from duckguard.errors import (
60
+ ColumnNotFoundError,
61
+ ContractViolationError,
62
+ DuckGuardError,
63
+ RuleParseError,
64
+ UnsupportedConnectorError,
65
+ ValidationError,
66
+ )
32
67
 
33
68
  # Profiling
34
- from duckguard.profiler import profile, AutoProfiler
69
+ from duckguard.profiler import AutoProfiler, profile
35
70
 
36
71
  # Rules (YAML-based)
37
72
  from duckguard.rules import (
38
- load_rules,
39
- load_rules_from_string,
73
+ RuleSet,
40
74
  execute_rules,
41
75
  generate_rules,
42
- RuleSet,
76
+ load_rules,
77
+ load_rules_from_string,
43
78
  )
44
79
 
45
80
  # Semantic type detection
46
81
  from duckguard.semantic import (
47
- SemanticType,
48
82
  SemanticAnalyzer,
83
+ SemanticType,
49
84
  detect_type,
50
85
  detect_types_for_dataset,
51
86
  )
52
87
 
53
- # Data contracts
54
- from duckguard.contracts import (
55
- DataContract,
56
- load_contract,
57
- validate_contract,
58
- generate_contract,
59
- diff_contracts,
60
- )
61
-
62
- # Anomaly detection
63
- from duckguard.anomaly import (
64
- AnomalyDetector,
65
- AnomalyResult,
66
- detect_anomalies,
67
- )
68
-
69
- __version__ = "2.0.0"
88
+ __version__ = "2.3.0"
70
89
 
71
90
  __all__ = [
72
91
  # Core classes
@@ -75,6 +94,7 @@ __all__ = [
75
94
  "DuckGuardEngine",
76
95
  "ValidationResult",
77
96
  "CheckResult",
97
+ "FailedRow",
78
98
  # Scoring
79
99
  "QualityScore",
80
100
  "QualityScorer",
@@ -105,6 +125,13 @@ __all__ = [
105
125
  "AnomalyDetector",
106
126
  "AnomalyResult",
107
127
  "detect_anomalies",
128
+ # Errors
129
+ "DuckGuardError",
130
+ "ColumnNotFoundError",
131
+ "ContractViolationError",
132
+ "RuleParseError",
133
+ "UnsupportedConnectorError",
134
+ "ValidationError",
108
135
  # Version
109
136
  "__version__",
110
137
  ]
@@ -9,6 +9,11 @@ Example:
9
9
  anomalies = detector.detect(dataset, column="amount")
10
10
  """
11
11
 
12
+ from duckguard.anomaly.baselines import (
13
+ BaselineStorage,
14
+ ColumnBaseline,
15
+ StoredBaseline,
16
+ )
12
17
  from duckguard.anomaly.detector import (
13
18
  AnomalyDetector,
14
19
  AnomalyResult,
@@ -17,18 +22,41 @@ from duckguard.anomaly.detector import (
17
22
  detect_column_anomalies,
18
23
  )
19
24
  from duckguard.anomaly.methods import (
20
- ZScoreMethod,
21
25
  IQRMethod,
26
+ ModifiedZScoreMethod,
22
27
  PercentChangeMethod,
28
+ ZScoreMethod,
29
+ create_method,
30
+ )
31
+ from duckguard.anomaly.ml_methods import (
32
+ BaselineComparison,
33
+ BaselineMethod,
34
+ DistributionComparison,
35
+ KSTestMethod,
36
+ SeasonalMethod,
23
37
  )
24
38
 
25
39
  __all__ = [
40
+ # Detector
26
41
  "AnomalyDetector",
27
42
  "AnomalyResult",
28
43
  "AnomalyType",
29
44
  "detect_anomalies",
30
45
  "detect_column_anomalies",
46
+ # Standard methods
31
47
  "ZScoreMethod",
32
48
  "IQRMethod",
33
49
  "PercentChangeMethod",
50
+ "ModifiedZScoreMethod",
51
+ "create_method",
52
+ # ML methods
53
+ "BaselineMethod",
54
+ "KSTestMethod",
55
+ "SeasonalMethod",
56
+ "BaselineComparison",
57
+ "DistributionComparison",
58
+ # Baselines
59
+ "BaselineStorage",
60
+ "StoredBaseline",
61
+ "ColumnBaseline",
34
62
  ]
@@ -0,0 +1,294 @@
1
+ """Baseline storage for ML-based anomaly detection.
2
+
3
+ Provides functionality to store and retrieve learned baselines for
4
+ comparison-based anomaly detection.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass
11
+ from datetime import datetime
12
+ from typing import Any
13
+
14
+ from duckguard.history.schema import QUERIES
15
+ from duckguard.history.storage import HistoryStorage
16
+
17
+
18
@dataclass
class StoredBaseline:
    """One persisted baseline value for a single metric of a single column.

    Attributes:
        source: Path of the data source the baseline belongs to.
        column_name: Name of the column the baseline describes.
        metric: Which metric this is (mean, stddev, distribution, ...).
        value: The baseline itself; may be a nested structure for
            metrics such as distributions.
        sample_size: How many samples the baseline was computed from,
            when that was recorded.
        created_at: Moment the baseline was first created.
        updated_at: Moment of the most recent update, if there was one.
    """

    source: str
    column_name: str
    metric: str
    value: Any
    sample_size: int | None
    created_at: datetime
    updated_at: datetime | None
39
+
40
+
41
class BaselineStorage:
    """Store and retrieve learned baselines for anomaly detection.

    Baselines are written and read through the connection exposed by a
    ``HistoryStorage`` instance. Values are JSON-encoded on write and
    decoded again on read.

    Usage:
        from duckguard.anomaly.baselines import BaselineStorage

        storage = BaselineStorage()

        # Store a baseline
        storage.store("data.csv", "amount", "mean", 150.5, sample_size=1000)

        # Get a baseline
        baseline = storage.get("data.csv", "amount", "mean")
        if baseline:
            print(f"Baseline mean: {baseline.value}")

        # Update with rolling average
        storage.update("data.csv", "amount", "mean", 155.2,
                       sample_size=100, method="rolling")
    """

    # Blend weights for the "rolling" update; history counts for more
    # than the incoming value.
    _ROLLING_OLD_WEIGHT = 0.7
    _ROLLING_NEW_WEIGHT = 0.3

    def __init__(self, storage: HistoryStorage | None = None):
        """Initialize baseline storage.

        Args:
            storage: Optional HistoryStorage instance. A default
                ``HistoryStorage()`` is created when omitted.
        """
        # Compare against None explicitly so a caller-supplied storage is
        # never discarded just because it happens to evaluate as falsy.
        self._storage = storage if storage is not None else HistoryStorage()

    @property
    def storage(self) -> HistoryStorage:
        """Get the underlying storage."""
        return self._storage

    def store(
        self,
        source: str,
        column_name: str,
        metric: str,
        value: Any,
        *,
        sample_size: int | None = None,
    ) -> None:
        """Store or update a baseline.

        Args:
            source: Data source path
            column_name: Column name
            metric: Metric name (mean, stddev, min, max, distribution, etc.)
            value: Baseline value; always JSON-serialized before writing
            sample_size: Number of samples used to compute the baseline
        """
        conn = self._storage._get_connection()
        now = datetime.now().isoformat()

        # json.dumps handles scalars and containers alike, so no branching
        # on the value's type is needed.
        serialized_value = json.dumps(value)

        # The same timestamp is bound to both trailing parameters of the
        # upsert (presumably created_at / updated_at — confirm against
        # QUERIES["upsert_baseline"]).
        conn.execute(
            QUERIES["upsert_baseline"],
            (
                source,
                column_name,
                metric,
                serialized_value,
                sample_size,
                now,
                now,
            ),
        )
        conn.commit()

    def get(
        self,
        source: str,
        column_name: str,
        metric: str,
    ) -> StoredBaseline | None:
        """Get a specific baseline.

        Args:
            source: Data source path
            column_name: Column name
            metric: Metric name

        Returns:
            StoredBaseline or None if not found
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(
            QUERIES["get_baseline"],
            (source, column_name, metric),
        )
        row = cursor.fetchone()

        if not row:
            return None

        return self._row_to_baseline(row)

    def get_all(self, source: str) -> list[StoredBaseline]:
        """Get all baselines for a source.

        Args:
            source: Data source path

        Returns:
            List of StoredBaseline objects
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(
            QUERIES["get_baselines_for_source"],
            (source,),
        )

        return [self._row_to_baseline(row) for row in cursor.fetchall()]

    def update(
        self,
        source: str,
        column_name: str,
        metric: str,
        new_value: Any,
        *,
        sample_size: int | None = None,
        method: str = "replace",
    ) -> None:
        """Update an existing baseline.

        With ``method="rolling"`` the stored value and ``new_value`` are
        blended (0.7 * old + 0.3 * new) and the sample sizes are summed.
        Blending only happens when both values are real numbers; in every
        other case (no stored baseline, a non-numeric stored value, or a
        non-numeric ``new_value``) the new value simply replaces the old.

        Args:
            source: Data source path
            column_name: Column name
            metric: Metric name
            new_value: New value
            sample_size: Number of samples in new data
            method: Update method - "replace" or "rolling"

        Raises:
            ValueError: If ``method`` is neither "replace" nor "rolling".
        """
        if method not in ("replace", "rolling"):
            raise ValueError(f"Unknown update method: {method}")

        if method == "rolling" and self._is_number(new_value):
            existing = self.get(source, column_name, metric)
            if existing is not None and self._is_number(existing.value):
                blended = (
                    self._ROLLING_OLD_WEIGHT * existing.value
                    + self._ROLLING_NEW_WEIGHT * new_value
                )
                total_samples = (existing.sample_size or 0) + (sample_size or 0)
                self.store(source, column_name, metric, blended, sample_size=total_samples)
                return

        # Plain replacement: explicit "replace", or a rolling update that
        # has nothing numeric to blend with.
        self.store(source, column_name, metric, new_value, sample_size=sample_size)

    def delete(self, source: str) -> int:
        """Delete all baselines for a source.

        Args:
            source: Data source path

        Returns:
            Number of baselines deleted
        """
        conn = self._storage._get_connection()

        # Count the matching rows before removing them so the number can
        # be reported to the caller.
        cursor = conn.execute(
            "SELECT COUNT(*) FROM baselines WHERE source = ?",
            (source,),
        )
        count = cursor.fetchone()[0]

        conn.execute(QUERIES["delete_baselines_for_source"], (source,))
        conn.commit()

        return count

    @staticmethod
    def _is_number(value: Any) -> bool:
        """Return True for int/float values, excluding bool."""
        # bool is a subclass of int, but averaging True/False is meaningless.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _row_to_baseline(self, row) -> StoredBaseline:
        """Convert database row to StoredBaseline."""
        # The row is indexed by column name; baseline_value holds the JSON
        # text written by store().
        value = json.loads(row["baseline_value"])

        return StoredBaseline(
            source=row["source"],
            column_name=row["column_name"],
            metric=row["metric"],
            value=value,
            sample_size=row["sample_size"],
            created_at=datetime.fromisoformat(row["created_at"]),
            updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
        )
235
+
236
+
237
@dataclass
class ColumnBaseline:
    """Bundle of every baseline statistic tracked for one column.

    Attributes:
        column_name: Name of the column
        mean: Mean value
        stddev: Standard deviation
        min: Minimum value
        max: Maximum value
        median: Median value
        null_percent: Percentage of nulls
        unique_percent: Percentage of unique values
        sample_size: Number of samples
        distribution: Optional distribution histogram
    """

    column_name: str
    mean: float | None = None
    stddev: float | None = None
    min: float | None = None
    max: float | None = None
    median: float | None = None
    null_percent: float | None = None
    unique_percent: float | None = None
    sample_size: int | None = None
    distribution: dict[str, Any] | None = None

    # Names of the optional statistics, in serialization order. Left
    # un-annotated so the dataclass machinery does not treat it as a field.
    _OPTIONAL_KEYS = (
        "mean",
        "stddev",
        "min",
        "max",
        "median",
        "null_percent",
        "unique_percent",
        "sample_size",
        "distribution",
    )

    def to_dict(self) -> dict[str, Any]:
        """Return the baseline as a plain dictionary keyed by field name."""
        result: dict[str, Any] = {"column_name": self.column_name}
        for key in self._OPTIONAL_KEYS:
            # Values are copied by reference, not deep-copied.
            result[key] = getattr(self, key)
        return result

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ColumnBaseline:
        """Build a baseline from a dictionary.

        ``column_name`` is required; every other key is optional and
        is set to None when absent.
        """
        # Look up the mandatory key first so a missing name fails before
        # anything else is read.
        name = data["column_name"]
        optional = {key: data.get(key) for key in cls._OPTIONAL_KEYS}
        return cls(column_name=name, **optional)
@@ -10,15 +10,11 @@ from datetime import datetime
10
10
  from enum import Enum
11
11
  from typing import Any
12
12
 
13
- from duckguard.core.dataset import Dataset
14
13
  from duckguard.anomaly.methods import (
15
- AnomalyMethod,
16
14
  AnomalyScore,
17
- ZScoreMethod,
18
- IQRMethod,
19
- PercentChangeMethod,
20
15
  create_method,
21
16
  )
17
+ from duckguard.core.dataset import Dataset
22
18
 
23
19
 
24
20
  class AnomalyType(Enum):
@@ -5,10 +5,10 @@ Implements various statistical methods for detecting anomalies in data.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import math
8
9
  from abc import ABC, abstractmethod
9
10
  from dataclasses import dataclass, field
10
11
  from typing import Any
11
- import math
12
12
 
13
13
 
14
14
  @dataclass
@@ -177,8 +177,6 @@ class IQRMethod(AnomalyMethod):
177
177
  if not clean_values:
178
178
  return
179
179
 
180
- n = len(clean_values)
181
-
182
180
  # Calculate Q1 and Q3
183
181
  self._q1 = self._percentile(clean_values, 25)
184
182
  self._q3 = self._percentile(clean_values, 75)
@@ -409,12 +407,22 @@ def create_method(
409
407
  """Create an anomaly detection method by name.
410
408
 
411
409
  Args:
412
- method_name: Name of the method
410
+ method_name: Name of the method. Options:
411
+ - "zscore", "z_score": Z-Score method
412
+ - "iqr": Interquartile Range method
413
+ - "percent_change", "pct_change": Percent change method
414
+ - "modified_zscore", "mad": Modified Z-Score (MAD) method
415
+ - "baseline": ML-based baseline comparison
416
+ - "ks_test": Kolmogorov-Smirnov distribution test
417
+ - "seasonal": Seasonal pattern detection
413
418
  **kwargs: Method-specific parameters
414
419
 
415
420
  Returns:
416
421
  Configured AnomalyMethod
417
422
  """
423
+ # Import ML methods lazily to avoid circular imports
424
+ from duckguard.anomaly.ml_methods import BaselineMethod, KSTestMethod, SeasonalMethod
425
+
418
426
  methods = {
419
427
  "zscore": ZScoreMethod,
420
428
  "z_score": ZScoreMethod,
@@ -423,10 +431,14 @@ def create_method(
423
431
  "pct_change": PercentChangeMethod,
424
432
  "modified_zscore": ModifiedZScoreMethod,
425
433
  "mad": ModifiedZScoreMethod,
434
+ "baseline": BaselineMethod,
435
+ "ks_test": KSTestMethod,
436
+ "ks": KSTestMethod,
437
+ "seasonal": SeasonalMethod,
426
438
  }
427
439
 
428
440
  method_class = methods.get(method_name.lower())
429
441
  if not method_class:
430
- raise ValueError(f"Unknown anomaly method: {method_name}")
442
+ raise ValueError(f"Unknown anomaly method: {method_name}. Available: {list(methods.keys())}")
431
443
 
432
444
  return method_class(**kwargs)