duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/__init__.py ADDED
@@ -0,0 +1,110 @@
1
+ """
2
+ DuckGuard - Data quality that just works.
3
+
4
+ A Python-native data quality tool built on DuckDB for speed.
5
+ Features YAML-based rules, semantic type detection, data contracts,
6
+ and anomaly detection.
7
+
8
+ Quick Start:
9
+ # Python API
10
+ from duckguard import connect
11
+ orders = connect("data/orders.csv")
12
+ assert orders.row_count > 0
13
+ assert orders.customer_id.null_percent == 0
14
+
15
+ # CLI
16
+ $ duckguard check data.csv
17
+ $ duckguard discover data.csv --output duckguard.yaml
18
+ $ duckguard contract generate data.csv
19
+
20
+ Documentation: https://github.com/duckguard/duckguard
21
+ """
22
+
23
+ # Core classes
24
+ from duckguard.core.dataset import Dataset
25
+ from duckguard.core.column import Column
26
+ from duckguard.core.engine import DuckGuardEngine
27
+ from duckguard.core.result import ValidationResult, CheckResult
28
+ from duckguard.core.scoring import QualityScore, QualityScorer, score
29
+
30
+ # Connectors
31
+ from duckguard.connectors import connect
32
+
33
+ # Profiling
34
+ from duckguard.profiler import profile, AutoProfiler
35
+
36
+ # Rules (YAML-based)
37
+ from duckguard.rules import (
38
+ load_rules,
39
+ load_rules_from_string,
40
+ execute_rules,
41
+ generate_rules,
42
+ RuleSet,
43
+ )
44
+
45
+ # Semantic type detection
46
+ from duckguard.semantic import (
47
+ SemanticType,
48
+ SemanticAnalyzer,
49
+ detect_type,
50
+ detect_types_for_dataset,
51
+ )
52
+
53
+ # Data contracts
54
+ from duckguard.contracts import (
55
+ DataContract,
56
+ load_contract,
57
+ validate_contract,
58
+ generate_contract,
59
+ diff_contracts,
60
+ )
61
+
62
+ # Anomaly detection
63
+ from duckguard.anomaly import (
64
+ AnomalyDetector,
65
+ AnomalyResult,
66
+ detect_anomalies,
67
+ )
68
+
69
+ __version__ = "2.0.0"
70
+
71
+ __all__ = [
72
+ # Core classes
73
+ "Dataset",
74
+ "Column",
75
+ "DuckGuardEngine",
76
+ "ValidationResult",
77
+ "CheckResult",
78
+ # Scoring
79
+ "QualityScore",
80
+ "QualityScorer",
81
+ "score",
82
+ # Connectors
83
+ "connect",
84
+ # Profiling
85
+ "profile",
86
+ "AutoProfiler",
87
+ # Rules
88
+ "load_rules",
89
+ "load_rules_from_string",
90
+ "execute_rules",
91
+ "generate_rules",
92
+ "RuleSet",
93
+ # Semantic
94
+ "SemanticType",
95
+ "SemanticAnalyzer",
96
+ "detect_type",
97
+ "detect_types_for_dataset",
98
+ # Contracts
99
+ "DataContract",
100
+ "load_contract",
101
+ "validate_contract",
102
+ "generate_contract",
103
+ "diff_contracts",
104
+ # Anomaly
105
+ "AnomalyDetector",
106
+ "AnomalyResult",
107
+ "detect_anomalies",
108
+ # Version
109
+ "__version__",
110
+ ]
@@ -0,0 +1,34 @@
1
+ """Anomaly detection for DuckGuard.
2
+
3
+ Provides statistical and ML-based anomaly detection for data quality monitoring.
4
+
5
+ Example:
6
+ from duckguard.anomaly import detect_anomalies, AnomalyDetector
7
+
8
+ detector = AnomalyDetector()
9
+ anomalies = detector.detect(dataset, column="amount")
10
+ """
11
+
12
+ from duckguard.anomaly.detector import (
13
+ AnomalyDetector,
14
+ AnomalyResult,
15
+ AnomalyType,
16
+ detect_anomalies,
17
+ detect_column_anomalies,
18
+ )
19
+ from duckguard.anomaly.methods import (
20
+ ZScoreMethod,
21
+ IQRMethod,
22
+ PercentChangeMethod,
23
+ )
24
+
25
+ __all__ = [
26
+ "AnomalyDetector",
27
+ "AnomalyResult",
28
+ "AnomalyType",
29
+ "detect_anomalies",
30
+ "detect_column_anomalies",
31
+ "ZScoreMethod",
32
+ "IQRMethod",
33
+ "PercentChangeMethod",
34
+ ]
@@ -0,0 +1,394 @@
1
+ """High-level anomaly detector for DuckGuard.
2
+
3
+ Provides easy-to-use anomaly detection for datasets and columns.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from enum import Enum
11
+ from typing import Any
12
+
13
+ from duckguard.core.dataset import Dataset
14
+ from duckguard.anomaly.methods import (
15
+ AnomalyMethod,
16
+ AnomalyScore,
17
+ ZScoreMethod,
18
+ IQRMethod,
19
+ PercentChangeMethod,
20
+ create_method,
21
+ )
22
+
23
+
24
class AnomalyType(Enum):
    """Categories of data-quality anomalies.

    Note: of these, AnomalyDetector in this module currently emits only
    VALUE_OUTLIER (column value checks) and NULL_SPIKE (null-percentage
    checks); the remaining members are reserved for table-level checks.
    """

    VALUE_OUTLIER = "value_outlier"  # Individual value is unusual
    DISTRIBUTION_SHIFT = "distribution_shift"  # Overall distribution changed
    VOLUME_ANOMALY = "volume_anomaly"  # Row count anomaly
    NULL_SPIKE = "null_spike"  # Unusual increase in nulls
    CARDINALITY_CHANGE = "cardinality_change"  # Number of distinct values changed
32
+
33
+
34
@dataclass
class AnomalyResult:
    """Result of one anomaly check (column-level or table-level).

    Attributes:
        column: Column name (None for table-level checks)
        anomaly_type: Type of anomaly that was checked for
        is_anomaly: Whether an anomaly was detected
        score: Anomaly score on the detection method's own scale
            (e.g. a z-score, or a null percentage for NULL_SPIKE)
        threshold: Detection threshold the score was compared against
        message: Human-readable description of the outcome
        details: Additional details (summary stats, method name, or an
            "error" key when analysis failed)
        samples: Sample anomalous values (empty when no anomaly)
        detected_at: When the anomaly was detected (local time)
    """

    column: str | None
    anomaly_type: AnomalyType
    is_anomaly: bool
    score: float
    threshold: float
    message: str
    details: dict[str, Any] = field(default_factory=dict)
    samples: list[Any] = field(default_factory=list)
    detected_at: datetime = field(default_factory=datetime.now)
59
+
60
+
61
@dataclass
class DatasetAnomalyReport:
    """Aggregated anomaly-detection results for one dataset.

    Attributes:
        source: Path/identifier of the analyzed data source.
        anomalies: Every result recorded during detection (flagged or not).
        checked_at: Timestamp of the detection run (local time).
        statistics: Metadata about the run (method, threshold, ...).
    """

    source: str
    anomalies: list[AnomalyResult] = field(default_factory=list)
    checked_at: datetime = field(default_factory=datetime.now)
    statistics: dict[str, Any] = field(default_factory=dict)

    @property
    def has_anomalies(self) -> bool:
        """True when at least one recorded result is flagged anomalous."""
        for result in self.anomalies:
            if result.is_anomaly:
                return True
        return False

    @property
    def anomaly_count(self) -> int:
        """Number of recorded results flagged as anomalous."""
        return len(self.get_anomalies())

    def get_anomalies(self) -> list[AnomalyResult]:
        """Return only the results that were flagged as anomalous."""
        return [result for result in self.anomalies if result.is_anomaly]

    def summary(self) -> str:
        """Render a short, human-readable summary of the report."""
        flagged = self.get_anomalies()
        if not flagged:
            return "No anomalies detected."

        lines = [f"Detected {len(flagged)} anomalies:"]
        for anomaly in flagged:
            col_str = f"[{anomaly.column}]" if anomaly.column else "[table]"
            lines.append(f" ⚠️ {col_str} {anomaly.message}")

        return "\n".join(lines)
100
+
101
+
102
class AnomalyDetector:
    """Detects anomalies in datasets.

    Runs per-column value-outlier checks (via a pluggable statistical
    method), null-percentage spike checks, and a basic row-count check,
    aggregating everything into a DatasetAnomalyReport.
    """

    def __init__(
        self,
        method: str = "zscore",
        threshold: float | None = None,
        **method_kwargs
    ):
        """Initialize detector.

        Args:
            method: Detection method ("zscore", "iqr", "percent_change")
            threshold: Detection threshold (method-specific default if None)
            **method_kwargs: Additional method parameters
        """
        self.method_name = method

        # Fall back to a sensible per-method default threshold.
        if threshold is None:
            defaults = {"zscore": 3.0, "iqr": 1.5, "percent_change": 0.2}
            threshold = defaults.get(method, 3.0)

        self.threshold = threshold
        self.method_kwargs = method_kwargs

    def detect(
        self,
        dataset: Dataset,
        columns: list[str] | None = None,
        include_row_count: bool = True,
        include_null_check: bool = True,
    ) -> DatasetAnomalyReport:
        """Detect anomalies in a dataset.

        Args:
            dataset: Dataset to analyze
            columns: Specific columns to check (None = all numeric)
            include_row_count: Check for row count anomalies
            include_null_check: Check for null percentage spikes

        Returns:
            DatasetAnomalyReport
        """
        report = DatasetAnomalyReport(source=dataset.source)

        # Determine columns to check.
        if columns is None:
            columns = self._get_numeric_columns(dataset)

        # Value checks: every result is recorded, anomalous or not, so
        # callers can inspect the scores of healthy columns too.
        for col_name in columns:
            result = self.detect_column(dataset, col_name)
            report.anomalies.append(result)

        # Null checks: only anomalous results are recorded.
        if include_null_check:
            for col_name in dataset.columns:
                null_result = self._check_null_anomaly(dataset, col_name)
                if null_result.is_anomaly:
                    report.anomalies.append(null_result)

        # Row-count check. BUG FIX: this flag was previously accepted and
        # documented but never acted upon. Without historical counts only
        # the degenerate case (an empty dataset) can be flagged reliably.
        if include_row_count:
            volume_result = self._check_volume_anomaly(dataset)
            if volume_result.is_anomaly:
                report.anomalies.append(volume_result)

        report.statistics = {
            "columns_checked": len(columns),
            "method": self.method_name,
            "threshold": self.threshold,
        }

        return report

    def detect_column(
        self,
        dataset: Dataset,
        column: str,
        baseline_values: list[float] | None = None
    ) -> AnomalyResult:
        """Detect anomalies in a specific column.

        Args:
            dataset: Dataset to analyze
            column: Column name
            baseline_values: Historical values for comparison

        Returns:
            AnomalyResult (is_anomaly=False with an explanatory message
            when the column is non-numeric or analysis fails)
        """
        col = dataset[column]

        try:
            # A None mean signals a non-numeric column: nothing to score.
            mean = col.mean
            if mean is None:
                return AnomalyResult(
                    column=column,
                    anomaly_type=AnomalyType.VALUE_OUTLIER,
                    is_anomaly=False,
                    score=0.0,
                    threshold=self.threshold,
                    message=f"Column '{column}' is not numeric",
                    details={"reason": "not_numeric"},
                )

            min_val = col.min
            max_val = col.max
            stddev = col.stddev or 0

            if baseline_values:
                # Fit the configured method on the historical values, then
                # score the current summary statistics against them.
                # (Method construction hoisted here: the no-baseline branch
                # below never uses it.)
                method = create_method(
                    self.method_name,
                    threshold=self.threshold,
                    **self.method_kwargs
                )
                method.fit(baseline_values)

                # Score current values (using min, max, mean as representatives).
                scores = [
                    method.score(min_val),
                    method.score(max_val),
                    method.score(mean),
                ]

                # Keep the most extreme score.
                worst = max(scores, key=lambda s: s.score)
                is_anomaly = worst.is_anomaly

            else:
                # No baseline: fall back to z-scoring the column's own
                # extremes against its mean/stddev.
                if stddev > 0:
                    z_min = abs(min_val - mean) / stddev if min_val is not None else 0
                    z_max = abs(max_val - mean) / stddev if max_val is not None else 0

                    worst_z = max(z_min, z_max)
                    is_anomaly = worst_z > self.threshold

                    worst = AnomalyScore(
                        value=max_val if z_max > z_min else min_val,
                        score=worst_z,
                        is_anomaly=is_anomaly,
                        threshold=self.threshold,
                        details={
                            "mean": mean,
                            "stddev": stddev,
                            "min": min_val,
                            "max": max_val,
                        }
                    )
                else:
                    # Constant column (or stddev unavailable): never anomalous.
                    is_anomaly = False
                    worst = AnomalyScore(
                        value=mean,
                        score=0.0,
                        is_anomaly=False,
                        threshold=self.threshold,
                    )

            # Build result.
            message = self._build_message(column, worst, mean, stddev)

            return AnomalyResult(
                column=column,
                anomaly_type=AnomalyType.VALUE_OUTLIER,
                is_anomaly=is_anomaly,
                score=worst.score,
                threshold=self.threshold,
                message=message,
                details={
                    "mean": mean,
                    "stddev": stddev,
                    "min": min_val,
                    "max": max_val,
                    "method": self.method_name,
                    **worst.details,
                },
                samples=[worst.value] if is_anomaly else [],
            )

        except Exception as e:
            # Best-effort by design: analysis errors become a non-anomalous
            # result carrying the error, rather than aborting the whole scan.
            return AnomalyResult(
                column=column,
                anomaly_type=AnomalyType.VALUE_OUTLIER,
                is_anomaly=False,
                score=0.0,
                threshold=self.threshold,
                message=f"Error analyzing column '{column}': {e}",
                details={"error": str(e)},
            )

    def _check_null_anomaly(
        self,
        dataset: Dataset,
        column: str,
        expected_null_pct: float = 5.0
    ) -> AnomalyResult:
        """Check for unusual null percentages in one column."""
        col = dataset[column]
        null_pct = col.null_percent

        # Anomalous only when null % is well above expectation AND above an
        # absolute 10% floor, to avoid flagging tiny fluctuations.
        is_anomaly = null_pct > expected_null_pct * 2 and null_pct > 10

        return AnomalyResult(
            column=column,
            anomaly_type=AnomalyType.NULL_SPIKE,
            is_anomaly=is_anomaly,
            score=null_pct,
            threshold=expected_null_pct,
            message=f"Column '{column}' has {null_pct:.1f}% nulls (threshold: {expected_null_pct}%)",
            details={
                "null_percent": null_pct,
                "null_count": col.null_count,
                "expected_max": expected_null_pct,
            },
        )

    def _check_volume_anomaly(self, dataset: Dataset) -> AnomalyResult:
        """Check the dataset's row count.

        With no historical counts available, only the degenerate case
        (zero rows) is flagged; a non-empty dataset is reported as normal.
        """
        row_count = dataset.row_count
        is_anomaly = row_count == 0

        return AnomalyResult(
            column=None,  # table-level check
            anomaly_type=AnomalyType.VOLUME_ANOMALY,
            is_anomaly=is_anomaly,
            score=1.0 if is_anomaly else 0.0,
            threshold=1.0,
            message=(
                "Dataset is empty (0 rows)"
                if is_anomaly
                else f"Dataset has {row_count} rows"
            ),
            details={"row_count": row_count},
        )

    def _get_numeric_columns(self, dataset: Dataset) -> list[str]:
        """Get list of numeric columns (those with a computable mean)."""
        numeric_cols = []
        for col_name in dataset.columns:
            col = dataset[col_name]
            try:
                if col.mean is not None:
                    numeric_cols.append(col_name)
            except Exception:
                # Non-numeric or failing columns are simply skipped.
                pass
        return numeric_cols

    def _build_message(
        self,
        column: str,
        worst: AnomalyScore,
        mean: float,
        stddev: float
    ) -> str:
        """Build a human-readable message for a column value check."""
        if not worst.is_anomaly:
            return f"Column '{column}' values are within normal range"

        # deviation_direction is only present when the detection method
        # populated it; otherwise fall back to a generic message.
        direction = worst.details.get("deviation_direction", "")
        if direction == "above":
            return f"Column '{column}' has unusually high values (max: {worst.value:.2f}, mean: {mean:.2f})"
        elif direction == "below":
            return f"Column '{column}' has unusually low values (min: {worst.value:.2f}, mean: {mean:.2f})"
        else:
            return f"Column '{column}' has anomalous values (score: {worst.score:.2f})"
351
+
352
+
353
def detect_anomalies(
    dataset: Dataset,
    method: str = "zscore",
    threshold: float | None = None,
    columns: list[str] | None = None,
    **method_kwargs,
) -> DatasetAnomalyReport:
    """Detect anomalies in a dataset.

    Convenience wrapper around AnomalyDetector.detect().

    Args:
        dataset: Dataset to analyze
        method: Detection method ("zscore", "iqr", "percent_change")
        threshold: Detection threshold (method-specific default if None)
        columns: Columns to check (None = all numeric)
        **method_kwargs: Extra method parameters, forwarded to the
            detector (previously there was no way to pass these through).

    Returns:
        DatasetAnomalyReport
    """
    detector = AnomalyDetector(method=method, threshold=threshold, **method_kwargs)
    return detector.detect(dataset, columns=columns)
372
+
373
+
374
def detect_column_anomalies(
    dataset: Dataset,
    column: str,
    method: str = "zscore",
    threshold: float | None = None,
    baseline: list[float] | None = None,
    **method_kwargs,
) -> AnomalyResult:
    """Detect anomalies in a specific column.

    Convenience wrapper around AnomalyDetector.detect_column().

    Args:
        dataset: Dataset to analyze
        column: Column name
        method: Detection method ("zscore", "iqr", "percent_change")
        threshold: Detection threshold (method-specific default if None)
        baseline: Historical values for comparison
        **method_kwargs: Extra method parameters, forwarded to the
            detector (previously there was no way to pass these through).

    Returns:
        AnomalyResult
    """
    detector = AnomalyDetector(method=method, threshold=threshold, **method_kwargs)
    return detector.detect_column(dataset, column, baseline_values=baseline)