duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,432 @@
1
+ """Anomaly detection methods for DuckGuard.
2
+
3
+ Implements various statistical methods for detecting anomalies in data.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass, field
10
+ from typing import Any
11
+ import math
12
+
13
+
14
@dataclass
class AnomalyScore:
    """Outcome of scoring one value with an anomaly detection method.

    Attributes:
        value: The value that was scored, stored exactly as it was passed in.
        score: Degree of anomaly; a larger number means more anomalous.
        is_anomaly: True when the method classified the value as anomalous.
        threshold: Cut-off the method used to make that classification.
        details: Method-specific diagnostics; a fresh empty dict by default.
    """

    value: Any
    score: float
    is_anomaly: bool
    threshold: float
    # default_factory gives every instance its own dict (no shared mutable default).
    details: dict[str, Any] = field(default_factory=dict)
31
+
32
+
33
class AnomalyMethod(ABC):
    """Common interface implemented by every anomaly detection method.

    Subclasses supply ``name``, ``fit`` and ``score``; ``detect`` is provided
    here and simply scores each value in turn.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier of the method."""

    @abstractmethod
    def fit(self, values: list[float]) -> None:
        """Learn whatever statistics the method needs from historical data.

        Args:
            values: Numeric values to learn from.
        """

    @abstractmethod
    def score(self, value: float) -> AnomalyScore:
        """Evaluate one value.

        Args:
            value: The value to evaluate.

        Returns:
            The AnomalyScore computed for ``value``.
        """

    def detect(self, values: list[float]) -> list[AnomalyScore]:
        """Score every value in ``values``.

        Args:
            values: Values to check.

        Returns:
            One ``score`` result per input value, in input order.
        """
        return list(map(self.score, values))
73
+
74
+
75
class ZScoreMethod(AnomalyMethod):
    """Anomaly detection based on the z-score.

    A value is flagged when its absolute distance from the fitted mean,
    measured in fitted standard deviations, is strictly greater than
    ``threshold``. Good for normally distributed data.

    Until ``fit`` has seen at least one clean value, scoring uses the
    constructor defaults (mean 0.0, standard deviation 1.0).
    """

    def __init__(self, threshold: float = 3.0):
        """Set up the method.

        Args:
            threshold: Number of standard deviations beyond which a value is
                considered anomalous.
        """
        self.threshold = threshold
        self._mean: float = 0.0
        self._std: float = 1.0
        self._fitted = False

    @property
    def name(self) -> str:
        """Identifier of this method: ``"zscore"``."""
        return "zscore"

    def fit(self, values: list[float]) -> None:
        """Learn the mean and sample standard deviation of ``values``.

        ``None`` and NaN entries are ignored. When nothing usable remains the
        previously stored statistics are left untouched.

        Args:
            values: Numeric values to learn from.
        """
        if not values:
            return

        usable = [item for item in values if not (item is None or math.isnan(item))]
        if not usable:
            return

        count = len(usable)
        mean = sum(usable) / count
        self._mean = mean

        # A single observation or zero variance gives no usable spread; a
        # standard deviation of 1.0 is used instead so scoring never divides
        # by zero.
        std = 1.0
        if count > 1:
            # Sample variance (n - 1 denominator).
            variance = sum((item - mean) ** 2 for item in usable) / (count - 1)
            if variance > 0:
                std = math.sqrt(variance)
        self._std = std

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score ``value`` by its absolute z-score.

        Args:
            value: The value to evaluate; ``None`` and NaN are never flagged
                and get a score of 0.0.

        Returns:
            AnomalyScore whose ``score`` is the absolute z-score and whose
            ``details`` hold the mean, standard deviation, z-score and the
            side of the mean the value falls on.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.threshold,
                details={"reason": "null_or_nan"},
            )

        z_score = 0.0 if self._std == 0 else abs((value - self._mean) / self._std)

        # Anything not strictly above the mean (including a value equal to
        # it) is labelled "below".
        side = "above" if value > self._mean else "below"

        return AnomalyScore(
            value=value,
            score=z_score,
            is_anomaly=z_score > self.threshold,
            threshold=self.threshold,
            details={
                "mean": self._mean,
                "std": self._std,
                "z_score": z_score,
                "deviation_direction": side,
            },
        )
147
+
148
+
149
class IQRMethod(AnomalyMethod):
    """Interquartile-range (IQR) based anomaly detection.

    ``fit`` learns the first and third quartiles of the clean training values
    and derives the bounds ``q1 - multiplier * iqr`` and
    ``q3 + multiplier * iqr``. ``score`` flags every value lying strictly
    outside those bounds. More robust to outliers than z-score.

    Until ``fit`` has seen at least one clean value the bounds are
    -inf / +inf, so no value is flagged.
    """

    def __init__(self, multiplier: float = 1.5):
        """Initialize IQR method.

        Args:
            multiplier: IQR multiplier for bounds (1.5 = outlier, 3.0 = extreme)
        """
        self.multiplier = multiplier
        self._q1: float = 0.0
        self._q3: float = 0.0
        self._iqr: float = 0.0
        self._lower_bound: float = float('-inf')
        self._upper_bound: float = float('inf')
        self._fitted = False

    @property
    def name(self) -> str:
        """Identifier of this method: ``"iqr"``."""
        return "iqr"

    def fit(self, values: list[float]) -> None:
        """Fit to data by computing quartiles and the outlier bounds.

        ``None`` and NaN entries are ignored. When nothing usable remains the
        previously stored statistics are left untouched.

        Args:
            values: Numeric values to learn from.
        """
        clean_values = sorted(v for v in values if v is not None and not math.isnan(v))
        if not clean_values:
            return

        # Calculate Q1 and Q3
        self._q1 = self._percentile(clean_values, 25)
        self._q3 = self._percentile(clean_values, 75)
        self._iqr = self._q3 - self._q1

        # Calculate bounds
        self._lower_bound = self._q1 - (self.multiplier * self._iqr)
        self._upper_bound = self._q3 + (self.multiplier * self._iqr)

        self._fitted = True

    def _percentile(self, sorted_values: list[float], p: float) -> float:
        """Return the ``p``-th percentile of already sorted, non-empty values.

        The fractional rank ``(n - 1) * p / 100`` is located and, when it
        falls between two positions, the two neighbouring values are linearly
        interpolated.

        Args:
            sorted_values: Non-empty list sorted in ascending order.
            p: Percentile to compute, on a 0-100 scale.

        Returns:
            The (possibly interpolated) percentile value.
        """
        n = len(sorted_values)
        k = (n - 1) * p / 100
        f = math.floor(k)
        c = math.ceil(k)

        if f == c:
            # Rank is a whole number: no interpolation needed.
            return sorted_values[int(k)]

        return sorted_values[int(f)] * (c - k) + sorted_values[int(c)] * (k - f)

    def score(self, value: float) -> AnomalyScore:
        """Score a value using IQR method.

        Args:
            value: The value to evaluate; ``None`` and NaN are never flagged
                and get a score of 0.0.

        Returns:
            AnomalyScore whose ``score`` is the distance beyond the violated
            bound expressed in IQRs (0.0 for values within the bounds) and
            whose ``details`` hold the quartiles, IQR, bounds and direction.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.multiplier,
                details={"reason": "null_or_nan"},
            )

        # Calculate how many IQRs away from bounds.
        # NOTE(review): when the IQR is zero (e.g. constant training data) a
        # value outside the bounds is still flagged but its score stays 0.0,
        # because the distance cannot be expressed in IQR units.
        if value < self._lower_bound:
            distance = (self._lower_bound - value) / self._iqr if self._iqr > 0 else 0.0
            is_anomaly = True
            direction = "below"
        elif value > self._upper_bound:
            distance = (value - self._upper_bound) / self._iqr if self._iqr > 0 else 0.0
            is_anomaly = True
            direction = "above"
        else:
            distance = 0.0
            is_anomaly = False
            direction = "within"

        return AnomalyScore(
            value=value,
            score=distance,
            is_anomaly=is_anomaly,
            threshold=self.multiplier,
            details={
                "q1": self._q1,
                "q3": self._q3,
                "iqr": self._iqr,
                "lower_bound": self._lower_bound,
                "upper_bound": self._upper_bound,
                "direction": direction,
            },
        )
244
+
245
+
246
class PercentChangeMethod(AnomalyMethod):
    """Anomaly detection based on relative change from a baseline.

    A value is flagged when its relative distance from the fitted baseline is
    strictly greater than ``threshold``. Useful for monitoring metrics over
    time.

    Until ``fit`` has seen at least one clean value the baseline is 0.0.
    """

    def __init__(self, threshold: float = 0.2, baseline_type: str = "mean"):
        """Set up the method.

        Args:
            threshold: Maximum allowed relative change (0.2 = 20%).
            baseline_type: How the baseline is derived from the fitted data:
                "mean", "median" or "last". Any other value is treated like
                "mean" by ``fit``.
        """
        self.threshold = threshold
        self.baseline_type = baseline_type
        self._baseline: float = 0.0
        self._fitted = False

    @property
    def name(self) -> str:
        """Identifier of this method: ``"percent_change"``."""
        return "percent_change"

    def fit(self, values: list[float]) -> None:
        """Derive the baseline from ``values``.

        ``None`` and NaN entries are ignored. When nothing usable remains the
        previously stored baseline is left untouched.

        Args:
            values: Numeric values to learn from, in their original order
                ("last" picks the final clean entry).
        """
        usable = [item for item in values if item is not None and not math.isnan(item)]
        if not usable:
            return

        kind = self.baseline_type
        if kind == "median":
            ordered = sorted(usable)
            half, is_odd = divmod(len(ordered), 2)
            if is_odd:
                self._baseline = ordered[half]
            else:
                # Even count: average the two middle values.
                self._baseline = (ordered[half - 1] + ordered[half]) / 2
        elif kind == "last":
            self._baseline = usable[-1]
        else:
            # "mean", and also the fallback for any unrecognised baseline_type.
            self._baseline = sum(usable) / len(usable)

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score ``value`` by its relative change from the baseline.

        Args:
            value: The value to evaluate; ``None`` and NaN are never flagged
                and get a score of 0.0.

        Returns:
            AnomalyScore whose ``score`` is the absolute relative change and
            whose ``details`` hold the baseline, baseline type, change and
            its direction.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.threshold,
                details={"reason": "null_or_nan"},
            )

        baseline = self._baseline
        if baseline != 0:
            pct_change = abs(value - baseline) / abs(baseline)
        elif value != 0:
            # Zero baseline: any non-zero value is an unbounded relative change.
            pct_change = float('inf')
        else:
            pct_change = 0

        # Anything not strictly above the baseline (including a value equal
        # to it) is labelled "decrease".
        trend = "increase" if value > baseline else "decrease"

        return AnomalyScore(
            value=value,
            score=pct_change,
            is_anomaly=pct_change > self.threshold,
            threshold=self.threshold,
            details={
                "baseline": baseline,
                "baseline_type": self.baseline_type,
                "percent_change": pct_change,
                "change_direction": trend,
            },
        )
322
+
323
+
324
class ModifiedZScoreMethod(AnomalyMethod):
    """Modified z-score built on the median and the MAD.

    Uses the Median Absolute Deviation instead of the standard deviation,
    which makes it more robust than the standard z-score for non-normal
    distributions. A value is flagged when its modified z-score is strictly
    greater than ``threshold``.

    Until ``fit`` has seen at least one clean value, scoring uses the
    constructor defaults (median 0.0, MAD 1.0).
    """

    def __init__(self, threshold: float = 3.5):
        """Set up the method.

        Args:
            threshold: Modified z-score above which a value is anomalous.
        """
        self.threshold = threshold
        self._median: float = 0.0
        self._mad: float = 1.0
        self._fitted = False

    @property
    def name(self) -> str:
        """Identifier of this method: ``"modified_zscore"``."""
        return "modified_zscore"

    @staticmethod
    def _middle(ordered: list[float]) -> float:
        """Return the median of a non-empty, ascending-sorted list."""
        half, is_odd = divmod(len(ordered), 2)
        if is_odd:
            return ordered[half]
        # Even count: average the two middle values.
        return (ordered[half - 1] + ordered[half]) / 2

    def fit(self, values: list[float]) -> None:
        """Learn the median and MAD of ``values``.

        ``None`` and NaN entries are ignored. When nothing usable remains the
        previously stored statistics are left untouched.

        Args:
            values: Numeric values to learn from.
        """
        ordered = sorted(item for item in values if item is not None and not math.isnan(item))
        if not ordered:
            return

        center = self._middle(ordered)
        self._median = center

        # MAD: the median of the absolute deviations from the median.
        spread = self._middle(sorted(abs(item - center) for item in ordered))

        # A zero MAD would make scoring divide by zero; use 1.0 instead.
        self._mad = spread if spread != 0 else 1.0

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score ``value`` by its modified z-score.

        Args:
            value: The value to evaluate; ``None`` and NaN are never flagged
                and get a score of 0.0.

        Returns:
            AnomalyScore whose ``score`` is the modified z-score and whose
            ``details`` hold the median, MAD and that score.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.threshold,
                details={"reason": "null_or_nan"},
            )

        # Modified z-score: 0.6745 * |x - median| / MAD
        modified_z = 0.6745 * abs(value - self._median) / self._mad

        return AnomalyScore(
            value=value,
            score=modified_z,
            is_anomaly=modified_z > self.threshold,
            threshold=self.threshold,
            details={
                "median": self._median,
                "mad": self._mad,
                "modified_z_score": modified_z,
            },
        )
402
+
403
+
404
+ # Factory for creating methods
405
def create_method(
    method_name: str,
    **kwargs
) -> AnomalyMethod:
    """Build an anomaly detection method from its name.

    The lookup is case-insensitive. Accepted names are "zscore" / "z_score",
    "iqr", "percent_change" / "pct_change" and "modified_zscore" / "mad".

    Args:
        method_name: Name (or alias) of the method to create.
        **kwargs: Passed unchanged to the selected method's constructor.

    Returns:
        A newly constructed AnomalyMethod.

    Raises:
        ValueError: If ``method_name`` is not a known name or alias.
    """
    registry = {
        "zscore": ZScoreMethod,
        "z_score": ZScoreMethod,
        "iqr": IQRMethod,
        "percent_change": PercentChangeMethod,
        "pct_change": PercentChangeMethod,
        "modified_zscore": ModifiedZScoreMethod,
        "mad": ModifiedZScoreMethod,
    }

    key = method_name.lower()
    if key not in registry:
        # Report the name exactly as the caller supplied it.
        raise ValueError(f"Unknown anomaly method: {method_name}")

    return registry[key](**kwargs)
@@ -0,0 +1,5 @@
1
+ """CLI module for DuckGuard."""
2
+
3
+ from duckguard.cli.main import app
4
+
5
+ __all__ = ["app"]