duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
"""Anomaly detection methods for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Implements various statistical methods for detecting anomalies in data.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any
|
|
11
|
+
import math
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class AnomalyScore:
    """Outcome of scoring one value with an anomaly detection method.

    Attributes:
        value: The value that was scored, stored as given.
        score: Anomaly score; a larger number means more anomalous.
        is_anomaly: Whether the value was classified as anomalous.
        threshold: The threshold the classification was made against.
        details: Extra, method-specific diagnostics.
    """

    value: Any
    score: float
    is_anomaly: bool
    threshold: float
    # default_factory builds a new dict per instance, so instances never
    # share one mutable default.
    details: dict[str, Any] = field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AnomalyMethod(ABC):
    """Abstract interface implemented by every anomaly detection method.

    Concrete subclasses provide ``name``, ``fit`` and ``score``; ``detect``
    is implemented here on top of ``score``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the method's name."""

    @abstractmethod
    def fit(self, values: list[float]) -> None:
        """Learn the method's parameters from historical data.

        Args:
            values: Numeric values to learn from.
        """

    @abstractmethod
    def score(self, value: float) -> AnomalyScore:
        """Score one value.

        Args:
            value: The value to score.

        Returns:
            The AnomalyScore computed for ``value``.
        """

    def detect(self, values: list[float]) -> list[AnomalyScore]:
        """Score every value in ``values``.

        Args:
            values: The values to check.

        Returns:
            One AnomalyScore per input value, in input order.
        """
        return list(map(self.score, values))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class ZScoreMethod(AnomalyMethod):
    """Z-Score based anomaly detection.

    Detects values that are many standard deviations from the mean.
    Good for normally distributed data.
    """

    def __init__(self, threshold: float = 3.0):
        """Initialize Z-Score method.

        Args:
            threshold: Number of standard deviations to consider anomalous
        """
        self.threshold = threshold
        # Parameters used until fit() has seen usable data.
        self._mean: float = 0.0
        self._std: float = 1.0
        self._fitted = False

    @property
    def name(self) -> str:
        """Method name."""
        return "zscore"

    def fit(self, values: list[float]) -> None:
        """Fit to data by computing the mean and sample standard deviation.

        ``None`` and NaN entries are ignored. When no usable value remains,
        the current parameters are left untouched and the method is not
        marked as fitted. With a single value, or when all values are equal,
        the standard deviation falls back to 1.0.

        Args:
            values: List of numeric values to learn from
        """
        if not values:
            return

        clean_values = [v for v in values if v is not None and not math.isnan(v)]
        if not clean_values:
            return

        n = len(clean_values)
        lowest = min(clean_values)
        highest = max(clean_values)
        mean = sum(clean_values) / n
        # Floating-point accumulation can leave the computed mean a hair
        # outside [lowest, highest] (e.g. ten copies of 0.1 average to
        # 0.09999999999999999). Without the clamp, constant data produces a
        # tiny non-zero variance, the zero-variance fallback below is skipped
        # and every other value gets an astronomically large z-score.
        self._mean = min(max(mean, lowest), highest)

        if n > 1:
            # Sample variance (n - 1 in the denominator).
            variance = sum((x - self._mean) ** 2 for x in clean_values) / (n - 1)
            self._std = math.sqrt(variance) if variance > 0 else 1.0
        else:
            self._std = 1.0

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a value using z-score.

        Args:
            value: Value to score

        Returns:
            AnomalyScore whose score is the absolute z-score of ``value``.
            ``None`` and NaN inputs score 0.0 and are never anomalous.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.threshold,
                details={"reason": "null_or_nan"},
            )

        # fit() never stores a zero std; this guard only protects against
        # the attribute having been set to 0 by other code.
        if self._std == 0:
            z_score = 0.0
        else:
            z_score = abs((value - self._mean) / self._std)

        is_anomaly = z_score > self.threshold

        # A value exactly at the mean deviates in neither direction.
        if value > self._mean:
            direction = "above"
        elif value < self._mean:
            direction = "below"
        else:
            direction = "none"

        return AnomalyScore(
            value=value,
            score=z_score,
            is_anomaly=is_anomaly,
            threshold=self.threshold,
            details={
                "mean": self._mean,
                "std": self._std,
                "z_score": z_score,
                "deviation_direction": direction,
            },
        )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class IQRMethod(AnomalyMethod):
    """Interquartile Range based anomaly detection.

    Detects values outside the typical range defined by quartiles.
    More robust to outliers than z-score.
    """

    def __init__(self, multiplier: float = 1.5):
        """Initialize IQR method.

        Args:
            multiplier: IQR multiplier for bounds (1.5 = outlier, 3.0 = extreme)
        """
        self.multiplier = multiplier
        self._q1: float = 0.0
        self._q3: float = 0.0
        self._iqr: float = 0.0
        # Until fit() runs the bounds are infinite, so nothing is flagged.
        self._lower_bound: float = float('-inf')
        self._upper_bound: float = float('inf')
        self._fitted = False

    @property
    def name(self) -> str:
        """Method name."""
        return "iqr"

    def fit(self, values: list[float]) -> None:
        """Fit to data by computing quartiles and the outlier bounds.

        ``None`` and NaN entries are ignored. When no usable value remains,
        the current parameters are left untouched and the method is not
        marked as fitted.

        Args:
            values: List of numeric values to learn from
        """
        clean_values = sorted(v for v in values if v is not None and not math.isnan(v))
        if not clean_values:
            return

        # Calculate Q1 and Q3
        self._q1 = self._percentile(clean_values, 25)
        self._q3 = self._percentile(clean_values, 75)
        self._iqr = self._q3 - self._q1

        # Calculate bounds
        self._lower_bound = self._q1 - (self.multiplier * self._iqr)
        self._upper_bound = self._q3 + (self.multiplier * self._iqr)

        self._fitted = True

    def _percentile(self, sorted_values: list[float], p: float) -> float:
        """Calculate a percentile of already sorted values.

        The fractional rank is ``(n - 1) * p / 100``; when it falls between
        two positions the result is linearly interpolated between them.

        Args:
            sorted_values: Non-empty list of values in ascending order
            p: Percentile to compute, on a 0-100 scale

        Returns:
            The value at the requested percentile.
        """
        n = len(sorted_values)
        k = (n - 1) * p / 100
        f = math.floor(k)
        c = math.ceil(k)

        if f == c:
            return sorted_values[int(k)]

        return sorted_values[int(f)] * (c - k) + sorted_values[int(c)] * (k - f)

    def score(self, value: float) -> AnomalyScore:
        """Score a value using IQR method.

        The score is the distance beyond the violated bound, expressed in
        IQRs; values inside the bounds score 0.0.

        Args:
            value: Value to score

        Returns:
            AnomalyScore for the value. ``None`` and NaN inputs score 0.0
            and are never anomalous.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.multiplier,
                details={"reason": "null_or_nan"},
            )

        # With a zero IQR (e.g. constant fitted data) dividing is impossible.
        # Fall back to unit scale so a flagged value still gets a positive
        # score that grows with its distance, instead of a score of 0.
        scale = self._iqr if self._iqr > 0 else 1.0

        # Calculate how many IQRs away from bounds
        if value < self._lower_bound:
            distance = (self._lower_bound - value) / scale
            is_anomaly = True
            direction = "below"
        elif value > self._upper_bound:
            distance = (value - self._upper_bound) / scale
            is_anomaly = True
            direction = "above"
        else:
            distance = 0.0
            is_anomaly = False
            direction = "within"

        return AnomalyScore(
            value=value,
            score=distance,
            is_anomaly=is_anomaly,
            threshold=self.multiplier,
            details={
                "q1": self._q1,
                "q3": self._q3,
                "iqr": self._iqr,
                "lower_bound": self._lower_bound,
                "upper_bound": self._upper_bound,
                "direction": direction,
            },
        )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class PercentChangeMethod(AnomalyMethod):
    """Percent change based anomaly detection.

    Detects values that differ significantly from a baseline.
    Useful for monitoring metrics over time.
    """

    def __init__(self, threshold: float = 0.2, baseline_type: str = "mean"):
        """Initialize percent change method.

        Args:
            threshold: Maximum allowed percent change (0.2 = 20%)
            baseline_type: How to calculate baseline ("mean", "median", "last").
                Any other value is treated like "mean".
        """
        self.threshold = threshold
        self.baseline_type = baseline_type
        self._baseline: float = 0.0
        self._fitted = False

    @property
    def name(self) -> str:
        """Method name."""
        return "percent_change"

    @staticmethod
    def _median(values: list[float]) -> float:
        """Return the median of a non-empty list of values."""
        ordered = sorted(values)
        mid = len(ordered) // 2
        if len(ordered) % 2 == 0:
            return (ordered[mid - 1] + ordered[mid]) / 2
        return ordered[mid]

    def fit(self, values: list[float]) -> None:
        """Fit to data by computing baseline.

        ``None`` and NaN entries are ignored. When no usable value remains,
        the current baseline is left untouched and the method is not marked
        as fitted.

        Args:
            values: List of numeric values to learn from
        """
        clean_values = [v for v in values if v is not None and not math.isnan(v)]
        if not clean_values:
            return

        if self.baseline_type == "median":
            self._baseline = self._median(clean_values)
        elif self.baseline_type == "last":
            self._baseline = clean_values[-1]
        else:
            # "mean" and any unrecognised baseline_type both use the mean.
            self._baseline = sum(clean_values) / len(clean_values)

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a value based on percent change from baseline.

        Args:
            value: Value to score

        Returns:
            AnomalyScore whose score is the absolute relative change from
            the baseline. ``None`` and NaN inputs score 0.0 and are never
            anomalous.
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.threshold,
                details={"reason": "null_or_nan"},
            )

        if self._baseline == 0:
            # Avoid division by zero: any move away from a zero baseline is
            # an unbounded relative change.
            pct_change = float('inf') if value != 0 else 0.0
        else:
            pct_change = abs(value - self._baseline) / abs(self._baseline)

        is_anomaly = pct_change > self.threshold

        # A value equal to the baseline is neither an increase nor a decrease.
        if value > self._baseline:
            change_direction = "increase"
        elif value < self._baseline:
            change_direction = "decrease"
        else:
            change_direction = "unchanged"

        return AnomalyScore(
            value=value,
            score=pct_change,
            is_anomaly=is_anomaly,
            threshold=self.threshold,
            details={
                "baseline": self._baseline,
                "baseline_type": self.baseline_type,
                "percent_change": pct_change,
                "change_direction": change_direction,
            },
        )
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class ModifiedZScoreMethod(AnomalyMethod):
    """Anomaly detection with the modified z-score (median and MAD).

    The Median Absolute Deviation takes the place of the standard deviation,
    which makes the score more robust than the standard z-score on
    non-normal data.
    """

    def __init__(self, threshold: float = 3.5):
        """Set up the method.

        Args:
            threshold: Modified z-score above which a value is anomalous
        """
        self.threshold = threshold
        self._median: float = 0.0
        self._mad: float = 1.0
        self._fitted = False

    @property
    def name(self) -> str:
        """Method name."""
        return "modified_zscore"

    @staticmethod
    def _middle(ordered: list[float]) -> float:
        """Return the median of a non-empty, ascending list."""
        half, is_odd = divmod(len(ordered), 2)
        if is_odd:
            return ordered[half]
        return (ordered[half - 1] + ordered[half]) / 2

    def fit(self, values: list[float]) -> None:
        """Learn the median and the MAD from ``values``.

        ``None`` and NaN entries are skipped; if nothing usable is left the
        method returns without changing its state.
        """
        usable = sorted(
            item for item in values if item is not None and not math.isnan(item)
        )
        if not usable:
            return

        # Median of the data itself.
        self._median = self._middle(usable)

        # MAD: the median of the absolute deviations from that median.
        spread = self._middle(sorted(abs(item - self._median) for item in usable))

        # A zero MAD would make the score undefined, so use 1.0 instead.
        self._mad = 1.0 if spread == 0 else spread

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Compute the modified z-score of ``value``."""
        missing = value is None or math.isnan(value)
        if missing:
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.threshold,
                details={"reason": "null_or_nan"},
            )

        # Modified z-score: 0.6745 * |x - median| / MAD
        modified_z = 0.6745 * abs(value - self._median) / self._mad

        return AnomalyScore(
            value=value,
            score=modified_z,
            is_anomaly=modified_z > self.threshold,
            threshold=self.threshold,
            details={
                "median": self._median,
                "mad": self._mad,
                "modified_z_score": modified_z,
            },
        )
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
# Factory for creating methods
|
|
405
|
+
def create_method(
    method_name: str,
    **kwargs
) -> AnomalyMethod:
    """Build the anomaly detection method registered under ``method_name``.

    The lookup is case-insensitive and several aliases map to the same class.

    Args:
        method_name: Name (or alias) of the method
        **kwargs: Passed unchanged to the method's constructor

    Returns:
        Configured AnomalyMethod

    Raises:
        ValueError: If ``method_name`` is not a known method name.
    """
    registry = {
        **dict.fromkeys(("zscore", "z_score"), ZScoreMethod),
        "iqr": IQRMethod,
        **dict.fromkeys(("percent_change", "pct_change"), PercentChangeMethod),
        **dict.fromkeys(("modified_zscore", "mad"), ModifiedZScoreMethod),
    }

    key = method_name.lower()
    if key not in registry:
        raise ValueError(f"Unknown anomaly method: {method_name}")

    chosen = registry[key]
    return chosen(**kwargs)
|