duckguard-2.2.0-py3-none-any.whl → duckguard-3.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/cli/main.py +257 -2
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +851 -1
- duckguard/core/dataset.py +1035 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/executor.py +642 -0
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +54 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-3.0.0.dist-info/METADATA +1072 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/anomaly/ml_methods.py (new file)
@@ -0,0 +1,724 @@
"""ML-based anomaly detection methods for DuckGuard.

Provides advanced anomaly detection methods that learn from historical data
rather than requiring manual threshold configuration.
"""

from __future__ import annotations

import math
from dataclasses import dataclass, field
from typing import Any

from duckguard.anomaly.baselines import BaselineStorage
from duckguard.anomaly.methods import AnomalyMethod, AnomalyScore
from duckguard.history.storage import HistoryStorage


@dataclass
class BaselineComparison:
    """Result of comparing current data to baseline.

    Attributes:
        column_name: Name of the column
        metric: Metric being compared
        baseline_value: Stored baseline value
        current_value: Current value
        deviation: How far current deviates from baseline
        deviation_percent: Deviation as percentage
        is_anomalous: Whether this deviation is anomalous
        details: Additional comparison details
    """

    column_name: str
    metric: str
    baseline_value: float
    current_value: float
    deviation: float
    deviation_percent: float
    is_anomalous: bool
    details: dict[str, Any] = field(default_factory=dict)


@dataclass
class DistributionComparison:
    """Result of comparing distributions.

    Attributes:
        column_name: Name of the column
        statistic: Test statistic value
        p_value: P-value of the test
        is_drifted: Whether significant drift was detected
        method: Statistical test used
        details: Additional test details
    """

    column_name: str
    statistic: float
    p_value: float
    is_drifted: bool
    method: str
    details: dict[str, Any] = field(default_factory=dict)


class BaselineMethod(AnomalyMethod):
    """Detect anomalies by comparing to learned baseline.

    This method learns statistical properties from historical data and
    detects values that deviate significantly from the learned baseline.

    Usage:
        from duckguard.anomaly.ml_methods import BaselineMethod

        # Create method with storage
        method = BaselineMethod(sensitivity=2.0)

        # Fit to baseline data
        baseline_values = [100, 102, 98, 105, 97, 103]
        method.fit(baseline_values)

        # Score new values
        score = method.score(150)  # High score = anomalous
    """

    def __init__(
        self,
        storage: HistoryStorage | None = None,
        sensitivity: float = 2.0,
        min_samples: int = 5,
    ):
        """Initialize baseline method.

        Args:
            storage: Optional HistoryStorage for persisting baselines
            sensitivity: Number of standard deviations for anomaly threshold
            min_samples: Minimum samples needed before flagging anomalies
        """
        self._storage = storage
        self._baseline_storage = BaselineStorage(storage) if storage else None
        self.sensitivity = sensitivity
        self.min_samples = min_samples

        # Learned parameters
        self._mean: float = 0.0
        self._stddev: float = 1.0
        self._min: float = float('-inf')
        self._max: float = float('inf')
        self._sample_count: int = 0
        self._fitted = False

    @property
    def name(self) -> str:
        return "baseline"

    def fit(self, values: list[float]) -> None:
        """Learn baseline from values.

        Args:
            values: List of numeric values to learn from
        """
        clean = [v for v in values if v is not None and not math.isnan(v)]
        if not clean:
            return

        n = len(clean)
        self._mean = sum(clean) / n
        self._min = min(clean)
        self._max = max(clean)

        if n > 1:
            variance = sum((x - self._mean) ** 2 for x in clean) / (n - 1)
            self._stddev = math.sqrt(variance) if variance > 0 else 1.0
        else:
            self._stddev = 1.0

        self._sample_count = n
        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a value against the baseline.

        Args:
            value: Value to score

        Returns:
            AnomalyScore indicating how anomalous the value is
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "null_or_nan"}
            )

        # Not enough samples to determine anomaly
        if self._sample_count < self.min_samples:
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "insufficient_samples", "sample_count": self._sample_count}
            )

        # Calculate z-score deviation from baseline
        if self._stddev == 0:
            deviation = 0.0
        else:
            deviation = abs((value - self._mean) / self._stddev)

        is_anomaly = deviation > self.sensitivity

        return AnomalyScore(
            value=value,
            score=deviation,
            is_anomaly=is_anomaly,
            threshold=self.sensitivity,
            details={
                "baseline_mean": self._mean,
                "baseline_stddev": self._stddev,
                "baseline_min": self._min,
                "baseline_max": self._max,
                "deviation_stddevs": deviation,
                "sample_count": self._sample_count,
            }
        )

    def compare_to_baseline(
        self,
        values: list[float],
        metric: str = "mean",
    ) -> BaselineComparison:
        """Compare current values to stored baseline.

        Args:
            values: Current values to compare
            metric: Metric to compare (mean, stddev, min, max)

        Returns:
            BaselineComparison result
        """
        clean = [v for v in values if v is not None and not math.isnan(v)]
        if not clean:
            raise ValueError("No valid values to compare")

        # Calculate current metric
        if metric == "mean":
            current = sum(clean) / len(clean)
            baseline = self._mean
        elif metric == "stddev":
            current_mean = sum(clean) / len(clean)
            variance = sum((x - current_mean) ** 2 for x in clean) / (len(clean) - 1)
            current = math.sqrt(variance) if variance > 0 else 0.0
            baseline = self._stddev
        elif metric == "min":
            current = min(clean)
            baseline = self._min
        elif metric == "max":
            current = max(clean)
            baseline = self._max
        else:
            raise ValueError(f"Unknown metric: {metric}")

        deviation = abs(current - baseline)
        deviation_percent = (deviation / baseline * 100) if baseline != 0 else 0.0

        # Use sensitivity threshold for anomaly detection
        threshold = self.sensitivity * self._stddev if metric == "mean" else deviation_percent > 20

        is_anomalous = deviation > threshold if isinstance(threshold, float) else threshold

        return BaselineComparison(
            column_name="",  # Set by caller
            metric=metric,
            baseline_value=baseline,
            current_value=current,
            deviation=deviation,
            deviation_percent=deviation_percent,
            is_anomalous=is_anomalous,
            details={
                "sensitivity": self.sensitivity,
                "threshold": threshold if isinstance(threshold, float) else 20.0,
            }
        )

    def save_baseline(
        self,
        source: str,
        column_name: str,
    ) -> None:
        """Save learned baseline to storage.

        Args:
            source: Data source path
            column_name: Column name
        """
        if not self._baseline_storage:
            raise ValueError("No storage configured for baseline persistence")

        if not self._fitted:
            raise ValueError("Method not fitted - call fit() first")

        self._baseline_storage.store(source, column_name, "mean", self._mean, sample_size=self._sample_count)
        self._baseline_storage.store(source, column_name, "stddev", self._stddev, sample_size=self._sample_count)
        self._baseline_storage.store(source, column_name, "min", self._min, sample_size=self._sample_count)
        self._baseline_storage.store(source, column_name, "max", self._max, sample_size=self._sample_count)

    def load_baseline(
        self,
        source: str,
        column_name: str,
    ) -> bool:
        """Load baseline from storage.

        Args:
            source: Data source path
            column_name: Column name

        Returns:
            True if baseline was loaded, False if not found
        """
        if not self._baseline_storage:
            raise ValueError("No storage configured for baseline persistence")

        mean_bl = self._baseline_storage.get(source, column_name, "mean")
        if not mean_bl:
            return False

        self._mean = mean_bl.value
        self._sample_count = mean_bl.sample_size or 0

        stddev_bl = self._baseline_storage.get(source, column_name, "stddev")
        if stddev_bl:
            self._stddev = stddev_bl.value

        min_bl = self._baseline_storage.get(source, column_name, "min")
        if min_bl:
            self._min = min_bl.value

        max_bl = self._baseline_storage.get(source, column_name, "max")
        if max_bl:
            self._max = max_bl.value

        self._fitted = True
        return True


class KSTestMethod(AnomalyMethod):
    """Detect distribution drift using Kolmogorov-Smirnov test.

    This method compares the current data distribution to a baseline
    distribution and detects statistically significant differences.

    Usage:
        from duckguard.anomaly.ml_methods import KSTestMethod

        method = KSTestMethod(p_value_threshold=0.05)

        # Fit to baseline data
        baseline_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        method.fit(baseline_data)

        # Detect if new data has drifted
        new_data = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
        comparison = method.compare_distributions(new_data)
        if comparison.is_drifted:
            print(f"Distribution drift detected! p-value: {comparison.p_value}")
    """

    def __init__(self, p_value_threshold: float = 0.05):
        """Initialize KS test method.

        Args:
            p_value_threshold: P-value below which drift is detected
        """
        self.p_value_threshold = p_value_threshold
        self._baseline_values: list[float] = []
        self._baseline_ecdf: list[tuple[float, float]] = []
        self._fitted = False

    @property
    def name(self) -> str:
        return "ks_test"

    def fit(self, values: list[float]) -> None:
        """Learn baseline distribution.

        Args:
            values: List of numeric values for baseline
        """
        clean = sorted(v for v in values if v is not None and not math.isnan(v))
        if not clean:
            return

        self._baseline_values = clean
        self._baseline_ecdf = self._compute_ecdf(clean)
        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a single value (uses empirical CDF).

        For distribution testing, use compare_distributions() instead.

        Args:
            value: Value to score

        Returns:
            AnomalyScore based on position in baseline distribution
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.p_value_threshold,
                details={"reason": "null_or_nan"}
            )

        if not self._fitted:
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.p_value_threshold,
                details={"reason": "not_fitted"}
            )

        # Find percentile in baseline
        percentile = self._get_percentile(value)

        # Extreme percentiles indicate potential anomalies
        is_anomaly = percentile < 0.01 or percentile > 0.99

        return AnomalyScore(
            value=value,
            score=min(percentile, 1 - percentile),  # Distance from extremes
            is_anomaly=is_anomaly,
            threshold=0.01,
            details={
                "percentile": percentile,
                "baseline_size": len(self._baseline_values),
            }
        )

    def compare_distributions(
        self,
        current_values: list[float],
    ) -> DistributionComparison:
        """Compare current distribution to baseline using KS test.

        Args:
            current_values: Current values to compare

        Returns:
            DistributionComparison with test results
        """
        if not self._fitted:
            raise ValueError("Method not fitted - call fit() first")

        clean_current = sorted(v for v in current_values if v is not None and not math.isnan(v))
        if not clean_current:
            raise ValueError("No valid values to compare")

        # Compute KS statistic (two-sample)
        ks_stat, p_value = self._ks_two_sample(self._baseline_values, clean_current)

        return DistributionComparison(
            column_name="",  # Set by caller
            statistic=ks_stat,
            p_value=p_value,
            is_drifted=p_value < self.p_value_threshold,
            method="ks_test",
            details={
                "baseline_size": len(self._baseline_values),
                "current_size": len(clean_current),
                "threshold": self.p_value_threshold,
            }
        )

    def _compute_ecdf(self, sorted_values: list[float]) -> list[tuple[float, float]]:
        """Compute empirical CDF from sorted values."""
        n = len(sorted_values)
        return [(v, (i + 1) / n) for i, v in enumerate(sorted_values)]

    def _get_percentile(self, value: float) -> float:
        """Get percentile of value in baseline distribution."""
        if not self._baseline_values:
            return 0.5

        count_below = sum(1 for v in self._baseline_values if v <= value)
        return count_below / len(self._baseline_values)

    def _ks_two_sample(
        self,
        sample1: list[float],
        sample2: list[float],
    ) -> tuple[float, float]:
        """Compute two-sample KS test statistic and approximate p-value.

        Returns:
            Tuple of (KS statistic, approximate p-value)
        """
        n1, n2 = len(sample1), len(sample2)
        if n1 == 0 or n2 == 0:
            return 0.0, 1.0

        # Merge and sort all values with source labels
        combined = [(v, 1) for v in sample1] + [(v, 2) for v in sample2]
        combined.sort(key=lambda x: x[0])

        # Compute ECDFs and find max difference
        ecdf1, ecdf2 = 0.0, 0.0
        max_diff = 0.0

        for value, source in combined:
            if source == 1:
                ecdf1 += 1 / n1
            else:
                ecdf2 += 1 / n2
            max_diff = max(max_diff, abs(ecdf1 - ecdf2))

        # Approximate p-value using asymptotic formula
        # P(D > d) ≈ 2 * exp(-2 * n * d^2) where n = n1*n2/(n1+n2)
        n_effective = (n1 * n2) / (n1 + n2)
        p_value = 2 * math.exp(-2 * n_effective * max_diff ** 2)
        p_value = min(1.0, max(0.0, p_value))

        return max_diff, p_value


class SeasonalMethod(AnomalyMethod):
    """Detect anomalies accounting for seasonal patterns.

    This method learns typical values for different time periods
    (hour of day, day of week, etc.) and detects deviations from
    expected seasonal patterns.

    Usage:
        from duckguard.anomaly.ml_methods import SeasonalMethod

        method = SeasonalMethod(period="daily", sensitivity=2.0)

        # Fit with time-value pairs
        # values format: [(timestamp, value), ...]
        method.fit_with_timestamps(historical_data)

        # Score new values
        score = method.score_with_timestamp(new_timestamp, new_value)
    """

    PERIODS = {
        "hourly": 24,   # 24 buckets (hours of day)
        "daily": 7,     # 7 buckets (days of week)
        "weekly": 52,   # 52 buckets (weeks of year)
        "monthly": 12,  # 12 buckets (months of year)
    }

    def __init__(
        self,
        period: str = "daily",
        sensitivity: float = 2.0,
    ):
        """Initialize seasonal method.

        Args:
            period: Seasonality period (hourly, daily, weekly, monthly)
            sensitivity: Number of standard deviations for anomaly threshold
        """
        if period not in self.PERIODS:
            raise ValueError(f"Unknown period: {period}. Valid: {list(self.PERIODS.keys())}")

        self.period = period
        self.sensitivity = sensitivity
        self._num_buckets = self.PERIODS[period]

        # Learned parameters per bucket
        self._bucket_means: dict[int, float] = {}
        self._bucket_stddevs: dict[int, float] = {}
        self._bucket_counts: dict[int, int] = {}
        self._fitted = False

        # For non-timestamped fitting
        self._global_mean: float = 0.0
        self._global_stddev: float = 1.0

    @property
    def name(self) -> str:
        return f"seasonal_{self.period}"

    def fit(self, values: list[float]) -> None:
        """Fit without timestamps (falls back to global statistics).

        For proper seasonal detection, use fit_with_timestamps().

        Args:
            values: List of numeric values
        """
        clean = [v for v in values if v is not None and not math.isnan(v)]
        if not clean:
            return

        n = len(clean)
        self._global_mean = sum(clean) / n

        if n > 1:
            variance = sum((x - self._global_mean) ** 2 for x in clean) / (n - 1)
            self._global_stddev = math.sqrt(variance) if variance > 0 else 1.0

        self._fitted = True

    def fit_with_timestamps(
        self,
        data: list[tuple[Any, float]],
    ) -> None:
        """Fit with timestamps for seasonal pattern learning.

        Args:
            data: List of (timestamp, value) tuples.
                Timestamps can be datetime objects or timestamps.
        """
        from datetime import datetime

        # Group values by bucket
        buckets: dict[int, list[float]] = {i: [] for i in range(self._num_buckets)}

        for timestamp, value in data:
            if value is None or math.isnan(value):
                continue

            if isinstance(timestamp, (int, float)):
                timestamp = datetime.fromtimestamp(timestamp)

            bucket = self._get_bucket(timestamp)
            buckets[bucket].append(value)

        # Compute statistics per bucket
        for bucket, values in buckets.items():
            if values:
                n = len(values)
                mean = sum(values) / n
                self._bucket_means[bucket] = mean
                self._bucket_counts[bucket] = n

                if n > 1:
                    variance = sum((x - mean) ** 2 for x in values) / (n - 1)
                    self._bucket_stddevs[bucket] = math.sqrt(variance) if variance > 0 else 1.0
                else:
                    self._bucket_stddevs[bucket] = 1.0

        # Compute global stats as fallback
        all_values = [v for bucket_values in buckets.values() for v in bucket_values]
        if all_values:
            self._global_mean = sum(all_values) / len(all_values)
            if len(all_values) > 1:
                variance = sum((x - self._global_mean) ** 2 for x in all_values) / (len(all_values) - 1)
                self._global_stddev = math.sqrt(variance) if variance > 0 else 1.0

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a value without timestamp (uses global stats).

        For proper seasonal scoring, use score_with_timestamp().

        Args:
            value: Value to score

        Returns:
            AnomalyScore using global statistics
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "null_or_nan"}
            )

        deviation = abs((value - self._global_mean) / self._global_stddev) if self._global_stddev != 0 else 0.0
        is_anomaly = deviation > self.sensitivity

        return AnomalyScore(
            value=value,
            score=deviation,
            is_anomaly=is_anomaly,
            threshold=self.sensitivity,
            details={
                "global_mean": self._global_mean,
                "global_stddev": self._global_stddev,
                "note": "No timestamp provided - using global statistics",
            }
        )

    def score_with_timestamp(
        self,
        timestamp: Any,
        value: float,
    ) -> AnomalyScore:
        """Score a value with timestamp for seasonal comparison.

        Args:
            timestamp: Datetime or timestamp
            value: Value to score

        Returns:
            AnomalyScore considering seasonal patterns
        """
        from datetime import datetime

        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "null_or_nan"}
            )

        if isinstance(timestamp, (int, float)):
            timestamp = datetime.fromtimestamp(timestamp)

        bucket = self._get_bucket(timestamp)

        # Use bucket-specific stats if available, otherwise global
        if bucket in self._bucket_means and self._bucket_counts.get(bucket, 0) >= 3:
            mean = self._bucket_means[bucket]
            stddev = self._bucket_stddevs.get(bucket, 1.0)
            used_bucket = True
        else:
            mean = self._global_mean
            stddev = self._global_stddev
            used_bucket = False

        deviation = abs((value - mean) / stddev) if stddev != 0 else 0.0
        is_anomaly = deviation > self.sensitivity

        return AnomalyScore(
            value=value,
            score=deviation,
            is_anomaly=is_anomaly,
            threshold=self.sensitivity,
            details={
                "bucket": bucket,
                "period": self.period,
                "bucket_mean": mean,
                "bucket_stddev": stddev,
                "used_seasonal": used_bucket,
                "bucket_sample_count": self._bucket_counts.get(bucket, 0),
            }
        )

    def _get_bucket(self, timestamp) -> int:
        """Get bucket index for a timestamp."""
        if self.period == "hourly":
            return timestamp.hour
        elif self.period == "daily":
            return timestamp.weekday()
        elif self.period == "weekly":
            return timestamp.isocalendar()[1] - 1  # Week of year (0-indexed)
        elif self.period == "monthly":
            return timestamp.month - 1  # Month (0-indexed)
        return 0