duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/cli/main.py +257 -2
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +851 -1
- duckguard/core/dataset.py +1035 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/executor.py +642 -0
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +54 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-3.0.0.dist-info/METADATA +1072 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/__init__.py
CHANGED
duckguard/anomaly/__init__.py
CHANGED
|
@@ -9,6 +9,11 @@ Example:
|
|
|
9
9
|
anomalies = detector.detect(dataset, column="amount")
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
+
from duckguard.anomaly.baselines import (
|
|
13
|
+
BaselineStorage,
|
|
14
|
+
ColumnBaseline,
|
|
15
|
+
StoredBaseline,
|
|
16
|
+
)
|
|
12
17
|
from duckguard.anomaly.detector import (
|
|
13
18
|
AnomalyDetector,
|
|
14
19
|
AnomalyResult,
|
|
@@ -18,17 +23,40 @@ from duckguard.anomaly.detector import (
|
|
|
18
23
|
)
|
|
19
24
|
from duckguard.anomaly.methods import (
|
|
20
25
|
IQRMethod,
|
|
26
|
+
ModifiedZScoreMethod,
|
|
21
27
|
PercentChangeMethod,
|
|
22
28
|
ZScoreMethod,
|
|
29
|
+
create_method,
|
|
30
|
+
)
|
|
31
|
+
from duckguard.anomaly.ml_methods import (
|
|
32
|
+
BaselineComparison,
|
|
33
|
+
BaselineMethod,
|
|
34
|
+
DistributionComparison,
|
|
35
|
+
KSTestMethod,
|
|
36
|
+
SeasonalMethod,
|
|
23
37
|
)
|
|
24
38
|
|
|
25
39
|
__all__ = [
|
|
40
|
+
# Detector
|
|
26
41
|
"AnomalyDetector",
|
|
27
42
|
"AnomalyResult",
|
|
28
43
|
"AnomalyType",
|
|
29
44
|
"detect_anomalies",
|
|
30
45
|
"detect_column_anomalies",
|
|
46
|
+
# Standard methods
|
|
31
47
|
"ZScoreMethod",
|
|
32
48
|
"IQRMethod",
|
|
33
49
|
"PercentChangeMethod",
|
|
50
|
+
"ModifiedZScoreMethod",
|
|
51
|
+
"create_method",
|
|
52
|
+
# ML methods
|
|
53
|
+
"BaselineMethod",
|
|
54
|
+
"KSTestMethod",
|
|
55
|
+
"SeasonalMethod",
|
|
56
|
+
"BaselineComparison",
|
|
57
|
+
"DistributionComparison",
|
|
58
|
+
# Baselines
|
|
59
|
+
"BaselineStorage",
|
|
60
|
+
"StoredBaseline",
|
|
61
|
+
"ColumnBaseline",
|
|
34
62
|
]
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""Baseline storage for ML-based anomaly detection.
|
|
2
|
+
|
|
3
|
+
Provides functionality to store and retrieve learned baselines for
|
|
4
|
+
comparison-based anomaly detection.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from duckguard.history.schema import QUERIES
|
|
15
|
+
from duckguard.history.storage import HistoryStorage
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class StoredBaseline:
    """A single learned baseline persisted for later comparison.

    One record corresponds to one (source, column, metric) triple; the
    stored value may be a scalar or a JSON-decoded structure such as a
    distribution histogram.
    """

    source: str  # data source path the baseline was learned from
    column_name: str  # column the baseline describes
    metric: str  # metric name (mean, stddev, distribution, etc.)
    value: Any  # baseline value; can be a dict/list for distributions
    sample_size: int | None  # number of samples used to compute the baseline
    created_at: datetime  # when the baseline was first created
    updated_at: datetime | None  # when the baseline was last updated, if ever
+
|
|
40
|
+
|
|
41
|
+
class BaselineStorage:
    """Store and retrieve learned baselines for anomaly detection.

    Usage:
        from duckguard.anomaly.baselines import BaselineStorage
        from duckguard.history import HistoryStorage

        storage = BaselineStorage()

        # Store a baseline
        storage.store("data.csv", "amount", "mean", 150.5, sample_size=1000)

        # Get a baseline
        baseline = storage.get("data.csv", "amount", "mean")
        if baseline:
            print(f"Baseline mean: {baseline.value}")

        # Update with rolling average
        storage.update("data.csv", "amount", "mean", 155.2,
                       sample_size=100, method="rolling")
    """

    def __init__(self, storage: HistoryStorage | None = None):
        """Initialize baseline storage.

        Args:
            storage: Optional HistoryStorage instance. Uses default if not provided.
        """
        self._storage = storage or HistoryStorage()

    @property
    def storage(self) -> HistoryStorage:
        """Get the underlying storage."""
        return self._storage

    def store(
        self,
        source: str,
        column_name: str,
        metric: str,
        value: Any,
        *,
        sample_size: int | None = None,
    ) -> None:
        """Store or update a baseline.

        Args:
            source: Data source path
            column_name: Column name
            metric: Metric name (mean, stddev, min, max, distribution, etc.)
            value: Baseline value (will be JSON serialized if complex)
            sample_size: Number of samples used to compute the baseline
        """
        conn = self._storage._get_connection()
        now = datetime.now().isoformat()

        # Every value goes through JSON so scalars and complex structures
        # (dicts/lists) round-trip uniformly via json.loads in _row_to_baseline.
        # (The original code had an if/else on isinstance(value, (dict, list))
        # whose branches were identical - collapsed here.)
        serialized_value = json.dumps(value)

        conn.execute(
            QUERIES["upsert_baseline"],
            (
                source,
                column_name,
                metric,
                serialized_value,
                sample_size,
                now,
                now,
            ),
        )
        conn.commit()

    def get(
        self,
        source: str,
        column_name: str,
        metric: str,
    ) -> StoredBaseline | None:
        """Get a specific baseline.

        Args:
            source: Data source path
            column_name: Column name
            metric: Metric name

        Returns:
            StoredBaseline or None if not found
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(
            QUERIES["get_baseline"],
            (source, column_name, metric),
        )
        row = cursor.fetchone()

        if not row:
            return None

        return self._row_to_baseline(row)

    def get_all(self, source: str) -> list[StoredBaseline]:
        """Get all baselines for a source.

        Args:
            source: Data source path

        Returns:
            List of StoredBaseline objects
        """
        conn = self._storage._get_connection()
        cursor = conn.execute(
            QUERIES["get_baselines_for_source"],
            (source,),
        )

        return [self._row_to_baseline(row) for row in cursor.fetchall()]

    def update(
        self,
        source: str,
        column_name: str,
        metric: str,
        new_value: Any,
        *,
        sample_size: int | None = None,
        method: str = "replace",
    ) -> None:
        """Update an existing baseline.

        Args:
            source: Data source path
            column_name: Column name
            metric: Metric name
            new_value: New value
            sample_size: Number of samples in new data
            method: Update method - "replace" or "rolling"

        Raises:
            ValueError: If method is not "replace" or "rolling".
        """
        if method == "replace":
            self.store(source, column_name, metric, new_value, sample_size=sample_size)
        elif method == "rolling":
            # Blend with the existing baseline when it is numeric; otherwise
            # fall back to a plain replace (e.g. first write, or non-scalar value).
            existing = self.get(source, column_name, metric)
            if existing and isinstance(existing.value, (int, float)):
                # Exponentially-weighted rolling average: favor history.
                old_weight = 0.7  # Give more weight to historical
                new_weight = 0.3
                blended = old_weight * existing.value + new_weight * new_value
                total_samples = (existing.sample_size or 0) + (sample_size or 0)
                self.store(source, column_name, metric, blended, sample_size=total_samples)
            else:
                self.store(source, column_name, metric, new_value, sample_size=sample_size)
        else:
            raise ValueError(f"Unknown update method: {method}")

    def delete(self, source: str) -> int:
        """Delete all baselines for a source.

        Args:
            source: Data source path

        Returns:
            Number of baselines deleted
        """
        conn = self._storage._get_connection()

        # Count first: DB-API rowcount after DELETE is driver-dependent.
        cursor = conn.execute(
            "SELECT COUNT(*) FROM baselines WHERE source = ?",
            (source,),
        )
        count = cursor.fetchone()[0]

        conn.execute(QUERIES["delete_baselines_for_source"], (source,))
        conn.commit()

        return count

    def _row_to_baseline(self, row) -> StoredBaseline:
        """Convert a database row (mapping-style access) to StoredBaseline."""
        # Values are stored JSON-encoded by store(); decode symmetrically.
        value = json.loads(row["baseline_value"])

        return StoredBaseline(
            source=row["source"],
            column_name=row["column_name"],
            metric=row["metric"],
            value=value,
            sample_size=row["sample_size"],
            created_at=datetime.fromisoformat(row["created_at"]),
            updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
        )
236
|
+
|
|
237
|
+
@dataclass
class ColumnBaseline:
    """Complete baseline snapshot for a single column.

    Bundles the summary statistics used for baseline comparison; every
    field except ``column_name`` is optional and defaults to ``None``.

    Attributes:
        column_name: Column name
        mean: Mean value
        stddev: Standard deviation
        min: Minimum value
        max: Maximum value
        median: Median value
        null_percent: Percentage of nulls
        unique_percent: Percentage of unique values
        sample_size: Number of samples
        distribution: Optional distribution histogram
    """

    column_name: str
    mean: float | None = None
    stddev: float | None = None
    min: float | None = None
    max: float | None = None
    median: float | None = None
    null_percent: float | None = None
    unique_percent: float | None = None
    sample_size: int | None = None
    distribution: dict[str, Any] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize this baseline to a plain dictionary."""
        return dict(
            column_name=self.column_name,
            mean=self.mean,
            stddev=self.stddev,
            min=self.min,
            max=self.max,
            median=self.median,
            null_percent=self.null_percent,
            unique_percent=self.unique_percent,
            sample_size=self.sample_size,
            distribution=self.distribution,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ColumnBaseline:
        """Build a ColumnBaseline from a dictionary produced by ``to_dict``.

        Missing optional keys default to ``None``; ``column_name`` is required.
        """
        optional_fields = (
            "mean",
            "stddev",
            "min",
            "max",
            "median",
            "null_percent",
            "unique_percent",
            "sample_size",
            "distribution",
        )
        return cls(
            column_name=data["column_name"],
            **{name: data.get(name) for name in optional_fields},
        )
duckguard/anomaly/methods.py
CHANGED
|
@@ -407,12 +407,22 @@ def create_method(
|
|
|
407
407
|
"""Create an anomaly detection method by name.
|
|
408
408
|
|
|
409
409
|
Args:
|
|
410
|
-
method_name: Name of the method
|
|
410
|
+
method_name: Name of the method. Options:
|
|
411
|
+
- "zscore", "z_score": Z-Score method
|
|
412
|
+
- "iqr": Interquartile Range method
|
|
413
|
+
- "percent_change", "pct_change": Percent change method
|
|
414
|
+
- "modified_zscore", "mad": Modified Z-Score (MAD) method
|
|
415
|
+
- "baseline": ML-based baseline comparison
|
|
416
|
+
- "ks_test": Kolmogorov-Smirnov distribution test
|
|
417
|
+
- "seasonal": Seasonal pattern detection
|
|
411
418
|
**kwargs: Method-specific parameters
|
|
412
419
|
|
|
413
420
|
Returns:
|
|
414
421
|
Configured AnomalyMethod
|
|
415
422
|
"""
|
|
423
|
+
# Import ML methods lazily to avoid circular imports
|
|
424
|
+
from duckguard.anomaly.ml_methods import BaselineMethod, KSTestMethod, SeasonalMethod
|
|
425
|
+
|
|
416
426
|
methods = {
|
|
417
427
|
"zscore": ZScoreMethod,
|
|
418
428
|
"z_score": ZScoreMethod,
|
|
@@ -421,10 +431,14 @@ def create_method(
|
|
|
421
431
|
"pct_change": PercentChangeMethod,
|
|
422
432
|
"modified_zscore": ModifiedZScoreMethod,
|
|
423
433
|
"mad": ModifiedZScoreMethod,
|
|
434
|
+
"baseline": BaselineMethod,
|
|
435
|
+
"ks_test": KSTestMethod,
|
|
436
|
+
"ks": KSTestMethod,
|
|
437
|
+
"seasonal": SeasonalMethod,
|
|
424
438
|
}
|
|
425
439
|
|
|
426
440
|
method_class = methods.get(method_name.lower())
|
|
427
441
|
if not method_class:
|
|
428
|
-
raise ValueError(f"Unknown anomaly method: {method_name}")
|
|
442
|
+
raise ValueError(f"Unknown anomaly method: {method_name}. Available: {list(methods.keys())}")
|
|
429
443
|
|
|
430
444
|
return method_class(**kwargs)
|