duckguard 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckguard/__init__.py CHANGED
@@ -85,7 +85,7 @@ from duckguard.semantic import (
85
85
  detect_types_for_dataset,
86
86
  )
87
87
 
88
- __version__ = "2.2.0"
88
+ __version__ = "2.3.0"
89
89
 
90
90
  __all__ = [
91
91
  # Core classes
@@ -9,6 +9,11 @@ Example:
9
9
  anomalies = detector.detect(dataset, column="amount")
10
10
  """
11
11
 
12
+ from duckguard.anomaly.baselines import (
13
+ BaselineStorage,
14
+ ColumnBaseline,
15
+ StoredBaseline,
16
+ )
12
17
  from duckguard.anomaly.detector import (
13
18
  AnomalyDetector,
14
19
  AnomalyResult,
@@ -18,17 +23,40 @@ from duckguard.anomaly.detector import (
18
23
  )
19
24
  from duckguard.anomaly.methods import (
20
25
  IQRMethod,
26
+ ModifiedZScoreMethod,
21
27
  PercentChangeMethod,
22
28
  ZScoreMethod,
29
+ create_method,
30
+ )
31
+ from duckguard.anomaly.ml_methods import (
32
+ BaselineComparison,
33
+ BaselineMethod,
34
+ DistributionComparison,
35
+ KSTestMethod,
36
+ SeasonalMethod,
23
37
  )
24
38
 
25
39
  __all__ = [
40
+ # Detector
26
41
  "AnomalyDetector",
27
42
  "AnomalyResult",
28
43
  "AnomalyType",
29
44
  "detect_anomalies",
30
45
  "detect_column_anomalies",
46
+ # Standard methods
31
47
  "ZScoreMethod",
32
48
  "IQRMethod",
33
49
  "PercentChangeMethod",
50
+ "ModifiedZScoreMethod",
51
+ "create_method",
52
+ # ML methods
53
+ "BaselineMethod",
54
+ "KSTestMethod",
55
+ "SeasonalMethod",
56
+ "BaselineComparison",
57
+ "DistributionComparison",
58
+ # Baselines
59
+ "BaselineStorage",
60
+ "StoredBaseline",
61
+ "ColumnBaseline",
34
62
  ]
@@ -0,0 +1,294 @@
1
+ """Baseline storage for ML-based anomaly detection.
2
+
3
+ Provides functionality to store and retrieve learned baselines for
4
+ comparison-based anomaly detection.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass
11
+ from datetime import datetime
12
+ from typing import Any
13
+
14
+ from duckguard.history.schema import QUERIES
15
+ from duckguard.history.storage import HistoryStorage
16
+
17
+
18
+ @dataclass
19
+ class StoredBaseline:
20
+ """Represents a stored baseline.
21
+
22
+ Attributes:
23
+ source: Data source path
24
+ column_name: Column name
25
+ metric: Metric name (mean, stddev, distribution, etc.)
26
+ value: Baseline value (can be complex for distributions)
27
+ sample_size: Number of samples used to compute baseline
28
+ created_at: When baseline was first created
29
+ updated_at: When baseline was last updated
30
+ """
31
+
32
+ source: str
33
+ column_name: str
34
+ metric: str
35
+ value: Any
36
+ sample_size: int | None
37
+ created_at: datetime
38
+ updated_at: datetime | None
39
+
40
+
41
+ class BaselineStorage:
42
+ """Store and retrieve learned baselines for anomaly detection.
43
+
44
+ Usage:
45
+ from duckguard.anomaly.baselines import BaselineStorage
46
+ from duckguard.history import HistoryStorage
47
+
48
+ storage = BaselineStorage()
49
+
50
+ # Store a baseline
51
+ storage.store("data.csv", "amount", "mean", 150.5, sample_size=1000)
52
+
53
+ # Get a baseline
54
+ baseline = storage.get("data.csv", "amount", "mean")
55
+ if baseline:
56
+ print(f"Baseline mean: {baseline.value}")
57
+
58
+ # Update with rolling average
59
+ storage.update("data.csv", "amount", "mean", 155.2,
60
+ sample_size=100, method="rolling")
61
+ """
62
+
63
+ def __init__(self, storage: HistoryStorage | None = None):
64
+ """Initialize baseline storage.
65
+
66
+ Args:
67
+ storage: Optional HistoryStorage instance. Uses default if not provided.
68
+ """
69
+ self._storage = storage or HistoryStorage()
70
+
71
+ @property
72
+ def storage(self) -> HistoryStorage:
73
+ """Get the underlying storage."""
74
+ return self._storage
75
+
76
+ def store(
77
+ self,
78
+ source: str,
79
+ column_name: str,
80
+ metric: str,
81
+ value: Any,
82
+ *,
83
+ sample_size: int | None = None,
84
+ ) -> None:
85
+ """Store or update a baseline.
86
+
87
+ Args:
88
+ source: Data source path
89
+ column_name: Column name
90
+ metric: Metric name (mean, stddev, min, max, distribution, etc.)
91
+ value: Baseline value (will be JSON serialized if complex)
92
+ sample_size: Number of samples used to compute the baseline
93
+ """
94
+ conn = self._storage._get_connection()
95
+ now = datetime.now().isoformat()
96
+
97
+ # Serialize complex values to JSON
98
+ if isinstance(value, (dict, list)):
99
+ serialized_value = json.dumps(value)
100
+ else:
101
+ serialized_value = json.dumps(value)
102
+
103
+ conn.execute(
104
+ QUERIES["upsert_baseline"],
105
+ (
106
+ source,
107
+ column_name,
108
+ metric,
109
+ serialized_value,
110
+ sample_size,
111
+ now,
112
+ now,
113
+ ),
114
+ )
115
+ conn.commit()
116
+
117
+ def get(
118
+ self,
119
+ source: str,
120
+ column_name: str,
121
+ metric: str,
122
+ ) -> StoredBaseline | None:
123
+ """Get a specific baseline.
124
+
125
+ Args:
126
+ source: Data source path
127
+ column_name: Column name
128
+ metric: Metric name
129
+
130
+ Returns:
131
+ StoredBaseline or None if not found
132
+ """
133
+ conn = self._storage._get_connection()
134
+ cursor = conn.execute(
135
+ QUERIES["get_baseline"],
136
+ (source, column_name, metric),
137
+ )
138
+ row = cursor.fetchone()
139
+
140
+ if not row:
141
+ return None
142
+
143
+ return self._row_to_baseline(row)
144
+
145
+ def get_all(self, source: str) -> list[StoredBaseline]:
146
+ """Get all baselines for a source.
147
+
148
+ Args:
149
+ source: Data source path
150
+
151
+ Returns:
152
+ List of StoredBaseline objects
153
+ """
154
+ conn = self._storage._get_connection()
155
+ cursor = conn.execute(
156
+ QUERIES["get_baselines_for_source"],
157
+ (source,),
158
+ )
159
+
160
+ return [self._row_to_baseline(row) for row in cursor.fetchall()]
161
+
162
+ def update(
163
+ self,
164
+ source: str,
165
+ column_name: str,
166
+ metric: str,
167
+ new_value: Any,
168
+ *,
169
+ sample_size: int | None = None,
170
+ method: str = "replace",
171
+ ) -> None:
172
+ """Update an existing baseline.
173
+
174
+ Args:
175
+ source: Data source path
176
+ column_name: Column name
177
+ metric: Metric name
178
+ new_value: New value
179
+ sample_size: Number of samples in new data
180
+ method: Update method - "replace" or "rolling"
181
+ """
182
+ if method == "replace":
183
+ self.store(source, column_name, metric, new_value, sample_size=sample_size)
184
+ elif method == "rolling":
185
+ # Get existing baseline
186
+ existing = self.get(source, column_name, metric)
187
+ if existing and isinstance(existing.value, (int, float)):
188
+ # Rolling average
189
+ old_weight = 0.7 # Give more weight to historical
190
+ new_weight = 0.3
191
+ blended = old_weight * existing.value + new_weight * new_value
192
+ total_samples = (existing.sample_size or 0) + (sample_size or 0)
193
+ self.store(source, column_name, metric, blended, sample_size=total_samples)
194
+ else:
195
+ self.store(source, column_name, metric, new_value, sample_size=sample_size)
196
+ else:
197
+ raise ValueError(f"Unknown update method: {method}")
198
+
199
+ def delete(self, source: str) -> int:
200
+ """Delete all baselines for a source.
201
+
202
+ Args:
203
+ source: Data source path
204
+
205
+ Returns:
206
+ Number of baselines deleted
207
+ """
208
+ conn = self._storage._get_connection()
209
+
210
+ # Get count first
211
+ cursor = conn.execute(
212
+ "SELECT COUNT(*) FROM baselines WHERE source = ?",
213
+ (source,),
214
+ )
215
+ count = cursor.fetchone()[0]
216
+
217
+ conn.execute(QUERIES["delete_baselines_for_source"], (source,))
218
+ conn.commit()
219
+
220
+ return count
221
+
222
+ def _row_to_baseline(self, row) -> StoredBaseline:
223
+ """Convert database row to StoredBaseline."""
224
+ value = json.loads(row["baseline_value"])
225
+
226
+ return StoredBaseline(
227
+ source=row["source"],
228
+ column_name=row["column_name"],
229
+ metric=row["metric"],
230
+ value=value,
231
+ sample_size=row["sample_size"],
232
+ created_at=datetime.fromisoformat(row["created_at"]),
233
+ updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
234
+ )
235
+
236
+
237
+ @dataclass
238
+ class ColumnBaseline:
239
+ """Complete baseline for a single column.
240
+
241
+ Attributes:
242
+ column_name: Column name
243
+ mean: Mean value
244
+ stddev: Standard deviation
245
+ min: Minimum value
246
+ max: Maximum value
247
+ median: Median value
248
+ null_percent: Percentage of nulls
249
+ unique_percent: Percentage of unique values
250
+ sample_size: Number of samples
251
+ distribution: Optional distribution histogram
252
+ """
253
+
254
+ column_name: str
255
+ mean: float | None = None
256
+ stddev: float | None = None
257
+ min: float | None = None
258
+ max: float | None = None
259
+ median: float | None = None
260
+ null_percent: float | None = None
261
+ unique_percent: float | None = None
262
+ sample_size: int | None = None
263
+ distribution: dict[str, Any] | None = None
264
+
265
+ def to_dict(self) -> dict[str, Any]:
266
+ """Convert to dictionary."""
267
+ return {
268
+ "column_name": self.column_name,
269
+ "mean": self.mean,
270
+ "stddev": self.stddev,
271
+ "min": self.min,
272
+ "max": self.max,
273
+ "median": self.median,
274
+ "null_percent": self.null_percent,
275
+ "unique_percent": self.unique_percent,
276
+ "sample_size": self.sample_size,
277
+ "distribution": self.distribution,
278
+ }
279
+
280
+ @classmethod
281
+ def from_dict(cls, data: dict[str, Any]) -> ColumnBaseline:
282
+ """Create from dictionary."""
283
+ return cls(
284
+ column_name=data["column_name"],
285
+ mean=data.get("mean"),
286
+ stddev=data.get("stddev"),
287
+ min=data.get("min"),
288
+ max=data.get("max"),
289
+ median=data.get("median"),
290
+ null_percent=data.get("null_percent"),
291
+ unique_percent=data.get("unique_percent"),
292
+ sample_size=data.get("sample_size"),
293
+ distribution=data.get("distribution"),
294
+ )
@@ -407,12 +407,22 @@ def create_method(
407
407
  """Create an anomaly detection method by name.
408
408
 
409
409
  Args:
410
- method_name: Name of the method
410
+ method_name: Name of the method. Options:
411
+ - "zscore", "z_score": Z-Score method
412
+ - "iqr": Interquartile Range method
413
+ - "percent_change", "pct_change": Percent change method
414
+ - "modified_zscore", "mad": Modified Z-Score (MAD) method
415
+ - "baseline": ML-based baseline comparison
416
+ - "ks_test": Kolmogorov-Smirnov distribution test
417
+ - "seasonal": Seasonal pattern detection
411
418
  **kwargs: Method-specific parameters
412
419
 
413
420
  Returns:
414
421
  Configured AnomalyMethod
415
422
  """
423
+ # Import ML methods lazily to avoid circular imports
424
+ from duckguard.anomaly.ml_methods import BaselineMethod, KSTestMethod, SeasonalMethod
425
+
416
426
  methods = {
417
427
  "zscore": ZScoreMethod,
418
428
  "z_score": ZScoreMethod,
@@ -421,10 +431,14 @@ def create_method(
421
431
  "pct_change": PercentChangeMethod,
422
432
  "modified_zscore": ModifiedZScoreMethod,
423
433
  "mad": ModifiedZScoreMethod,
434
+ "baseline": BaselineMethod,
435
+ "ks_test": KSTestMethod,
436
+ "ks": KSTestMethod,
437
+ "seasonal": SeasonalMethod,
424
438
  }
425
439
 
426
440
  method_class = methods.get(method_name.lower())
427
441
  if not method_class:
428
- raise ValueError(f"Unknown anomaly method: {method_name}")
442
+ raise ValueError(f"Unknown anomaly method: {method_name}. Available: {list(methods.keys())}")
429
443
 
430
444
  return method_class(**kwargs)