duckguard-2.0.0-py3-none-any.whl → duckguard-2.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/__init__.py
CHANGED
|
@@ -3,7 +3,7 @@ DuckGuard - Data quality that just works.
|
|
|
3
3
|
|
|
4
4
|
A Python-native data quality tool built on DuckDB for speed.
|
|
5
5
|
Features YAML-based rules, semantic type detection, data contracts,
|
|
6
|
-
and
|
|
6
|
+
anomaly detection, notifications, and dbt integration.
|
|
7
7
|
|
|
8
8
|
Quick Start:
|
|
9
9
|
# Python API
|
|
@@ -12,61 +12,80 @@ Quick Start:
|
|
|
12
12
|
assert orders.row_count > 0
|
|
13
13
|
assert orders.customer_id.null_percent == 0
|
|
14
14
|
|
|
15
|
+
# With row-level error capture
|
|
16
|
+
result = orders.quantity.between(1, 100)
|
|
17
|
+
if not result:
|
|
18
|
+
print(result.summary()) # See which rows failed
|
|
19
|
+
|
|
20
|
+
# Notifications
|
|
21
|
+
from duckguard.notifications import SlackNotifier
|
|
22
|
+
slack = SlackNotifier(webhook_url="...")
|
|
23
|
+
slack.send_failure_alert(result)
|
|
24
|
+
|
|
15
25
|
# CLI
|
|
16
26
|
$ duckguard check data.csv
|
|
17
27
|
$ duckguard discover data.csv --output duckguard.yaml
|
|
18
28
|
$ duckguard contract generate data.csv
|
|
19
29
|
|
|
20
|
-
Documentation: https://github.com/
|
|
30
|
+
Documentation: https://github.com/XDataHubAI/duckguard
|
|
21
31
|
"""
|
|
22
32
|
|
|
23
33
|
# Core classes
|
|
24
|
-
|
|
34
|
+
# Anomaly detection
|
|
35
|
+
from duckguard.anomaly import (
|
|
36
|
+
AnomalyDetector,
|
|
37
|
+
AnomalyResult,
|
|
38
|
+
detect_anomalies,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Connectors
|
|
42
|
+
from duckguard.connectors import connect
|
|
43
|
+
|
|
44
|
+
# Data contracts
|
|
45
|
+
from duckguard.contracts import (
|
|
46
|
+
DataContract,
|
|
47
|
+
diff_contracts,
|
|
48
|
+
generate_contract,
|
|
49
|
+
load_contract,
|
|
50
|
+
validate_contract,
|
|
51
|
+
)
|
|
25
52
|
from duckguard.core.column import Column
|
|
53
|
+
from duckguard.core.dataset import Dataset
|
|
26
54
|
from duckguard.core.engine import DuckGuardEngine
|
|
27
|
-
from duckguard.core.result import
|
|
55
|
+
from duckguard.core.result import CheckResult, FailedRow, ValidationResult
|
|
28
56
|
from duckguard.core.scoring import QualityScore, QualityScorer, score
|
|
29
57
|
|
|
30
|
-
#
|
|
31
|
-
from duckguard.
|
|
58
|
+
# Error classes
|
|
59
|
+
from duckguard.errors import (
|
|
60
|
+
ColumnNotFoundError,
|
|
61
|
+
ContractViolationError,
|
|
62
|
+
DuckGuardError,
|
|
63
|
+
RuleParseError,
|
|
64
|
+
UnsupportedConnectorError,
|
|
65
|
+
ValidationError,
|
|
66
|
+
)
|
|
32
67
|
|
|
33
68
|
# Profiling
|
|
34
|
-
from duckguard.profiler import
|
|
69
|
+
from duckguard.profiler import AutoProfiler, profile
|
|
35
70
|
|
|
36
71
|
# Rules (YAML-based)
|
|
37
72
|
from duckguard.rules import (
|
|
38
|
-
|
|
39
|
-
load_rules_from_string,
|
|
73
|
+
RuleSet,
|
|
40
74
|
execute_rules,
|
|
41
75
|
generate_rules,
|
|
42
|
-
|
|
76
|
+
load_rules,
|
|
77
|
+
load_rules_from_string,
|
|
43
78
|
)
|
|
44
79
|
|
|
45
80
|
# Semantic type detection
|
|
46
81
|
from duckguard.semantic import (
|
|
47
|
-
SemanticType,
|
|
48
82
|
SemanticAnalyzer,
|
|
83
|
+
SemanticType,
|
|
49
84
|
detect_type,
|
|
50
85
|
detect_types_for_dataset,
|
|
51
86
|
)
|
|
52
87
|
|
|
53
|
-
|
|
54
|
-
from duckguard.contracts import (
|
|
55
|
-
DataContract,
|
|
56
|
-
load_contract,
|
|
57
|
-
validate_contract,
|
|
58
|
-
generate_contract,
|
|
59
|
-
diff_contracts,
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
# Anomaly detection
|
|
63
|
-
from duckguard.anomaly import (
|
|
64
|
-
AnomalyDetector,
|
|
65
|
-
AnomalyResult,
|
|
66
|
-
detect_anomalies,
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
__version__ = "2.0.0"
|
|
88
|
+
__version__ = "2.3.0"
|
|
70
89
|
|
|
71
90
|
__all__ = [
|
|
72
91
|
# Core classes
|
|
@@ -75,6 +94,7 @@ __all__ = [
|
|
|
75
94
|
"DuckGuardEngine",
|
|
76
95
|
"ValidationResult",
|
|
77
96
|
"CheckResult",
|
|
97
|
+
"FailedRow",
|
|
78
98
|
# Scoring
|
|
79
99
|
"QualityScore",
|
|
80
100
|
"QualityScorer",
|
|
@@ -105,6 +125,13 @@ __all__ = [
|
|
|
105
125
|
"AnomalyDetector",
|
|
106
126
|
"AnomalyResult",
|
|
107
127
|
"detect_anomalies",
|
|
128
|
+
# Errors
|
|
129
|
+
"DuckGuardError",
|
|
130
|
+
"ColumnNotFoundError",
|
|
131
|
+
"ContractViolationError",
|
|
132
|
+
"RuleParseError",
|
|
133
|
+
"UnsupportedConnectorError",
|
|
134
|
+
"ValidationError",
|
|
108
135
|
# Version
|
|
109
136
|
"__version__",
|
|
110
137
|
]
|
duckguard/anomaly/__init__.py
CHANGED
|
@@ -9,6 +9,11 @@ Example:
|
|
|
9
9
|
anomalies = detector.detect(dataset, column="amount")
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
+
from duckguard.anomaly.baselines import (
|
|
13
|
+
BaselineStorage,
|
|
14
|
+
ColumnBaseline,
|
|
15
|
+
StoredBaseline,
|
|
16
|
+
)
|
|
12
17
|
from duckguard.anomaly.detector import (
|
|
13
18
|
AnomalyDetector,
|
|
14
19
|
AnomalyResult,
|
|
@@ -17,18 +22,41 @@ from duckguard.anomaly.detector import (
|
|
|
17
22
|
detect_column_anomalies,
|
|
18
23
|
)
|
|
19
24
|
from duckguard.anomaly.methods import (
|
|
20
|
-
ZScoreMethod,
|
|
21
25
|
IQRMethod,
|
|
26
|
+
ModifiedZScoreMethod,
|
|
22
27
|
PercentChangeMethod,
|
|
28
|
+
ZScoreMethod,
|
|
29
|
+
create_method,
|
|
30
|
+
)
|
|
31
|
+
from duckguard.anomaly.ml_methods import (
|
|
32
|
+
BaselineComparison,
|
|
33
|
+
BaselineMethod,
|
|
34
|
+
DistributionComparison,
|
|
35
|
+
KSTestMethod,
|
|
36
|
+
SeasonalMethod,
|
|
23
37
|
)
|
|
24
38
|
|
|
25
39
|
__all__ = [
|
|
40
|
+
# Detector
|
|
26
41
|
"AnomalyDetector",
|
|
27
42
|
"AnomalyResult",
|
|
28
43
|
"AnomalyType",
|
|
29
44
|
"detect_anomalies",
|
|
30
45
|
"detect_column_anomalies",
|
|
46
|
+
# Standard methods
|
|
31
47
|
"ZScoreMethod",
|
|
32
48
|
"IQRMethod",
|
|
33
49
|
"PercentChangeMethod",
|
|
50
|
+
"ModifiedZScoreMethod",
|
|
51
|
+
"create_method",
|
|
52
|
+
# ML methods
|
|
53
|
+
"BaselineMethod",
|
|
54
|
+
"KSTestMethod",
|
|
55
|
+
"SeasonalMethod",
|
|
56
|
+
"BaselineComparison",
|
|
57
|
+
"DistributionComparison",
|
|
58
|
+
# Baselines
|
|
59
|
+
"BaselineStorage",
|
|
60
|
+
"StoredBaseline",
|
|
61
|
+
"ColumnBaseline",
|
|
34
62
|
]
|
|
duckguard/anomaly/baselines.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""Baseline storage for ML-based anomaly detection.
|
|
2
|
+
|
|
3
|
+
Provides functionality to store and retrieve learned baselines for
|
|
4
|
+
comparison-based anomaly detection.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from duckguard.history.schema import QUERIES
|
|
15
|
+
from duckguard.history.storage import HistoryStorage
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
|
|
19
|
+
class StoredBaseline:
|
|
20
|
+
"""Represents a stored baseline.
|
|
21
|
+
|
|
22
|
+
Attributes:
|
|
23
|
+
source: Data source path
|
|
24
|
+
column_name: Column name
|
|
25
|
+
metric: Metric name (mean, stddev, distribution, etc.)
|
|
26
|
+
value: Baseline value (can be complex for distributions)
|
|
27
|
+
sample_size: Number of samples used to compute baseline
|
|
28
|
+
created_at: When baseline was first created
|
|
29
|
+
updated_at: When baseline was last updated
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
source: str
|
|
33
|
+
column_name: str
|
|
34
|
+
metric: str
|
|
35
|
+
value: Any
|
|
36
|
+
sample_size: int | None
|
|
37
|
+
created_at: datetime
|
|
38
|
+
updated_at: datetime | None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class BaselineStorage:
|
|
42
|
+
"""Store and retrieve learned baselines for anomaly detection.
|
|
43
|
+
|
|
44
|
+
Usage:
|
|
45
|
+
from duckguard.anomaly.baselines import BaselineStorage
|
|
46
|
+
from duckguard.history import HistoryStorage
|
|
47
|
+
|
|
48
|
+
storage = BaselineStorage()
|
|
49
|
+
|
|
50
|
+
# Store a baseline
|
|
51
|
+
storage.store("data.csv", "amount", "mean", 150.5, sample_size=1000)
|
|
52
|
+
|
|
53
|
+
# Get a baseline
|
|
54
|
+
baseline = storage.get("data.csv", "amount", "mean")
|
|
55
|
+
if baseline:
|
|
56
|
+
print(f"Baseline mean: {baseline.value}")
|
|
57
|
+
|
|
58
|
+
# Update with rolling average
|
|
59
|
+
storage.update("data.csv", "amount", "mean", 155.2,
|
|
60
|
+
sample_size=100, method="rolling")
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
def __init__(self, storage: HistoryStorage | None = None):
|
|
64
|
+
"""Initialize baseline storage.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
storage: Optional HistoryStorage instance. Uses default if not provided.
|
|
68
|
+
"""
|
|
69
|
+
self._storage = storage or HistoryStorage()
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def storage(self) -> HistoryStorage:
|
|
73
|
+
"""Get the underlying storage."""
|
|
74
|
+
return self._storage
|
|
75
|
+
|
|
76
|
+
def store(
|
|
77
|
+
self,
|
|
78
|
+
source: str,
|
|
79
|
+
column_name: str,
|
|
80
|
+
metric: str,
|
|
81
|
+
value: Any,
|
|
82
|
+
*,
|
|
83
|
+
sample_size: int | None = None,
|
|
84
|
+
) -> None:
|
|
85
|
+
"""Store or update a baseline.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
source: Data source path
|
|
89
|
+
column_name: Column name
|
|
90
|
+
metric: Metric name (mean, stddev, min, max, distribution, etc.)
|
|
91
|
+
value: Baseline value (will be JSON serialized if complex)
|
|
92
|
+
sample_size: Number of samples used to compute the baseline
|
|
93
|
+
"""
|
|
94
|
+
conn = self._storage._get_connection()
|
|
95
|
+
now = datetime.now().isoformat()
|
|
96
|
+
|
|
97
|
+
# Serialize complex values to JSON
|
|
98
|
+
if isinstance(value, (dict, list)):
|
|
99
|
+
serialized_value = json.dumps(value)
|
|
100
|
+
else:
|
|
101
|
+
serialized_value = json.dumps(value)
|
|
102
|
+
|
|
103
|
+
conn.execute(
|
|
104
|
+
QUERIES["upsert_baseline"],
|
|
105
|
+
(
|
|
106
|
+
source,
|
|
107
|
+
column_name,
|
|
108
|
+
metric,
|
|
109
|
+
serialized_value,
|
|
110
|
+
sample_size,
|
|
111
|
+
now,
|
|
112
|
+
now,
|
|
113
|
+
),
|
|
114
|
+
)
|
|
115
|
+
conn.commit()
|
|
116
|
+
|
|
117
|
+
def get(
|
|
118
|
+
self,
|
|
119
|
+
source: str,
|
|
120
|
+
column_name: str,
|
|
121
|
+
metric: str,
|
|
122
|
+
) -> StoredBaseline | None:
|
|
123
|
+
"""Get a specific baseline.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
source: Data source path
|
|
127
|
+
column_name: Column name
|
|
128
|
+
metric: Metric name
|
|
129
|
+
|
|
130
|
+
Returns:
|
|
131
|
+
StoredBaseline or None if not found
|
|
132
|
+
"""
|
|
133
|
+
conn = self._storage._get_connection()
|
|
134
|
+
cursor = conn.execute(
|
|
135
|
+
QUERIES["get_baseline"],
|
|
136
|
+
(source, column_name, metric),
|
|
137
|
+
)
|
|
138
|
+
row = cursor.fetchone()
|
|
139
|
+
|
|
140
|
+
if not row:
|
|
141
|
+
return None
|
|
142
|
+
|
|
143
|
+
return self._row_to_baseline(row)
|
|
144
|
+
|
|
145
|
+
def get_all(self, source: str) -> list[StoredBaseline]:
|
|
146
|
+
"""Get all baselines for a source.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
source: Data source path
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
List of StoredBaseline objects
|
|
153
|
+
"""
|
|
154
|
+
conn = self._storage._get_connection()
|
|
155
|
+
cursor = conn.execute(
|
|
156
|
+
QUERIES["get_baselines_for_source"],
|
|
157
|
+
(source,),
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
return [self._row_to_baseline(row) for row in cursor.fetchall()]
|
|
161
|
+
|
|
162
|
+
def update(
|
|
163
|
+
self,
|
|
164
|
+
source: str,
|
|
165
|
+
column_name: str,
|
|
166
|
+
metric: str,
|
|
167
|
+
new_value: Any,
|
|
168
|
+
*,
|
|
169
|
+
sample_size: int | None = None,
|
|
170
|
+
method: str = "replace",
|
|
171
|
+
) -> None:
|
|
172
|
+
"""Update an existing baseline.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
source: Data source path
|
|
176
|
+
column_name: Column name
|
|
177
|
+
metric: Metric name
|
|
178
|
+
new_value: New value
|
|
179
|
+
sample_size: Number of samples in new data
|
|
180
|
+
method: Update method - "replace" or "rolling"
|
|
181
|
+
"""
|
|
182
|
+
if method == "replace":
|
|
183
|
+
self.store(source, column_name, metric, new_value, sample_size=sample_size)
|
|
184
|
+
elif method == "rolling":
|
|
185
|
+
# Get existing baseline
|
|
186
|
+
existing = self.get(source, column_name, metric)
|
|
187
|
+
if existing and isinstance(existing.value, (int, float)):
|
|
188
|
+
# Rolling average
|
|
189
|
+
old_weight = 0.7 # Give more weight to historical
|
|
190
|
+
new_weight = 0.3
|
|
191
|
+
blended = old_weight * existing.value + new_weight * new_value
|
|
192
|
+
total_samples = (existing.sample_size or 0) + (sample_size or 0)
|
|
193
|
+
self.store(source, column_name, metric, blended, sample_size=total_samples)
|
|
194
|
+
else:
|
|
195
|
+
self.store(source, column_name, metric, new_value, sample_size=sample_size)
|
|
196
|
+
else:
|
|
197
|
+
raise ValueError(f"Unknown update method: {method}")
|
|
198
|
+
|
|
199
|
+
def delete(self, source: str) -> int:
|
|
200
|
+
"""Delete all baselines for a source.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
source: Data source path
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
Number of baselines deleted
|
|
207
|
+
"""
|
|
208
|
+
conn = self._storage._get_connection()
|
|
209
|
+
|
|
210
|
+
# Get count first
|
|
211
|
+
cursor = conn.execute(
|
|
212
|
+
"SELECT COUNT(*) FROM baselines WHERE source = ?",
|
|
213
|
+
(source,),
|
|
214
|
+
)
|
|
215
|
+
count = cursor.fetchone()[0]
|
|
216
|
+
|
|
217
|
+
conn.execute(QUERIES["delete_baselines_for_source"], (source,))
|
|
218
|
+
conn.commit()
|
|
219
|
+
|
|
220
|
+
return count
|
|
221
|
+
|
|
222
|
+
def _row_to_baseline(self, row) -> StoredBaseline:
|
|
223
|
+
"""Convert database row to StoredBaseline."""
|
|
224
|
+
value = json.loads(row["baseline_value"])
|
|
225
|
+
|
|
226
|
+
return StoredBaseline(
|
|
227
|
+
source=row["source"],
|
|
228
|
+
column_name=row["column_name"],
|
|
229
|
+
metric=row["metric"],
|
|
230
|
+
value=value,
|
|
231
|
+
sample_size=row["sample_size"],
|
|
232
|
+
created_at=datetime.fromisoformat(row["created_at"]),
|
|
233
|
+
updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@dataclass
|
|
238
|
+
class ColumnBaseline:
|
|
239
|
+
"""Complete baseline for a single column.
|
|
240
|
+
|
|
241
|
+
Attributes:
|
|
242
|
+
column_name: Column name
|
|
243
|
+
mean: Mean value
|
|
244
|
+
stddev: Standard deviation
|
|
245
|
+
min: Minimum value
|
|
246
|
+
max: Maximum value
|
|
247
|
+
median: Median value
|
|
248
|
+
null_percent: Percentage of nulls
|
|
249
|
+
unique_percent: Percentage of unique values
|
|
250
|
+
sample_size: Number of samples
|
|
251
|
+
distribution: Optional distribution histogram
|
|
252
|
+
"""
|
|
253
|
+
|
|
254
|
+
column_name: str
|
|
255
|
+
mean: float | None = None
|
|
256
|
+
stddev: float | None = None
|
|
257
|
+
min: float | None = None
|
|
258
|
+
max: float | None = None
|
|
259
|
+
median: float | None = None
|
|
260
|
+
null_percent: float | None = None
|
|
261
|
+
unique_percent: float | None = None
|
|
262
|
+
sample_size: int | None = None
|
|
263
|
+
distribution: dict[str, Any] | None = None
|
|
264
|
+
|
|
265
|
+
def to_dict(self) -> dict[str, Any]:
|
|
266
|
+
"""Convert to dictionary."""
|
|
267
|
+
return {
|
|
268
|
+
"column_name": self.column_name,
|
|
269
|
+
"mean": self.mean,
|
|
270
|
+
"stddev": self.stddev,
|
|
271
|
+
"min": self.min,
|
|
272
|
+
"max": self.max,
|
|
273
|
+
"median": self.median,
|
|
274
|
+
"null_percent": self.null_percent,
|
|
275
|
+
"unique_percent": self.unique_percent,
|
|
276
|
+
"sample_size": self.sample_size,
|
|
277
|
+
"distribution": self.distribution,
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
@classmethod
|
|
281
|
+
def from_dict(cls, data: dict[str, Any]) -> ColumnBaseline:
|
|
282
|
+
"""Create from dictionary."""
|
|
283
|
+
return cls(
|
|
284
|
+
column_name=data["column_name"],
|
|
285
|
+
mean=data.get("mean"),
|
|
286
|
+
stddev=data.get("stddev"),
|
|
287
|
+
min=data.get("min"),
|
|
288
|
+
max=data.get("max"),
|
|
289
|
+
median=data.get("median"),
|
|
290
|
+
null_percent=data.get("null_percent"),
|
|
291
|
+
unique_percent=data.get("unique_percent"),
|
|
292
|
+
sample_size=data.get("sample_size"),
|
|
293
|
+
distribution=data.get("distribution"),
|
|
294
|
+
)
|
duckguard/anomaly/detector.py
CHANGED
|
@@ -10,15 +10,11 @@ from datetime import datetime
|
|
|
10
10
|
from enum import Enum
|
|
11
11
|
from typing import Any
|
|
12
12
|
|
|
13
|
-
from duckguard.core.dataset import Dataset
|
|
14
13
|
from duckguard.anomaly.methods import (
|
|
15
|
-
AnomalyMethod,
|
|
16
14
|
AnomalyScore,
|
|
17
|
-
ZScoreMethod,
|
|
18
|
-
IQRMethod,
|
|
19
|
-
PercentChangeMethod,
|
|
20
15
|
create_method,
|
|
21
16
|
)
|
|
17
|
+
from duckguard.core.dataset import Dataset
|
|
22
18
|
|
|
23
19
|
|
|
24
20
|
class AnomalyType(Enum):
|
duckguard/anomaly/methods.py
CHANGED
|
@@ -5,10 +5,10 @@ Implements various statistical methods for detecting anomalies in data.
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
+
import math
|
|
8
9
|
from abc import ABC, abstractmethod
|
|
9
10
|
from dataclasses import dataclass, field
|
|
10
11
|
from typing import Any
|
|
11
|
-
import math
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
@@ -177,8 +177,6 @@ class IQRMethod(AnomalyMethod):
|
|
|
177
177
|
if not clean_values:
|
|
178
178
|
return
|
|
179
179
|
|
|
180
|
-
n = len(clean_values)
|
|
181
|
-
|
|
182
180
|
# Calculate Q1 and Q3
|
|
183
181
|
self._q1 = self._percentile(clean_values, 25)
|
|
184
182
|
self._q3 = self._percentile(clean_values, 75)
|
|
@@ -409,12 +407,22 @@ def create_method(
|
|
|
409
407
|
"""Create an anomaly detection method by name.
|
|
410
408
|
|
|
411
409
|
Args:
|
|
412
|
-
method_name: Name of the method
|
|
410
|
+
method_name: Name of the method. Options:
|
|
411
|
+
- "zscore", "z_score": Z-Score method
|
|
412
|
+
- "iqr": Interquartile Range method
|
|
413
|
+
- "percent_change", "pct_change": Percent change method
|
|
414
|
+
- "modified_zscore", "mad": Modified Z-Score (MAD) method
|
|
415
|
+
- "baseline": ML-based baseline comparison
|
|
416
|
+
- "ks_test": Kolmogorov-Smirnov distribution test
|
|
417
|
+
- "seasonal": Seasonal pattern detection
|
|
413
418
|
**kwargs: Method-specific parameters
|
|
414
419
|
|
|
415
420
|
Returns:
|
|
416
421
|
Configured AnomalyMethod
|
|
417
422
|
"""
|
|
423
|
+
# Import ML methods lazily to avoid circular imports
|
|
424
|
+
from duckguard.anomaly.ml_methods import BaselineMethod, KSTestMethod, SeasonalMethod
|
|
425
|
+
|
|
418
426
|
methods = {
|
|
419
427
|
"zscore": ZScoreMethod,
|
|
420
428
|
"z_score": ZScoreMethod,
|
|
@@ -423,10 +431,14 @@ def create_method(
|
|
|
423
431
|
"pct_change": PercentChangeMethod,
|
|
424
432
|
"modified_zscore": ModifiedZScoreMethod,
|
|
425
433
|
"mad": ModifiedZScoreMethod,
|
|
434
|
+
"baseline": BaselineMethod,
|
|
435
|
+
"ks_test": KSTestMethod,
|
|
436
|
+
"ks": KSTestMethod,
|
|
437
|
+
"seasonal": SeasonalMethod,
|
|
426
438
|
}
|
|
427
439
|
|
|
428
440
|
method_class = methods.get(method_name.lower())
|
|
429
441
|
if not method_class:
|
|
430
|
-
raise ValueError(f"Unknown anomaly method: {method_name}")
|
|
442
|
+
raise ValueError(f"Unknown anomaly method: {method_name}. Available: {list(methods.keys())}")
|
|
431
443
|
|
|
432
444
|
return method_class(**kwargs)
|