duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/__init__.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DuckGuard - Data quality that just works.
|
|
3
|
+
|
|
4
|
+
A Python-native data quality tool built on DuckDB for speed.
|
|
5
|
+
Features YAML-based rules, semantic type detection, data contracts,
|
|
6
|
+
and anomaly detection.
|
|
7
|
+
|
|
8
|
+
Quick Start:
|
|
9
|
+
# Python API
|
|
10
|
+
from duckguard import connect
|
|
11
|
+
orders = connect("data/orders.csv")
|
|
12
|
+
assert orders.row_count > 0
|
|
13
|
+
assert orders.customer_id.null_percent == 0
|
|
14
|
+
|
|
15
|
+
# CLI
|
|
16
|
+
$ duckguard check data.csv
|
|
17
|
+
$ duckguard discover data.csv --output duckguard.yaml
|
|
18
|
+
$ duckguard contract generate data.csv
|
|
19
|
+
|
|
20
|
+
Documentation: https://github.com/duckguard/duckguard
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
# Core classes
|
|
24
|
+
from duckguard.core.dataset import Dataset
|
|
25
|
+
from duckguard.core.column import Column
|
|
26
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
27
|
+
from duckguard.core.result import ValidationResult, CheckResult
|
|
28
|
+
from duckguard.core.scoring import QualityScore, QualityScorer, score
|
|
29
|
+
|
|
30
|
+
# Connectors
|
|
31
|
+
from duckguard.connectors import connect
|
|
32
|
+
|
|
33
|
+
# Profiling
|
|
34
|
+
from duckguard.profiler import profile, AutoProfiler
|
|
35
|
+
|
|
36
|
+
# Rules (YAML-based)
|
|
37
|
+
from duckguard.rules import (
|
|
38
|
+
load_rules,
|
|
39
|
+
load_rules_from_string,
|
|
40
|
+
execute_rules,
|
|
41
|
+
generate_rules,
|
|
42
|
+
RuleSet,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# Semantic type detection
|
|
46
|
+
from duckguard.semantic import (
|
|
47
|
+
SemanticType,
|
|
48
|
+
SemanticAnalyzer,
|
|
49
|
+
detect_type,
|
|
50
|
+
detect_types_for_dataset,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Data contracts
|
|
54
|
+
from duckguard.contracts import (
|
|
55
|
+
DataContract,
|
|
56
|
+
load_contract,
|
|
57
|
+
validate_contract,
|
|
58
|
+
generate_contract,
|
|
59
|
+
diff_contracts,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Anomaly detection
|
|
63
|
+
from duckguard.anomaly import (
|
|
64
|
+
AnomalyDetector,
|
|
65
|
+
AnomalyResult,
|
|
66
|
+
detect_anomalies,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
__version__ = "2.0.0"
|
|
70
|
+
|
|
71
|
+
__all__ = [
|
|
72
|
+
# Core classes
|
|
73
|
+
"Dataset",
|
|
74
|
+
"Column",
|
|
75
|
+
"DuckGuardEngine",
|
|
76
|
+
"ValidationResult",
|
|
77
|
+
"CheckResult",
|
|
78
|
+
# Scoring
|
|
79
|
+
"QualityScore",
|
|
80
|
+
"QualityScorer",
|
|
81
|
+
"score",
|
|
82
|
+
# Connectors
|
|
83
|
+
"connect",
|
|
84
|
+
# Profiling
|
|
85
|
+
"profile",
|
|
86
|
+
"AutoProfiler",
|
|
87
|
+
# Rules
|
|
88
|
+
"load_rules",
|
|
89
|
+
"load_rules_from_string",
|
|
90
|
+
"execute_rules",
|
|
91
|
+
"generate_rules",
|
|
92
|
+
"RuleSet",
|
|
93
|
+
# Semantic
|
|
94
|
+
"SemanticType",
|
|
95
|
+
"SemanticAnalyzer",
|
|
96
|
+
"detect_type",
|
|
97
|
+
"detect_types_for_dataset",
|
|
98
|
+
# Contracts
|
|
99
|
+
"DataContract",
|
|
100
|
+
"load_contract",
|
|
101
|
+
"validate_contract",
|
|
102
|
+
"generate_contract",
|
|
103
|
+
"diff_contracts",
|
|
104
|
+
# Anomaly
|
|
105
|
+
"AnomalyDetector",
|
|
106
|
+
"AnomalyResult",
|
|
107
|
+
"detect_anomalies",
|
|
108
|
+
# Version
|
|
109
|
+
"__version__",
|
|
110
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Anomaly detection for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Provides statistical and ML-based anomaly detection for data quality monitoring.
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
from duckguard.anomaly import detect_anomalies, AnomalyDetector
|
|
7
|
+
|
|
8
|
+
detector = AnomalyDetector()
|
|
9
|
+
anomalies = detector.detect(dataset, column="amount")
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from duckguard.anomaly.detector import (
|
|
13
|
+
AnomalyDetector,
|
|
14
|
+
AnomalyResult,
|
|
15
|
+
AnomalyType,
|
|
16
|
+
detect_anomalies,
|
|
17
|
+
detect_column_anomalies,
|
|
18
|
+
)
|
|
19
|
+
from duckguard.anomaly.methods import (
|
|
20
|
+
ZScoreMethod,
|
|
21
|
+
IQRMethod,
|
|
22
|
+
PercentChangeMethod,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"AnomalyDetector",
|
|
27
|
+
"AnomalyResult",
|
|
28
|
+
"AnomalyType",
|
|
29
|
+
"detect_anomalies",
|
|
30
|
+
"detect_column_anomalies",
|
|
31
|
+
"ZScoreMethod",
|
|
32
|
+
"IQRMethod",
|
|
33
|
+
"PercentChangeMethod",
|
|
34
|
+
]
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"""High-level anomaly detector for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Provides easy-to-use anomaly detection for datasets and columns.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from duckguard.core.dataset import Dataset
|
|
14
|
+
from duckguard.anomaly.methods import (
|
|
15
|
+
AnomalyMethod,
|
|
16
|
+
AnomalyScore,
|
|
17
|
+
ZScoreMethod,
|
|
18
|
+
IQRMethod,
|
|
19
|
+
PercentChangeMethod,
|
|
20
|
+
create_method,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class AnomalyType(Enum):
    """Category of a detected data-quality anomaly.

    Only VALUE_OUTLIER and NULL_SPIKE are produced by the detector in this
    module; the remaining members are presumably emitted by other components
    (e.g. volume/trend monitoring) — confirm before relying on them.
    """

    VALUE_OUTLIER = "value_outlier"  # Individual value is unusual
    DISTRIBUTION_SHIFT = "distribution_shift"  # Overall distribution changed
    VOLUME_ANOMALY = "volume_anomaly"  # Row count anomaly
    NULL_SPIKE = "null_spike"  # Unusual increase in nulls
    CARDINALITY_CHANGE = "cardinality_change"  # Number of distinct values changed
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class AnomalyResult:
    """Result of a single anomaly check (column-level or table-level).

    Attributes:
        column: Column name (None for table-level checks)
        anomaly_type: Type of anomaly (see AnomalyType)
        is_anomaly: Whether an anomaly was detected
        score: Anomaly score; scale depends on the check (e.g. a z-score for
            value outliers, a raw null percentage for NULL_SPIKE)
        threshold: Detection threshold the score was compared against
        message: Human-readable message describing the finding
        details: Additional method-specific details (summary stats, or an
            "error" key when analysis failed)
        samples: Sample anomalous values (empty when no anomaly was found)
        detected_at: When the anomaly was detected (naive local time)
    """

    column: str | None
    anomaly_type: AnomalyType
    is_anomaly: bool
    score: float
    threshold: float
    message: str
    # Mutable defaults use default_factory so instances never share state.
    details: dict[str, Any] = field(default_factory=dict)
    samples: list[Any] = field(default_factory=list)
    # Timestamp captured when the result object is created.
    detected_at: datetime = field(default_factory=datetime.now)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class DatasetAnomalyReport:
    """Collected anomaly-check results for one dataset scan.

    Attributes:
        source: Data source path
        anomalies: All check results (both flagged and clean)
        checked_at: When the scan was performed (naive local time)
        statistics: Detection statistics recorded by the detector
    """

    source: str
    anomalies: list[AnomalyResult] = field(default_factory=list)
    checked_at: datetime = field(default_factory=datetime.now)
    statistics: dict[str, Any] = field(default_factory=dict)

    @property
    def has_anomalies(self) -> bool:
        """True when at least one result is flagged as anomalous."""
        return bool(self.get_anomalies())

    @property
    def anomaly_count(self) -> int:
        """Number of flagged results."""
        return len(self.get_anomalies())

    def get_anomalies(self) -> list[AnomalyResult]:
        """Return only the results that were flagged as anomalies."""
        flagged = []
        for result in self.anomalies:
            if result.is_anomaly:
                flagged.append(result)
        return flagged

    def summary(self) -> str:
        """One line per flagged anomaly, or a clean-scan notice."""
        flagged = self.get_anomalies()
        if not flagged:
            return "No anomalies detected."

        parts = [f"Detected {len(flagged)} anomalies:"]
        for item in flagged:
            # Table-level results carry no column name.
            label = "[table]" if not item.column else f"[{item.column}]"
            parts.append(f"  ⚠️ {label} {item.message}")

        return "\n".join(parts)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class AnomalyDetector:
    """Runs anomaly checks against a Dataset and its columns."""

    def __init__(
        self,
        method: str = "zscore",
        threshold: float | None = None,
        **method_kwargs
    ):
        """Configure the detector.

        Args:
            method: Detection method ("zscore", "iqr", "percent_change")
            threshold: Detection threshold (method-specific default if None)
            **method_kwargs: Additional method parameters
        """
        self.method_name = method

        # Fall back to a per-method default; unrecognized methods get 3.0.
        if threshold is None:
            threshold = {"zscore": 3.0, "iqr": 1.5, "percent_change": 0.2}.get(method, 3.0)

        self.threshold = threshold
        self.method_kwargs = method_kwargs

    def detect(
        self,
        dataset: Dataset,
        columns: list[str] | None = None,
        include_row_count: bool = True,
        include_null_check: bool = True,
    ) -> DatasetAnomalyReport:
        """Run anomaly detection across a dataset.

        Args:
            dataset: Dataset to analyze
            columns: Specific columns to check (None = all numeric)
            include_row_count: Check for row count anomalies.
                NOTE(review): accepted but currently unused by this method.
            include_null_check: Check for null percentage spikes

        Returns:
            DatasetAnomalyReport
        """
        report = DatasetAnomalyReport(source=dataset.source)

        target_cols = self._get_numeric_columns(dataset) if columns is None else columns

        # One value-outlier result per selected column (clean or flagged).
        report.anomalies.extend(
            self.detect_column(dataset, name) for name in target_cols
        )

        # Null-spike checks run over every column, but only positives are kept.
        if include_null_check:
            report.anomalies.extend(
                hit
                for hit in (
                    self._check_null_anomaly(dataset, name)
                    for name in dataset.columns
                )
                if hit.is_anomaly
            )

        report.statistics = {
            "columns_checked": len(target_cols),
            "method": self.method_name,
            "threshold": self.threshold,
        }

        return report

    def detect_column(
        self,
        dataset: Dataset,
        column: str,
        baseline_values: list[float] | None = None
    ) -> AnomalyResult:
        """Score one column for value outliers.

        Args:
            dataset: Dataset to analyze
            column: Column name
            baseline_values: Historical values for comparison

        Returns:
            AnomalyResult. Never raises: analysis errors come back as a
            non-anomalous result with an "error" detail.
        """
        col = dataset[column]

        try:
            avg = col.mean
            # A column without a mean is treated as non-numeric and skipped.
            if avg is None:
                return AnomalyResult(
                    column=column,
                    anomaly_type=AnomalyType.VALUE_OUTLIER,
                    is_anomaly=False,
                    score=0.0,
                    threshold=self.threshold,
                    message=f"Column '{column}' is not numeric",
                    details={"reason": "not_numeric"},
                )

            lo = col.min
            hi = col.max
            spread = col.stddev or 0

            # Built unconditionally (as before) so configuration errors
            # surface even when the baseline path is not taken.
            scorer = create_method(
                self.method_name,
                threshold=self.threshold,
                **self.method_kwargs
            )

            if baseline_values:
                # With history available: fit on the baseline, then score the
                # column's summary stats (min, max, mean) against it.
                scorer.fit(baseline_values)
                candidates = [scorer.score(v) for v in (lo, hi, avg)]
                worst = max(candidates, key=lambda s: s.score)
                flagged = worst.is_anomaly
            elif spread > 0:
                # No baseline: z-score the extremes against the column's own
                # mean/stddev and flag whichever side deviates most.
                z_lo = abs(lo - avg) / spread if lo is not None else 0
                z_hi = abs(hi - avg) / spread if hi is not None else 0

                peak = max(z_lo, z_hi)
                flagged = peak > self.threshold

                worst = AnomalyScore(
                    value=hi if z_hi > z_lo else lo,
                    score=peak,
                    is_anomaly=flagged,
                    threshold=self.threshold,
                    details={
                        "mean": avg,
                        "stddev": spread,
                        "min": lo,
                        "max": hi,
                    }
                )
            else:
                # Zero spread (constant column): nothing can be an outlier.
                flagged = False
                worst = AnomalyScore(
                    value=avg,
                    score=0.0,
                    is_anomaly=False,
                    threshold=self.threshold,
                )

            return AnomalyResult(
                column=column,
                anomaly_type=AnomalyType.VALUE_OUTLIER,
                is_anomaly=flagged,
                score=worst.score,
                threshold=self.threshold,
                message=self._build_message(column, worst, avg, spread),
                details={
                    "mean": avg,
                    "stddev": spread,
                    "min": lo,
                    "max": hi,
                    "method": self.method_name,
                    # Merged last so method-provided keys win on collision.
                    **worst.details,
                },
                samples=[worst.value] if flagged else [],
            )

        except Exception as e:
            # Downgrade analysis failures so one bad column cannot abort a
            # whole dataset scan.
            return AnomalyResult(
                column=column,
                anomaly_type=AnomalyType.VALUE_OUTLIER,
                is_anomaly=False,
                score=0.0,
                threshold=self.threshold,
                message=f"Error analyzing column '{column}': {e}",
                details={"error": str(e)},
            )

    def _check_null_anomaly(
        self,
        dataset: Dataset,
        column: str,
        expected_null_pct: float = 5.0
    ) -> AnomalyResult:
        """Flag columns whose null percentage is well above expectations."""
        target = dataset[column]
        pct = target.null_percent

        # Anomalous only when both hold: more than double the expected rate
        # AND above 10% absolute, so mildly sparse columns are not flagged.
        spiked = pct > expected_null_pct * 2 and pct > 10

        return AnomalyResult(
            column=column,
            anomaly_type=AnomalyType.NULL_SPIKE,
            is_anomaly=spiked,
            score=pct,
            threshold=expected_null_pct,
            message=f"Column '{column}' has {pct:.1f}% nulls (threshold: {expected_null_pct}%)",
            details={
                "null_percent": pct,
                "null_count": target.null_count,
                "expected_max": expected_null_pct,
            },
        )

    def _get_numeric_columns(self, dataset: Dataset) -> list[str]:
        """Names of columns that report a numeric mean."""
        names: list[str] = []
        for name in dataset.columns:
            try:
                numeric = dataset[name].mean is not None
            except Exception:
                # Columns that cannot be profiled are simply skipped.
                continue
            if numeric:
                names.append(name)
        return names

    def _build_message(
        self,
        column: str,
        worst: AnomalyScore,
        mean: float,
        stddev: float
    ) -> str:
        """Render a one-line description of a scored column."""
        if not worst.is_anomaly:
            return f"Column '{column}' values are within normal range"

        # Scoring methods may annotate which side of the mean deviated;
        # absent that hint we fall back to a generic score message.
        direction = worst.details.get("deviation_direction", "")
        if direction == "above":
            return f"Column '{column}' has unusually high values (max: {worst.value:.2f}, mean: {mean:.2f})"
        if direction == "below":
            return f"Column '{column}' has unusually low values (min: {worst.value:.2f}, mean: {mean:.2f})"
        return f"Column '{column}' has anomalous values (score: {worst.score:.2f})"
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def detect_anomalies(
    dataset: Dataset,
    method: str = "zscore",
    threshold: float | None = None,
    columns: list[str] | None = None,
    **method_kwargs,
) -> DatasetAnomalyReport:
    """Detect anomalies in a dataset (convenience wrapper over AnomalyDetector).

    Args:
        dataset: Dataset to analyze
        method: Detection method ("zscore", "iqr", "percent_change")
        threshold: Detection threshold (method-specific default if None)
        columns: Columns to check (None = all numeric)
        **method_kwargs: Extra method-specific parameters, forwarded to
            AnomalyDetector. Previously this wrapper offered no way to pass
            them, unlike the class it wraps.

    Returns:
        DatasetAnomalyReport
    """
    detector = AnomalyDetector(method=method, threshold=threshold, **method_kwargs)
    return detector.detect(dataset, columns=columns)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def detect_column_anomalies(
    dataset: Dataset,
    column: str,
    method: str = "zscore",
    threshold: float | None = None,
    baseline: list[float] | None = None,
    **method_kwargs,
) -> AnomalyResult:
    """Detect anomalies in a specific column (wrapper over AnomalyDetector).

    Args:
        dataset: Dataset
        column: Column name
        method: Detection method ("zscore", "iqr", "percent_change")
        threshold: Detection threshold (method-specific default if None)
        baseline: Historical values for comparison
        **method_kwargs: Extra method-specific parameters, forwarded to
            AnomalyDetector. Previously this wrapper offered no way to pass
            them, unlike the class it wraps.

    Returns:
        AnomalyResult
    """
    detector = AnomalyDetector(method=method, threshold=threshold, **method_kwargs)
    return detector.detect_column(dataset, column, baseline_values=baseline)
|