duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
"""Trend analysis for historical validation data.
|
|
2
|
+
|
|
3
|
+
Provides analysis of quality score trends over time,
|
|
4
|
+
helping identify patterns and regressions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from duckguard.history.storage import HistoryStorage, TrendDataPoint
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class TrendAnalysis:
    """Outcome of analyzing quality-score history for one data source.

    Attributes:
        source: The data source being analyzed
        period_days: Number of days analyzed
        current_score: Most recent quality score
        average_score: Average score over the period
        min_score: Minimum score in the period
        max_score: Maximum score in the period
        score_trend: Trend direction ("improving", "declining", "stable")
        trend_change: Change in score over the period (in score points)
        total_runs: Total number of validation runs
        pass_rate: Percentage of runs that passed
        daily_data: Daily trend data points
        anomalies: List of dates with anomalous scores
    """

    source: str
    period_days: int
    current_score: float
    average_score: float
    min_score: float
    max_score: float
    score_trend: str
    trend_change: float
    total_runs: int
    pass_rate: float
    daily_data: list[TrendDataPoint] = field(default_factory=list)
    anomalies: list[str] = field(default_factory=list)

    @property
    def is_improving(self) -> bool:
        """True when the score trend is "improving"."""
        return self.score_trend == "improving"

    @property
    def is_declining(self) -> bool:
        """True when the score trend is "declining"."""
        return self.score_trend == "declining"

    @property
    def has_anomalies(self) -> bool:
        """True when at least one anomalous day was detected."""
        return bool(self.anomalies)

    def summary(self) -> str:
        """Build a one-line, human-readable description of the trend."""
        # Unknown trend values fall back to the neutral "[=]" marker.
        markers = {"improving": "[+]", "declining": "[-]", "stable": "[=]"}
        marker = markers.get(self.score_trend, "[=]")

        pieces = [
            f"{marker} Quality trend is {self.score_trend} ",
            f"({self.trend_change:+.1f}% over {self.period_days} days). ",
            f"Current score: {self.current_score:.1f}%, ",
            f"Average: {self.average_score:.1f}%, ",
            f"Pass rate: {self.pass_rate:.1f}%",
        ]
        return "".join(pieces)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class TrendAnalyzer:
    """Analyzes quality score trends over time.

    Usage:
        from duckguard.history import HistoryStorage, TrendAnalyzer

        storage = HistoryStorage()
        analyzer = TrendAnalyzer(storage)

        # Analyze trends for a source
        analysis = analyzer.analyze("data.csv", days=30)
        print(analysis.summary())

        # Check for regressions
        if analyzer.has_regression("data.csv"):
            print("Quality regression detected!")
    """

    # Minimum score movement (in points) treated as a meaningful change
    SIGNIFICANT_CHANGE_THRESHOLD = 5.0  # 5%

    # Z-score cutoff for flagging a day as anomalous
    ANOMALY_THRESHOLD = 2.0

    def __init__(self, storage: HistoryStorage):
        """Initialize analyzer.

        Args:
            storage: HistoryStorage instance
        """
        self.storage = storage

    def analyze(self, source: str, days: int = 30) -> TrendAnalysis:
        """Analyze quality-score trends for a data source.

        Args:
            source: Data source path
            days: Number of days to analyze

        Returns:
            TrendAnalysis with detailed trend information
        """
        points = self.storage.get_trend(source, days=days)

        if not points:
            # No daily aggregates yet -- fall back to the latest run, if any.
            last = self.storage.get_latest_run(source)
            score = last.quality_score if last else 0.0
            return TrendAnalysis(
                source=source,
                period_days=days,
                current_score=score,
                average_score=score,
                min_score=score,
                max_score=score,
                score_trend="stable",
                trend_change=0.0,
                total_runs=1 if last else 0,
                pass_rate=100.0 if last and last.passed else 0.0,
                daily_data=[],
                anomalies=[],
            )

        # Aggregate statistics across the daily data points.
        day_scores = [p.avg_score for p in points]
        runs = sum(p.run_count for p in points)
        passed = sum(p.passed_count for p in points)
        mean_score = sum(day_scores) / len(day_scores)

        direction, delta = self._calculate_trend(day_scores)

        return TrendAnalysis(
            source=source,
            period_days=days,
            current_score=day_scores[-1],
            average_score=mean_score,
            min_score=min(day_scores),
            max_score=max(day_scores),
            score_trend=direction,
            trend_change=delta,
            total_runs=runs,
            pass_rate=(passed / runs * 100) if runs > 0 else 0.0,
            daily_data=points,
            anomalies=self._detect_anomalies(points, mean_score, day_scores),
        )

    def has_regression(
        self,
        source: str,
        threshold: float | None = None,
        days: int = 7,
    ) -> bool:
        """Check if there's been a quality regression.

        A regression is a drop of at least ``threshold`` points when comparing
        the average score of the most recent half of the window against the
        earlier half.

        Args:
            source: Data source path
            threshold: Score drop threshold (default: SIGNIFICANT_CHANGE_THRESHOLD)
            days: Number of days to compare

        Returns:
            True if a regression is detected
        """
        limit = self.SIGNIFICANT_CHANGE_THRESHOLD if threshold is None else threshold

        # Fetch twice the window so both halves cover ``days`` days.
        analysis = self.analyze(source, days=days * 2)
        points = analysis.daily_data
        if len(points) < 2:
            return False

        half = len(points) // 2
        older = [p.avg_score for p in points[:half]]
        newer = [p.avg_score for p in points[half:]]
        if not older or not newer:
            return False

        drop = sum(older) / len(older) - sum(newer) / len(newer)
        return drop >= limit

    def compare_periods(
        self,
        source: str,
        period1_days: int = 7,
        period2_days: int = 7,
    ) -> dict[str, Any]:
        """Compare quality metrics between two periods.

        Args:
            source: Data source path
            period1_days: Days in recent period
            period2_days: Days in comparison period

        Returns:
            Dictionary with comparison metrics
        """
        points = self.storage.get_trend(source, days=period1_days + period2_days)

        if not points:
            return {
                "recent_avg": 0.0,
                "previous_avg": 0.0,
                "change": 0.0,
                "change_percent": 0.0,
                "improved": False,
            }

        # The tail of the series is the "recent" period; everything before
        # it is the comparison period.
        if len(points) <= period1_days:
            recent, previous = points, []
        else:
            recent = points[-period1_days:]
            previous = points[:-period1_days]

        recent_avg = (
            sum(p.avg_score for p in recent) / len(recent) if recent else 0.0
        )
        if previous:
            previous_avg = sum(p.avg_score for p in previous) / len(previous)
        else:
            # With no comparison data, treat the periods as equal.
            previous_avg = recent_avg

        delta = recent_avg - previous_avg
        pct = (delta / previous_avg * 100) if previous_avg > 0 else 0.0

        return {
            "recent_avg": recent_avg,
            "previous_avg": previous_avg,
            "change": delta,
            "change_percent": pct,
            "improved": delta > 0,
        }

    def _calculate_trend(self, scores: list[float]) -> tuple[str, float]:
        """Classify the trend direction and magnitude of a score series.

        Args:
            scores: List of scores ordered by date

        Returns:
            Tuple of (trend_direction, change_percentage)
        """
        if len(scores) < 2:
            return "stable", 0.0

        # Compare averages of the leading and trailing windows
        # (up to a week each, never zero-length).
        span = min(7, len(scores) // 2) or 1
        start_avg = sum(scores[:span]) / span
        end_avg = sum(scores[-span:]) / span
        shift = end_avg - start_avg

        if shift >= self.SIGNIFICANT_CHANGE_THRESHOLD:
            return "improving", shift
        if shift <= -self.SIGNIFICANT_CHANGE_THRESHOLD:
            return "declining", shift
        return "stable", shift

    def _detect_anomalies(
        self,
        daily_data: list[TrendDataPoint],
        mean: float,
        scores: list[float],
    ) -> list[str]:
        """Find days whose score deviates strongly from the mean.

        Args:
            daily_data: List of daily trend data
            mean: Mean score
            scores: List of scores

        Returns:
            List of dates with anomalous scores
        """
        if len(scores) < 3:
            return []

        # Population standard deviation of the daily scores.
        stddev = (sum((s - mean) ** 2 for s in scores) / len(scores)) ** 0.5
        if stddev == 0:
            return []

        return [
            point.date
            for point in daily_data
            if abs(point.avg_score - mean) / stddev > self.ANOMALY_THRESHOLD
        ]
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def analyze_trends(
    source: str,
    days: int = 30,
    db_path: str | None = None,
) -> TrendAnalysis:
    """Convenience function for one-shot trend analysis.

    Builds a ``TrendAnalyzer`` backed by a fresh ``HistoryStorage`` and runs
    a single analysis over the requested window.

    Args:
        source: Data source path
        days: Number of days to analyze
        db_path: Path to history database

    Returns:
        TrendAnalysis result
    """
    analyzer = TrendAnalyzer(HistoryStorage(db_path=db_path))
    return analyzer.analyze(source, days=days)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DuckGuard Integrations - Connect with dbt, Airflow, and more.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from duckguard.integrations import dbt
|
|
6
|
+
|
|
7
|
+
# Export DuckGuard rules to dbt tests
|
|
8
|
+
dbt.export_to_schema("duckguard.yaml", "models/schema.yml")
|
|
9
|
+
|
|
10
|
+
# Generate dbt generic tests from rules
|
|
11
|
+
tests = dbt.rules_to_dbt_tests(rules)
|
|
12
|
+
|
|
13
|
+
# Airflow integration (requires apache-airflow)
|
|
14
|
+
from duckguard.integrations.airflow import DuckGuardOperator
|
|
15
|
+
|
|
16
|
+
check_task = DuckGuardOperator(
|
|
17
|
+
task_id="check_quality",
|
|
18
|
+
source="s3://bucket/data.parquet",
|
|
19
|
+
rules="rules.yaml",
|
|
20
|
+
)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from duckguard.integrations import dbt
|
|
24
|
+
|
|
25
|
+
# Airflow integration is optional - only import if available
|
|
26
|
+
try:
|
|
27
|
+
from duckguard.integrations import airflow
|
|
28
|
+
except ImportError:
|
|
29
|
+
airflow = None # type: ignore[assignment]
|
|
30
|
+
|
|
31
|
+
__all__ = ["dbt", "airflow"]
|