duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,348 @@
1
+ """Trend analysis for historical validation data.
2
+
3
+ Provides analysis of quality score trends over time,
4
+ helping identify patterns and regressions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Any
11
+
12
+ from duckguard.history.storage import HistoryStorage, TrendDataPoint
13
+
14
+
15
@dataclass
class TrendAnalysis:
    """Outcome of analyzing quality-score history for one data source.

    Attributes:
        source: The data source being analyzed
        period_days: Number of days analyzed
        current_score: Most recent quality score
        average_score: Average score over the period
        min_score: Minimum score in the period
        max_score: Maximum score in the period
        score_trend: Trend direction ("improving", "declining", "stable")
        trend_change: Percentage change in score
        total_runs: Total number of validation runs
        pass_rate: Percentage of runs that passed
        daily_data: Daily trend data points
        anomalies: List of dates with anomalous scores
    """

    source: str
    period_days: int
    current_score: float
    average_score: float
    min_score: float
    max_score: float
    score_trend: str
    trend_change: float
    total_runs: int
    pass_rate: float
    daily_data: list[TrendDataPoint] = field(default_factory=list)
    anomalies: list[str] = field(default_factory=list)

    @property
    def is_improving(self) -> bool:
        """True when the score trend is "improving"."""
        return self.score_trend == "improving"

    @property
    def is_declining(self) -> bool:
        """True when the score trend is "declining"."""
        return self.score_trend == "declining"

    @property
    def has_anomalies(self) -> bool:
        """True when at least one anomalous day was detected."""
        return bool(self.anomalies)

    def summary(self) -> str:
        """Return a one-line human-readable summary of the trend."""
        # Unknown trend values fall back to the "stable" marker.
        markers = {"improving": "[+]", "declining": "[-]", "stable": "[=]"}
        marker = markers.get(self.score_trend, "[=]")

        pieces = [
            f"{marker} Quality trend is {self.score_trend} ",
            f"({self.trend_change:+.1f}% over {self.period_days} days). ",
            f"Current score: {self.current_score:.1f}%, ",
            f"Average: {self.average_score:.1f}%, ",
            f"Pass rate: {self.pass_rate:.1f}%",
        ]
        return "".join(pieces)
75
+
76
+
77
class TrendAnalyzer:
    """Computes quality-score trend statistics from stored validation history.

    Usage:
        from duckguard.history import HistoryStorage, TrendAnalyzer

        storage = HistoryStorage()
        analyzer = TrendAnalyzer(storage)

        # Analyze trends for a source
        analysis = analyzer.analyze("data.csv", days=30)
        print(analysis.summary())

        # Check for regressions
        if analyzer.has_regression("data.csv"):
            print("Quality regression detected!")
    """

    # Minimum score delta (in percentage points) treated as a real change
    SIGNIFICANT_CHANGE_THRESHOLD = 5.0  # 5%

    # z-score cutoff used when flagging anomalous days
    ANOMALY_THRESHOLD = 2.0

    def __init__(self, storage: HistoryStorage):
        """Create an analyzer backed by the given history store.

        Args:
            storage: HistoryStorage instance
        """
        self.storage = storage

    def analyze(self, source: str, days: int = 30) -> TrendAnalysis:
        """Analyze quality trends for *source* over the last *days* days.

        Args:
            source: Data source path
            days: Number of days to analyze

        Returns:
            TrendAnalysis with detailed trend information
        """
        points = self.storage.get_trend(source, days=days)
        if not points:
            # No daily aggregates stored: fall back to the latest run, if any.
            return self._empty_analysis(source, days)

        scores = [point.avg_score for point in points]
        run_total = sum(point.run_count for point in points)
        passed_total = sum(point.passed_count for point in points)
        mean_score = sum(scores) / len(scores)

        direction, delta = self._calculate_trend(scores)

        return TrendAnalysis(
            source=source,
            period_days=days,
            current_score=scores[-1],
            average_score=mean_score,
            min_score=min(scores),
            max_score=max(scores),
            score_trend=direction,
            trend_change=delta,
            total_runs=run_total,
            pass_rate=(passed_total / run_total * 100) if run_total > 0 else 0.0,
            daily_data=points,
            anomalies=self._detect_anomalies(points, mean_score, scores),
        )

    def _empty_analysis(self, source: str, days: int) -> TrendAnalysis:
        """Build the degenerate analysis returned when no daily data exists."""
        latest = self.storage.get_latest_run(source)
        score = latest.quality_score if latest else 0.0
        return TrendAnalysis(
            source=source,
            period_days=days,
            current_score=score,
            average_score=score,
            min_score=score,
            max_score=score,
            score_trend="stable",
            trend_change=0.0,
            total_runs=1 if latest else 0,
            pass_rate=100.0 if latest and latest.passed else 0.0,
            daily_data=[],
            anomalies=[],
        )

    def has_regression(
        self,
        source: str,
        threshold: float | None = None,
        days: int = 7,
    ) -> bool:
        """Check whether quality declined significantly vs the prior period.

        The window of ``days * 2`` days is split in half; a regression means
        the recent half's average score dropped by at least *threshold*
        points relative to the earlier half.

        Args:
            source: Data source path
            threshold: Score drop threshold (default: SIGNIFICANT_CHANGE_THRESHOLD)
            days: Number of days to compare

        Returns:
            True if a regression is detected
        """
        limit = self.SIGNIFICANT_CHANGE_THRESHOLD if threshold is None else threshold

        history = self.analyze(source, days=days * 2).daily_data
        if len(history) < 2:
            return False

        mid = len(history) // 2
        older = [point.avg_score for point in history[:mid]]
        newer = [point.avg_score for point in history[mid:]]
        if not older or not newer:
            return False

        older_avg = sum(older) / len(older)
        newer_avg = sum(newer) / len(newer)
        return (older_avg - newer_avg) >= limit

    def compare_periods(
        self,
        source: str,
        period1_days: int = 7,
        period2_days: int = 7,
    ) -> dict[str, Any]:
        """Compare average quality between a recent and a preceding period.

        Args:
            source: Data source path
            period1_days: Days in recent period
            period2_days: Days in comparison period

        Returns:
            Dictionary with comparison metrics
        """
        daily_data = self.storage.get_trend(source, days=period1_days + period2_days)
        if not daily_data:
            return {
                "recent_avg": 0.0,
                "previous_avg": 0.0,
                "change": 0.0,
                "change_percent": 0.0,
                "improved": False,
            }

        # The newest period1_days entries form the "recent" window; anything
        # older belongs to the comparison window.
        if len(daily_data) <= period1_days:
            recent, previous = daily_data, []
        else:
            recent, previous = daily_data[-period1_days:], daily_data[:-period1_days]

        recent_avg = (
            sum(p.avg_score for p in recent) / len(recent) if recent else 0.0
        )
        # With no prior data, treat the periods as equal (zero change).
        previous_avg = (
            sum(p.avg_score for p in previous) / len(previous)
            if previous
            else recent_avg
        )

        change = recent_avg - previous_avg
        return {
            "recent_avg": recent_avg,
            "previous_avg": previous_avg,
            "change": change,
            "change_percent": (change / previous_avg * 100) if previous_avg > 0 else 0.0,
            "improved": change > 0,
        }

    def _calculate_trend(self, scores: list[float]) -> tuple[str, float]:
        """Classify the trend direction and return its magnitude.

        Args:
            scores: List of scores ordered by date

        Returns:
            Tuple of (trend_direction, change_percentage)
        """
        if len(scores) < 2:
            return "stable", 0.0

        # Compare averages of the oldest and newest windows (up to a week each).
        window = min(7, len(scores) // 2) or 1
        first_avg = sum(scores[:window]) / window
        last_avg = sum(scores[-window:]) / window
        delta = last_avg - first_avg

        if delta >= self.SIGNIFICANT_CHANGE_THRESHOLD:
            return "improving", delta
        if delta <= -self.SIGNIFICANT_CHANGE_THRESHOLD:
            return "declining", delta
        return "stable", delta

    def _detect_anomalies(
        self,
        daily_data: list[TrendDataPoint],
        mean: float,
        scores: list[float],
    ) -> list[str]:
        """Flag days whose score deviates strongly from the mean.

        Args:
            daily_data: List of daily trend data
            mean: Mean score
            scores: List of scores

        Returns:
            List of dates with anomalous scores
        """
        if len(scores) < 3:
            return []

        # Population standard deviation over the period.
        variance = sum((s - mean) ** 2 for s in scores) / len(scores)
        stddev = variance**0.5
        if stddev == 0:
            return []

        return [
            point.date
            for point in daily_data
            if abs(point.avg_score - mean) / stddev > self.ANOMALY_THRESHOLD
        ]
329
+
330
+
331
def analyze_trends(
    source: str,
    days: int = 30,
    db_path: str | None = None,
) -> TrendAnalysis:
    """Convenience wrapper: open storage, analyze, and return the result.

    Args:
        source: Data source path
        days: Number of days to analyze
        db_path: Path to history database

    Returns:
        TrendAnalysis result
    """
    return TrendAnalyzer(HistoryStorage(db_path=db_path)).analyze(source, days=days)
@@ -0,0 +1,31 @@
1
+ """
2
+ DuckGuard Integrations - Connect with dbt, Airflow, and more.
3
+
4
+ Usage:
5
+ from duckguard.integrations import dbt
6
+
7
+ # Export DuckGuard rules to dbt tests
8
+ dbt.export_to_schema("duckguard.yaml", "models/schema.yml")
9
+
10
+ # Generate dbt generic tests from rules
11
+ tests = dbt.rules_to_dbt_tests(rules)
12
+
13
+ # Airflow integration (requires apache-airflow)
14
+ from duckguard.integrations.airflow import DuckGuardOperator
15
+
16
+ check_task = DuckGuardOperator(
17
+ task_id="check_quality",
18
+ source="s3://bucket/data.parquet",
19
+ rules="rules.yaml",
20
+ )
21
+ """
22
+
23
+ from duckguard.integrations import dbt
24
+
25
+ # Airflow integration is optional - only import if available
26
+ try:
27
+ from duckguard.integrations import airflow
28
+ except ImportError:
29
+ airflow = None # type: ignore[assignment]
30
+
31
+ __all__ = ["dbt", "airflow"]