duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,429 @@
1
+ """Freshness monitoring implementation.
2
+
3
+ Provides functionality to check data freshness via file modification times
4
+ and timestamp columns in the data.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from dataclasses import dataclass
11
+ from datetime import datetime, timedelta
12
+ from enum import Enum
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any
15
+ from urllib.parse import urlparse
16
+
17
+ if TYPE_CHECKING:
18
+ from duckguard.core.dataset import Dataset
19
+
20
+
21
class FreshnessMethod(str, Enum):
    """Strategy that produced a freshness measurement.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # Modification time of a file on the local filesystem.
    FILE_MTIME = "file_mtime"
    # Most recent value (SQL MAX) of a timestamp column.
    COLUMN_MAX = "column_max"
    # Oldest value (SQL MIN) of a timestamp column.
    COLUMN_MIN = "column_min"
    # Declared for metadata-based checks; not produced by the code in this module.
    METADATA = "metadata"
    # No method was applicable to the source.
    UNKNOWN = "unknown"
29
+
30
+
31
@dataclass
class FreshnessResult:
    """Outcome of one freshness check.

    Attributes:
        source: Data source path the check was run against
        last_modified: When the data was last modified (None if unknown)
        age_seconds: Age of the data in seconds (None if unknown)
        age_human: Age rendered for humans, or a short failure description
        is_fresh: True when the data satisfies the freshness threshold
        threshold_seconds: Threshold that was applied (None if no threshold)
        method: How the freshness was determined
        details: Optional extra information about the check
    """

    source: str
    last_modified: datetime | None
    age_seconds: float | None
    age_human: str
    is_fresh: bool
    threshold_seconds: float | None
    method: FreshnessMethod
    details: dict[str, Any] | None = None

    def __str__(self) -> str:
        """Render a one-line summary: status, source, age and method."""
        if self.is_fresh:
            status = "FRESH"
        else:
            status = "STALE"
        return f"[{status}] {self.source}: {self.age_human} (method: {self.method.value})"

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        ``last_modified`` becomes an ISO-8601 string (or None) and ``method``
        becomes its string value; all other fields are copied unchanged.
        """
        modified_iso = None
        if self.last_modified:
            modified_iso = self.last_modified.isoformat()

        payload: dict[str, Any] = {}
        payload["source"] = self.source
        payload["last_modified"] = modified_iso
        payload["age_seconds"] = self.age_seconds
        payload["age_human"] = self.age_human
        payload["is_fresh"] = self.is_fresh
        payload["threshold_seconds"] = self.threshold_seconds
        payload["method"] = self.method.value
        payload["details"] = self.details
        return payload
72
+
73
+
74
class FreshnessMonitor:
    """Monitor data freshness.

    Freshness is measured either from a local file's modification time or
    from the MAX/MIN value of a timestamp column, and compared with a
    maximum acceptable age (``threshold``).

    Usage:
        from duckguard.freshness import FreshnessMonitor
        from datetime import timedelta

        # Create monitor with default 24-hour threshold
        monitor = FreshnessMonitor()

        # Check file freshness
        result = monitor.check("data.csv")
        print(f"Fresh: {result.is_fresh}, Age: {result.age_human}")

        # Check with custom threshold
        monitor = FreshnessMonitor(threshold=timedelta(hours=6))
        result = monitor.check("data.csv")

        # Check column timestamp
        from duckguard import connect
        data = connect("data.csv")
        result = monitor.check_column_timestamp(data, "updated_at")
    """

    def __init__(self, threshold: timedelta | None = None):
        """Initialize freshness monitor.

        Args:
            threshold: Maximum acceptable age for data to be considered fresh.
                Defaults to 24 hours.
        """
        # Compare with None explicitly: timedelta(0) is falsy, and a caller
        # who asks for a zero threshold must not silently get 24 hours.
        self.threshold = timedelta(hours=24) if threshold is None else threshold

    @property
    def threshold_seconds(self) -> float:
        """Get threshold in seconds."""
        return self.threshold.total_seconds()

    def check(
        self,
        source: str | Dataset,
        column: str | None = None,
    ) -> FreshnessResult:
        """Check freshness using the most appropriate method.

        Resolution order: an explicit ``column`` on a Dataset, then the
        modification time of a local file, then an auto-detected timestamp
        column on a Dataset. If none applies, an UNKNOWN result that is
        reported as fresh is returned.

        Args:
            source: Data source path or Dataset object
            column: Optional timestamp column to check. Only used when
                ``source`` is a Dataset; ignored for plain path strings.

        Returns:
            FreshnessResult with freshness information
        """
        # Import here to avoid circular imports
        from duckguard.core.dataset import Dataset

        if isinstance(source, Dataset):
            dataset = source
            source_path = dataset.source
        else:
            dataset = None
            source_path = source

        # Identity checks rather than truthiness, so that a Dataset object
        # which happens to evaluate as falsy is still treated as present.
        if column and dataset is not None:
            return self.check_column_timestamp(dataset, column)

        if self._is_local_file(source_path):
            return self.check_file_mtime(source_path)

        if dataset is not None:
            timestamp_col = self._detect_timestamp_column(dataset)
            if timestamp_col:
                return self.check_column_timestamp(dataset, timestamp_col)

        return FreshnessResult(
            source=source_path,
            last_modified=None,
            age_seconds=None,
            age_human="unknown",
            is_fresh=True,  # Default to fresh if can't determine
            threshold_seconds=self.threshold_seconds,
            method=FreshnessMethod.UNKNOWN,
            details={"reason": "Cannot determine freshness for this source type"},
        )

    def check_file_mtime(self, path: str | Path) -> FreshnessResult:
        """Check freshness via file modification time.

        Args:
            path: Path to the file

        Returns:
            FreshnessResult with file modification information. A missing or
            unreadable file yields a stale result whose ``details["error"]``
            explains why; no exception is raised for those cases.
        """
        path = Path(path)
        source_str = str(path)

        if not path.exists():
            return self._failure(
                source_str, "file not found", FreshnessMethod.FILE_MTIME, "File does not exist"
            )

        try:
            # One stat() call supplies both the mtime and the size.
            file_stat = path.stat()
            last_modified = datetime.fromtimestamp(file_stat.st_mtime)
        except OSError as e:
            return self._failure(
                source_str, "error reading file", FreshnessMethod.FILE_MTIME, str(e)
            )

        age = datetime.now() - last_modified
        age_seconds = age.total_seconds()

        return FreshnessResult(
            source=source_str,
            last_modified=last_modified,
            age_seconds=age_seconds,
            age_human=self._format_age(age),
            is_fresh=age_seconds <= self.threshold_seconds,
            threshold_seconds=self.threshold_seconds,
            method=FreshnessMethod.FILE_MTIME,
            details={
                "file_size": file_stat.st_size,
                "threshold_human": self._format_age(self.threshold),
            },
        )

    def check_column_timestamp(
        self,
        dataset: Dataset,
        column: str,
        use_max: bool = True,
    ) -> FreshnessResult:
        """Check freshness via timestamp column.

        Args:
            dataset: Dataset to check
            column: Timestamp column name
            use_max: Use MAX (most recent) if True, MIN (oldest) if False

        Returns:
            FreshnessResult with column timestamp information. Failures
            (unknown column, query error, empty column, unparseable value)
            are reported as stale results with ``details["error"]`` set.
        """
        source_str = dataset.source
        method = FreshnessMethod.COLUMN_MAX if use_max else FreshnessMethod.COLUMN_MIN
        agg_func = "MAX" if use_max else "MIN"

        if column not in dataset.columns:
            return self._failure(
                source_str,
                "column not found",
                method,
                f"Column '{column}' not found in dataset",
            )

        try:
            ref = dataset.engine.get_source_reference(dataset.source)
            # Quote the identifier so names containing spaces or reserved
            # words (e.g. "timestamp", "date") work in the query.
            # NOTE(review): assumes the engine accepts ANSI/DuckDB
            # double-quoted identifiers - confirm against the engine.
            quoted_column = '"' + column.replace('"', '""') + '"'
            sql = f"SELECT {agg_func}({quoted_column}) as ts FROM {ref}"
            result = dataset.engine.fetch_all(sql)
            timestamp_value = result[0][0] if result else None
        except Exception as e:
            # The engine's exception types are not visible from this module,
            # so any failure is reported as a query error instead of raised.
            return self._failure(source_str, "query error", method, str(e))

        if timestamp_value is None:
            return self._failure(
                source_str, "no data", method, "Column contains no timestamp values"
            )

        last_modified = self._coerce_timestamp(timestamp_value)
        if last_modified is None:
            if isinstance(timestamp_value, str):
                return self._failure(
                    source_str,
                    "invalid timestamp format",
                    method,
                    f"Cannot parse timestamp: {timestamp_value}",
                )
            return self._failure(
                source_str,
                "unsupported type",
                method,
                f"Unsupported timestamp type: {type(timestamp_value)}",
            )

        # Use a "now" of the same awareness as the value: subtracting an
        # aware datetime from a naive one raises TypeError. With tzinfo None
        # this is the plain naive local time.
        age = datetime.now(last_modified.tzinfo) - last_modified
        age_seconds = age.total_seconds()

        return FreshnessResult(
            source=source_str,
            last_modified=last_modified,
            age_seconds=age_seconds,
            age_human=self._format_age(age),
            is_fresh=age_seconds <= self.threshold_seconds,
            threshold_seconds=self.threshold_seconds,
            method=method,
            details={
                "column": column,
                "aggregation": agg_func,
                "threshold_human": self._format_age(self.threshold),
            },
        )

    def _failure(
        self,
        source: str,
        age_human: str,
        method: FreshnessMethod,
        error: str,
    ) -> FreshnessResult:
        """Build the stale result returned when freshness cannot be measured."""
        return FreshnessResult(
            source=source,
            last_modified=None,
            age_seconds=None,
            age_human=age_human,
            is_fresh=False,
            threshold_seconds=self.threshold_seconds,
            method=method,
            details={"error": error},
        )

    @staticmethod
    def _coerce_timestamp(value: Any) -> datetime | None:
        """Convert a raw column value to a datetime, or None if not possible.

        Accepts datetime objects (returned unchanged), date objects (taken as
        midnight of that day) and strings in a few common formats or any
        format understood by ``datetime.fromisoformat``.
        """
        from datetime import date

        # datetime is a subclass of date, so it has to be tested first.
        if isinstance(value, datetime):
            return value
        if isinstance(value, date):
            return datetime.combine(value, datetime.min.time())
        if not isinstance(value, str):
            return None

        known_formats = (
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M:%S.%f",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%S.%f",
            "%Y-%m-%d",
        )
        for fmt in known_formats:
            try:
                return datetime.strptime(value, fmt)
            except ValueError:
                continue

        # Fallback for ISO-8601 strings that carry a UTC offset.
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return None

    def _is_local_file(self, source: str) -> bool:
        """Check if source is a path to an existing local file."""
        scheme = urlparse(source).scheme
        # urlparse reports a Windows drive letter ("C:\\data.csv") as a
        # one-character scheme; that is a local path, not a URL.
        is_drive_letter = len(scheme) == 1 and scheme.isalpha()
        if scheme and scheme != "file" and not is_drive_letter:
            return False

        # Connection strings that urlparse did not recognise as having a scheme
        if "://" in source and not source.startswith("file://"):
            return False

        # is_file() is already False for paths that do not exist
        return Path(source).is_file()

    def _detect_timestamp_column(self, dataset: Dataset) -> str | None:
        """Try to auto-detect a timestamp column.

        Column names are matched case-insensitively against a list of common
        names, in priority order; the column's original spelling is returned.
        """
        timestamp_patterns = [
            "updated_at", "modified_at", "last_modified", "modified",
            "created_at", "timestamp", "date", "datetime", "time",
            "update_time", "modify_time", "last_update",
        ]

        columns_lower = {c.lower(): c for c in dataset.columns}

        for pattern in timestamp_patterns:
            if pattern in columns_lower:
                return columns_lower[pattern]

        return None

    def _format_age(self, age: timedelta) -> str:
        """Format a timedelta as human-readable string.

        The age is truncated to whole seconds and expressed in the largest
        unit that fits (a week is 7 days, a month is 30 days). Negative ages
        are reported as "in the future".
        """
        total_seconds = int(age.total_seconds())
        if total_seconds < 0:
            return "in the future"

        # (exclusive upper bound in seconds, seconds per unit, unit name)
        units = (
            (60, 1, "second"),
            (3600, 60, "minute"),
            (86400, 3600, "hour"),
            (604800, 86400, "day"),
            (2592000, 604800, "week"),
        )
        for upper_bound, unit_seconds, unit_name in units:
            if total_seconds < upper_bound:
                count = total_seconds // unit_seconds
                break
        else:
            count = total_seconds // 2592000
            unit_name = "month"

        return f"{count} {unit_name}{'s' if count != 1 else ''} ago"
399
+
400
+
401
def parse_age_string(age_str: str) -> timedelta:
    """Parse an age string like '24h', '7d', '1w' into timedelta.

    Surrounding whitespace and letter case are ignored. A value without a
    unit suffix is interpreted as hours.

    Args:
        age_str: Age string with unit (s, m, h, d, w)

    Returns:
        timedelta representing the age

    Raises:
        ValueError: If the string is empty or its numeric part is not an
            integer.

    Examples:
        parse_age_string("24h") -> timedelta(hours=24)
        parse_age_string("7d") -> timedelta(days=7)
        parse_age_string("1w") -> timedelta(weeks=1)
    """
    # Unit suffix -> timedelta keyword argument
    unit_keywords = {
        "s": "seconds",
        "m": "minutes",
        "h": "hours",
        "d": "days",
        "w": "weeks",
    }

    text = age_str.strip().lower()
    if not text:
        raise ValueError(
            "Age string is empty; expected a number with an optional unit (s, m, h, d, w)"
        )

    keyword = unit_keywords.get(text[-1])
    if keyword is None:
        # Assume hours if no unit
        keyword = "hours"
        number_text = text
    else:
        number_text = text[:-1]

    try:
        amount = int(number_text)
    except ValueError:
        # Same exception type as before, but with a message that names the
        # offending input instead of the bare int() parsing failure.
        raise ValueError(
            f"Invalid age string {age_str!r}: expected an integer with an "
            "optional unit (s, m, h, d, w), e.g. '24h' or '7d'"
        ) from None

    return timedelta(**{keyword: amount})
@@ -0,0 +1,44 @@
1
+ """Historical result storage and trend analysis for DuckGuard.
2
+
3
+ This module provides persistent storage for validation results,
4
+ enabling trend analysis and historical comparison.
5
+
6
+ Usage:
7
+ from duckguard.history import HistoryStorage, TrendAnalyzer
8
+
9
+ # Store validation results
10
+ storage = HistoryStorage()
11
+ storage.store(result)
12
+
13
+ # Query history
14
+ runs = storage.get_runs("data.csv", limit=10)
15
+
16
+ # Analyze trends
17
+ analyzer = TrendAnalyzer(storage)
18
+ analysis = analyzer.analyze("data.csv", days=30)
19
+ print(analysis.summary())
20
+ """
21
+
22
+ from duckguard.history.storage import (
23
+ HistoryStorage,
24
+ StoredCheckResult,
25
+ StoredRun,
26
+ TrendDataPoint,
27
+ )
28
+ from duckguard.history.trends import (
29
+ TrendAnalysis,
30
+ TrendAnalyzer,
31
+ analyze_trends,
32
+ )
33
+
34
+ __all__ = [
35
+ # Storage
36
+ "HistoryStorage",
37
+ "StoredRun",
38
+ "StoredCheckResult",
39
+ "TrendDataPoint",
40
+ # Trends
41
+ "TrendAnalyzer",
42
+ "TrendAnalysis",
43
+ "analyze_trends",
44
+ ]