duckguard-2.2.0-py3-none-any.whl → duckguard-3.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. duckguard/__init__.py +1 -1
  2. duckguard/anomaly/__init__.py +28 -0
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/methods.py +16 -2
  5. duckguard/anomaly/ml_methods.py +724 -0
  6. duckguard/checks/__init__.py +26 -0
  7. duckguard/checks/conditional.py +796 -0
  8. duckguard/checks/distributional.py +524 -0
  9. duckguard/checks/multicolumn.py +726 -0
  10. duckguard/checks/query_based.py +643 -0
  11. duckguard/cli/main.py +257 -2
  12. duckguard/connectors/factory.py +30 -2
  13. duckguard/connectors/files.py +7 -3
  14. duckguard/core/column.py +851 -1
  15. duckguard/core/dataset.py +1035 -0
  16. duckguard/core/result.py +236 -0
  17. duckguard/freshness/__init__.py +33 -0
  18. duckguard/freshness/monitor.py +429 -0
  19. duckguard/history/schema.py +119 -1
  20. duckguard/notifications/__init__.py +20 -2
  21. duckguard/notifications/email.py +508 -0
  22. duckguard/profiler/distribution_analyzer.py +384 -0
  23. duckguard/profiler/outlier_detector.py +497 -0
  24. duckguard/profiler/pattern_matcher.py +301 -0
  25. duckguard/profiler/quality_scorer.py +445 -0
  26. duckguard/reports/html_reporter.py +1 -2
  27. duckguard/rules/executor.py +642 -0
  28. duckguard/rules/generator.py +4 -1
  29. duckguard/rules/schema.py +54 -0
  30. duckguard/schema_history/__init__.py +40 -0
  31. duckguard/schema_history/analyzer.py +414 -0
  32. duckguard/schema_history/tracker.py +288 -0
  33. duckguard/semantic/detector.py +17 -1
  34. duckguard-3.0.0.dist-info/METADATA +1072 -0
  35. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
  36. duckguard-2.2.0.dist-info/METADATA +0 -351
  37. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
  38. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
  39. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/result.py CHANGED
@@ -208,3 +208,239 @@ class ScanResult:
         if self.checks_run == 0:
             return 100.0
         return (self.checks_passed / self.checks_run) * 100
+
+
+# =========================================================================
+# Distribution Drift Results
+# =========================================================================
+
+
+@dataclass
+class DriftResult:
+    """Result of distribution drift detection between two columns.
+
+    Attributes:
+        is_drifted: Whether significant drift was detected
+        p_value: Statistical p-value from the test
+        statistic: Test statistic value
+        threshold: P-value threshold used for detection
+        method: Statistical method used (e.g., "ks_test")
+        message: Human-readable summary
+        details: Additional metadata
+    """
+
+    is_drifted: bool
+    p_value: float
+    statistic: float
+    threshold: float = 0.05
+    method: str = "ks_test"
+    message: str = ""
+    details: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Return True if NO drift was detected (data is stable)."""
+        return not self.is_drifted
+
+    def __repr__(self) -> str:
+        status = "DRIFT DETECTED" if self.is_drifted else "STABLE"
+        return f"DriftResult({status}, p_value={self.p_value:.4f}, threshold={self.threshold})"
+
+    def summary(self) -> str:
+        """Get a human-readable summary."""
+        status = "DRIFT DETECTED" if self.is_drifted else "No significant drift"
+        return f"{status} (p-value: {self.p_value:.4f}, threshold: {self.threshold}, method: {self.method})"
+
+
+# =========================================================================
+# Reconciliation Results
+# =========================================================================
+
+
+@dataclass
+class ReconciliationMismatch:
+    """Represents a single row mismatch found during reconciliation.
+
+    Attributes:
+        key_values: Dictionary of key column values that identify the row
+        column: Column name where the mismatch occurred
+        source_value: Value in the source dataset
+        target_value: Value in the target dataset
+        mismatch_type: Type of mismatch ("value_diff", "missing_in_target", "extra_in_target")
+    """
+
+    key_values: dict[str, Any]
+    column: str
+    source_value: Any = None
+    target_value: Any = None
+    mismatch_type: str = "value_diff"
+
+    def __repr__(self) -> str:
+        keys = ", ".join(f"{k}={v}" for k, v in self.key_values.items())
+        return f"ReconciliationMismatch({keys}, {self.column}: {self.source_value} vs {self.target_value})"
+
+
+@dataclass
+class ReconciliationResult:
+    """Result of reconciling two datasets.
+
+    Attributes:
+        passed: Whether reconciliation passed (the datasets match)
+        source_row_count: Number of rows in the source dataset
+        target_row_count: Number of rows in the target dataset
+        missing_in_target: Count of rows in source but not in target
+        extra_in_target: Count of rows in target but not in source
+        value_mismatches: Count of value mismatches by column
+        match_percentage: Percentage of rows that match
+        key_columns: Columns used as keys for matching
+        compared_columns: Columns compared for values
+        mismatches: Sample of actual mismatches
+        details: Additional metadata
+    """
+
+    passed: bool
+    source_row_count: int
+    target_row_count: int
+    missing_in_target: int = 0
+    extra_in_target: int = 0
+    value_mismatches: dict[str, int] = field(default_factory=dict)
+    match_percentage: float = 100.0
+    key_columns: list[str] = field(default_factory=list)
+    compared_columns: list[str] = field(default_factory=list)
+    mismatches: list[ReconciliationMismatch] = field(default_factory=list)
+    details: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Allow using ReconciliationResult in a boolean context."""
+        return self.passed
+
+    def __repr__(self) -> str:
+        status = "MATCHED" if self.passed else "MISMATCHED"
+        return f"ReconciliationResult({status}, match={self.match_percentage:.1f}%, missing={self.missing_in_target}, extra={self.extra_in_target})"
+
+    @property
+    def total_mismatches(self) -> int:
+        """Total number of mismatches across all columns."""
+        return self.missing_in_target + self.extra_in_target + sum(self.value_mismatches.values())
+
+    def summary(self) -> str:
+        """Get a human-readable summary."""
+        lines = [
+            f"Reconciliation: {'PASSED' if self.passed else 'FAILED'} ({self.match_percentage:.1f}% match)",
+            f"Source rows: {self.source_row_count}, Target rows: {self.target_row_count}",
+        ]
+
+        if self.missing_in_target > 0:
+            lines.append(f"Missing in target: {self.missing_in_target} rows")
+        if self.extra_in_target > 0:
+            lines.append(f"Extra in target: {self.extra_in_target} rows")
+        if self.value_mismatches:
+            lines.append("Column mismatches:")
+            for col, count in self.value_mismatches.items():
+                lines.append(f"  {col}: {count} differences")
+
+        if self.mismatches:
+            lines.append(f"\nSample mismatches ({min(len(self.mismatches), 5)} shown):")
+            for m in self.mismatches[:5]:
+                keys = ", ".join(f"{k}={v}" for k, v in m.key_values.items())
+                lines.append(f"  [{keys}] {m.column}: {m.source_value!r} vs {m.target_value!r}")
+
+        return "\n".join(lines)
+
+
+# =========================================================================
+# Group By Results
+# =========================================================================
+
+
+@dataclass
+class GroupResult:
+    """Validation result for a single group.
+
+    Attributes:
+        group_key: Dictionary of group column values
+        row_count: Number of rows in this group
+        passed: Whether all checks passed for this group
+        check_results: List of individual check results
+        stats: Group-level statistics
+    """
+
+    group_key: dict[str, Any]
+    row_count: int
+    passed: bool = True
+    check_results: list[ValidationResult] = field(default_factory=list)
+    stats: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Allow using GroupResult in a boolean context."""
+        return self.passed
+
+    def __repr__(self) -> str:
+        keys = ", ".join(f"{k}={v}" for k, v in self.group_key.items())
+        status = "PASSED" if self.passed else "FAILED"
+        return f"GroupResult({keys}, rows={self.row_count}, {status})"
+
+    @property
+    def key_string(self) -> str:
+        """Get a string representation of the group key."""
+        return ", ".join(f"{k}={v}" for k, v in self.group_key.items())
+
+
+@dataclass
+class GroupByResult:
+    """Result of group-by validation across all groups.
+
+    Attributes:
+        passed: Whether all groups passed validation
+        total_groups: Total number of groups
+        passed_groups: Number of groups that passed
+        failed_groups: Number of groups that failed
+        group_results: Individual results per group
+        group_columns: Columns used for grouping
+        details: Additional metadata
+    """
+
+    passed: bool
+    total_groups: int
+    passed_groups: int = 0
+    failed_groups: int = 0
+    group_results: list[GroupResult] = field(default_factory=list)
+    group_columns: list[str] = field(default_factory=list)
+    details: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Allow using GroupByResult in a boolean context."""
+        return self.passed
+
+    def __repr__(self) -> str:
+        status = "PASSED" if self.passed else "FAILED"
+        return f"GroupByResult({status}, groups={self.total_groups}, passed={self.passed_groups}, failed={self.failed_groups})"
+
+    @property
+    def pass_rate(self) -> float:
+        """Calculate the pass rate as a percentage."""
+        if self.total_groups == 0:
+            return 100.0
+        return (self.passed_groups / self.total_groups) * 100
+
+    def get_failed_groups(self) -> list[GroupResult]:
+        """Get the list of groups that failed validation."""
+        return [g for g in self.group_results if not g.passed]
+
+    def summary(self) -> str:
+        """Get a human-readable summary."""
+        lines = [
+            f"Group By Validation: {'PASSED' if self.passed else 'FAILED'}",
+            f"Groups: {self.total_groups} total, {self.passed_groups} passed, {self.failed_groups} failed ({self.pass_rate:.1f}%)",
+            f"Grouped by: {', '.join(self.group_columns)}",
+        ]

+        failed = self.get_failed_groups()
+        if failed:
+            lines.append(f"\nFailed groups ({len(failed)}):")
+            for g in failed[:5]:
+                lines.append(f"  [{g.key_string}]: {g.row_count} rows")
+                for cr in g.check_results:
+                    if not cr.passed:
+                        lines.append(f"    - {cr.message}")
+
+        return "\n".join(lines)
duckguard/freshness/__init__.py ADDED
@@ -0,0 +1,33 @@
+"""Freshness monitoring for DuckGuard.
+
+This module provides functionality to check data freshness by monitoring
+file modification times and timestamp columns.
+
+Usage:
+    from duckguard.freshness import FreshnessMonitor, FreshnessResult
+    from datetime import timedelta
+
+    # Check file freshness
+    monitor = FreshnessMonitor(threshold=timedelta(hours=24))
+    result = monitor.check("data.csv")
+
+    if not result.is_fresh:
+        print(f"Data is stale! Last updated: {result.age_human}")
+
+    # Check column freshness
+    from duckguard import connect
+    data = connect("data.csv")
+    result = monitor.check_column_timestamp(data, "updated_at")
+"""
+
+from duckguard.freshness.monitor import (
+    FreshnessMethod,
+    FreshnessMonitor,
+    FreshnessResult,
+)
+
+__all__ = [
+    "FreshnessMonitor",
+    "FreshnessResult",
+    "FreshnessMethod",
+]
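For CI or reporting pipelines, `FreshnessResult.to_dict()` (defined in `monitor.py` below) yields a JSON-safe payload with ISO-formatted timestamps. A minimal sketch, assuming a local `data.csv` exists; the path and threshold are illustrative:

```python
import json
from datetime import timedelta

from duckguard.freshness import FreshnessMonitor

# Gate a pipeline step on freshness and emit a machine-readable record
monitor = FreshnessMonitor(threshold=timedelta(hours=6))
result = monitor.check("data.csv")  # falls back to file mtime for a local path

print(json.dumps(result.to_dict(), indent=2))
if not result.is_fresh:
    # __str__ renders e.g. "[STALE] data.csv: 2 days ago (method: file_mtime)"
    raise SystemExit(str(result))
```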
duckguard/freshness/monitor.py ADDED
@@ -0,0 +1,429 @@
+"""Freshness monitoring implementation.
+
+Provides functionality to check data freshness via file modification times
+and timestamp columns in the data.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from urllib.parse import urlparse
+
+if TYPE_CHECKING:
+    from duckguard.core.dataset import Dataset
+
+
+class FreshnessMethod(str, Enum):
+    """Methods for checking freshness."""
+
+    FILE_MTIME = "file_mtime"
+    COLUMN_MAX = "column_max"
+    COLUMN_MIN = "column_min"
+    METADATA = "metadata"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class FreshnessResult:
+    """Result of a freshness check.
+
+    Attributes:
+        source: Data source path
+        last_modified: Timestamp of last modification
+        age_seconds: Age in seconds (None if unknown)
+        age_human: Human-readable age string
+        is_fresh: Whether the data meets the freshness threshold
+        threshold_seconds: Threshold used (None if no threshold)
+        method: Method used to determine freshness
+        details: Additional details about the check
+    """
+
+    source: str
+    last_modified: datetime | None
+    age_seconds: float | None
+    age_human: str
+    is_fresh: bool
+    threshold_seconds: float | None
+    method: FreshnessMethod
+    details: dict[str, Any] | None = None
+
+    def __str__(self) -> str:
+        """Human-readable string representation."""
+        status = "FRESH" if self.is_fresh else "STALE"
+        return f"[{status}] {self.source}: {self.age_human} (method: {self.method.value})"
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to a dictionary."""
+        return {
+            "source": self.source,
+            "last_modified": self.last_modified.isoformat() if self.last_modified else None,
+            "age_seconds": self.age_seconds,
+            "age_human": self.age_human,
+            "is_fresh": self.is_fresh,
+            "threshold_seconds": self.threshold_seconds,
+            "method": self.method.value,
+            "details": self.details,
+        }
+
+
+class FreshnessMonitor:
+    """Monitor data freshness.
+
+    Usage:
+        from duckguard.freshness import FreshnessMonitor
+        from datetime import timedelta
+
+        # Create a monitor with the default 24-hour threshold
+        monitor = FreshnessMonitor()
+
+        # Check file freshness
+        result = monitor.check("data.csv")
+        print(f"Fresh: {result.is_fresh}, Age: {result.age_human}")
+
+        # Check with a custom threshold
+        monitor = FreshnessMonitor(threshold=timedelta(hours=6))
+        result = monitor.check("data.csv")
+
+        # Check a column timestamp
+        from duckguard import connect
+        data = connect("data.csv")
+        result = monitor.check_column_timestamp(data, "updated_at")
+    """
+
+    def __init__(self, threshold: timedelta | None = None):
+        """Initialize the freshness monitor.
+
+        Args:
+            threshold: Maximum acceptable age for data to be considered fresh.
+                Defaults to 24 hours.
+        """
+        self.threshold = threshold or timedelta(hours=24)
+
+    @property
+    def threshold_seconds(self) -> float:
+        """Get the threshold in seconds."""
+        return self.threshold.total_seconds()
+
+    def check(
+        self,
+        source: str | Dataset,
+        column: str | None = None,
+    ) -> FreshnessResult:
+        """Check freshness using the most appropriate method.
+
+        Args:
+            source: Data source path or Dataset object
+            column: Optional timestamp column to check
+
+        Returns:
+            FreshnessResult with freshness information
+        """
+        # Import here to avoid circular imports
+        from duckguard.core.dataset import Dataset
+
+        if isinstance(source, Dataset):
+            dataset = source
+            source_path = dataset.source
+        else:
+            source_path = source
+            dataset = None
+
+        # If a column is specified, use the column method
+        if column and dataset:
+            return self.check_column_timestamp(dataset, column)
+
+        # Otherwise, determine the best method
+        if self._is_local_file(source_path):
+            return self.check_file_mtime(source_path)
+        elif dataset:
+            # Try to auto-detect a timestamp column
+            timestamp_col = self._detect_timestamp_column(dataset)
+            if timestamp_col:
+                return self.check_column_timestamp(dataset, timestamp_col)
+
+        # Fall back to an "unknown" result
+        return FreshnessResult(
+            source=source_path,
+            last_modified=None,
+            age_seconds=None,
+            age_human="unknown",
+            is_fresh=True,  # Default to fresh if freshness cannot be determined
+            threshold_seconds=self.threshold_seconds,
+            method=FreshnessMethod.UNKNOWN,
+            details={"reason": "Cannot determine freshness for this source type"},
+        )
+
+    def check_file_mtime(self, path: str | Path) -> FreshnessResult:
+        """Check freshness via file modification time.
+
+        Args:
+            path: Path to the file
+
+        Returns:
+            FreshnessResult with file modification information
+        """
+        path = Path(path)
+        source_str = str(path)
+
+        if not path.exists():
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="file not found",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=FreshnessMethod.FILE_MTIME,
+                details={"error": "File does not exist"},
+            )
+
+        try:
+            mtime = os.path.getmtime(path)
+            last_modified = datetime.fromtimestamp(mtime)
+            now = datetime.now()
+            age = now - last_modified
+            age_seconds = age.total_seconds()
+
+            is_fresh = age_seconds <= self.threshold_seconds
+
+            return FreshnessResult(
+                source=source_str,
+                last_modified=last_modified,
+                age_seconds=age_seconds,
+                age_human=self._format_age(age),
+                is_fresh=is_fresh,
+                threshold_seconds=self.threshold_seconds,
+                method=FreshnessMethod.FILE_MTIME,
+                details={
+                    "file_size": path.stat().st_size,
+                    "threshold_human": self._format_age(self.threshold),
+                },
+            )
+        except OSError as e:
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="error reading file",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=FreshnessMethod.FILE_MTIME,
+                details={"error": str(e)},
+            )
+
+    def check_column_timestamp(
+        self,
+        dataset: Dataset,
+        column: str,
+        use_max: bool = True,
+    ) -> FreshnessResult:
+        """Check freshness via a timestamp column.
+
+        Args:
+            dataset: Dataset to check
+            column: Timestamp column name
+            use_max: Use MAX (most recent) if True, MIN (oldest) if False
+
+        Returns:
+            FreshnessResult with column timestamp information
+        """
+        source_str = dataset.source
+        method = FreshnessMethod.COLUMN_MAX if use_max else FreshnessMethod.COLUMN_MIN
+
+        # Verify the column exists
+        if column not in dataset.columns:
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="column not found",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=method,
+                details={"error": f"Column '{column}' not found in dataset"},
+            )
+
+        try:
+            # Get the max/min timestamp from the column
+            ref = dataset.engine.get_source_reference(dataset.source)
+            agg_func = "MAX" if use_max else "MIN"
+            sql = f"SELECT {agg_func}({column}) as ts FROM {ref}"
+            result = dataset.engine.fetch_all(sql)
+
+            if not result or result[0][0] is None:
+                return FreshnessResult(
+                    source=source_str,
+                    last_modified=None,
+                    age_seconds=None,
+                    age_human="no data",
+                    is_fresh=False,
+                    threshold_seconds=self.threshold_seconds,
+                    method=method,
+                    details={"error": "Column contains no timestamp values"},
+                )
+
+            timestamp_value = result[0][0]
+
+            # Parse the timestamp
+            if isinstance(timestamp_value, datetime):
+                last_modified = timestamp_value
+            elif isinstance(timestamp_value, str):
+                # Try common formats
+                for fmt in [
+                    "%Y-%m-%d %H:%M:%S",
+                    "%Y-%m-%d %H:%M:%S.%f",
+                    "%Y-%m-%dT%H:%M:%S",
+                    "%Y-%m-%dT%H:%M:%S.%f",
+                    "%Y-%m-%d",
+                ]:
+                    try:
+                        last_modified = datetime.strptime(timestamp_value, fmt)
+                        break
+                    except ValueError:
+                        continue
+                else:
+                    return FreshnessResult(
+                        source=source_str,
+                        last_modified=None,
+                        age_seconds=None,
+                        age_human="invalid timestamp format",
+                        is_fresh=False,
+                        threshold_seconds=self.threshold_seconds,
+                        method=method,
+                        details={"error": f"Cannot parse timestamp: {timestamp_value}"},
+                    )
+            else:
+                return FreshnessResult(
+                    source=source_str,
+                    last_modified=None,
+                    age_seconds=None,
+                    age_human="unsupported type",
+                    is_fresh=False,
+                    threshold_seconds=self.threshold_seconds,
+                    method=method,
+                    details={"error": f"Unsupported timestamp type: {type(timestamp_value)}"},
+                )
+
+            now = datetime.now()
+            age = now - last_modified
+            age_seconds = age.total_seconds()
+
+            is_fresh = age_seconds <= self.threshold_seconds
+
+            return FreshnessResult(
+                source=source_str,
+                last_modified=last_modified,
+                age_seconds=age_seconds,
+                age_human=self._format_age(age),
+                is_fresh=is_fresh,
+                threshold_seconds=self.threshold_seconds,
+                method=method,
+                details={
+                    "column": column,
+                    "aggregation": agg_func,
+                    "threshold_human": self._format_age(self.threshold),
+                },
+            )
+
+        except Exception as e:
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="query error",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=method,
+                details={"error": str(e)},
+            )
+
+    def _is_local_file(self, source: str) -> bool:
+        """Check whether the source is a local file path."""
+        # Reject URL schemes other than file://
+        parsed = urlparse(source)
+        if parsed.scheme and parsed.scheme not in ("", "file"):
+            return False
+
+        # Reject connection strings
+        if "://" in source and not source.startswith("file://"):
+            return False
+
+        # Check that the path exists
+        path = Path(source)
+        return path.exists() and path.is_file()
+
+    def _detect_timestamp_column(self, dataset: Dataset) -> str | None:
+        """Try to auto-detect a timestamp column."""
+        timestamp_patterns = [
+            "updated_at", "modified_at", "last_modified", "modified",
+            "created_at", "timestamp", "date", "datetime", "time",
+            "update_time", "modify_time", "last_update",
+        ]
+
+        columns_lower = {c.lower(): c for c in dataset.columns}
+
+        for pattern in timestamp_patterns:
+            if pattern in columns_lower:
+                return columns_lower[pattern]
+
+        return None
+
+    def _format_age(self, age: timedelta) -> str:
+        """Format a timedelta as a human-readable string."""
+        total_seconds = int(age.total_seconds())
+
+        if total_seconds < 0:
+            return "in the future"
+        elif total_seconds < 60:
+            return f"{total_seconds} seconds ago"
+        elif total_seconds < 3600:
+            minutes = total_seconds // 60
+            return f"{minutes} minute{'s' if minutes != 1 else ''} ago"
+        elif total_seconds < 86400:
+            hours = total_seconds // 3600
+            return f"{hours} hour{'s' if hours != 1 else ''} ago"
+        elif total_seconds < 604800:
+            days = total_seconds // 86400
+            return f"{days} day{'s' if days != 1 else ''} ago"
+        elif total_seconds < 2592000:
+            weeks = total_seconds // 604800
+            return f"{weeks} week{'s' if weeks != 1 else ''} ago"
+        else:
+            months = total_seconds // 2592000
+            return f"{months} month{'s' if months != 1 else ''} ago"
+
+
+def parse_age_string(age_str: str) -> timedelta:
+    """Parse an age string like '24h', '7d', or '1w' into a timedelta.
+
+    Args:
+        age_str: Age string with a unit suffix (s, m, h, d, w)
+
+    Returns:
+        timedelta representing the age
+
+    Examples:
+        parse_age_string("24h") -> timedelta(hours=24)
+        parse_age_string("7d") -> timedelta(days=7)
+        parse_age_string("1w") -> timedelta(weeks=1)
+    """
+    age_str = age_str.strip().lower()
+
+    if age_str.endswith("s"):
+        return timedelta(seconds=int(age_str[:-1]))
+    elif age_str.endswith("m"):
+        return timedelta(minutes=int(age_str[:-1]))
+    elif age_str.endswith("h"):
+        return timedelta(hours=int(age_str[:-1]))
+    elif age_str.endswith("d"):
+        return timedelta(days=int(age_str[:-1]))
+    elif age_str.endswith("w"):
+        return timedelta(weeks=int(age_str[:-1]))
+    else:
+        # Assume hours if no unit is given
+        return timedelta(hours=int(age_str))
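`parse_age_string` pairs naturally with `FreshnessMonitor` when the threshold comes from a CLI flag or config file rather than a hand-built `timedelta`. Note it lives in `duckguard.freshness.monitor` and is not re-exported from the package `__init__`. A sketch of the intended pairing — the source file and column name are illustrative, and `connect()` is assumed from the usage shown in the docstrings above:

```python
from duckguard import connect  # top-level connect() as shown in the docstrings
from duckguard.freshness.monitor import FreshnessMonitor, parse_age_string

# "7d" -> timedelta(days=7); a bare number such as "12" is read as hours
monitor = FreshnessMonitor(threshold=parse_age_string("7d"))

# With a Dataset and an explicit column, check() compares MAX(updated_at)
# against the 7-day threshold via check_column_timestamp()
data = connect("events.parquet")
result = monitor.check(data, column="updated_at")
print(result)  # e.g. "[FRESH] events.parquet: 2 days ago (method: column_max)"
```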