duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/errors.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
"""Enhanced error classes for DuckGuard with helpful suggestions.
|
|
2
|
+
|
|
3
|
+
Provides user-friendly error messages with:
|
|
4
|
+
- Clear descriptions of what went wrong
|
|
5
|
+
- Suggestions for how to fix the issue
|
|
6
|
+
- Links to relevant documentation
|
|
7
|
+
- Context about the data being validated
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
# Documentation base URL
|
|
15
|
+
DOCS_BASE_URL = "https://github.com/XDataHubAI/duckguard"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DuckGuardError(Exception):
    """Base exception for all DuckGuard errors.

    Attributes:
        message: Human-readable error description
        suggestion: Helpful suggestion for fixing the issue
        docs_url: Link to relevant documentation
        context: Additional context about the error
    """

    def __init__(
        self,
        message: str,
        suggestion: str | None = None,
        docs_url: str | None = None,
        context: dict[str, Any] | None = None,
    ):
        self.message = message
        self.suggestion = suggestion
        self.docs_url = docs_url
        # Normalize a missing/empty context to a fresh dict.
        self.context = context or {}
        # Exception.args carries the fully formatted message so str(exc)
        # shows suggestion/docs/context alongside the core description.
        super().__init__(self._format_message())

    def _format_message(self) -> str:
        """Assemble message, suggestion, docs link and context into one string."""
        sections = [self.message]

        if self.suggestion:
            sections.append(f"\n\nSuggestion: {self.suggestion}")
        if self.docs_url:
            sections.append(f"\n\nDocs: {self.docs_url}")
        if self.context:
            rendered = "\n".join(f"  {key}: {val}" for key, val in self.context.items())
            sections.append(f"\n\nContext:\n{rendered}")

        return "".join(sections)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class ConnectionError(DuckGuardError):
    """Error connecting to a data source.

    NOTE(review): intentionally shadows the builtin ``ConnectionError`` within
    this module; importers of duckguard.errors get this variant.

    Attributes:
        source: The path or connection string that failed.
        original_error: The underlying exception, if one was raised.
    """

    def __init__(
        self,
        source: str,
        original_error: Exception | None = None,
        **context: Any,
    ):
        super().__init__(
            message=f"Failed to connect to data source: {source}",
            suggestion=self._get_suggestion(source, original_error),
            docs_url=f"{DOCS_BASE_URL}#connectors",
            context={"source": source, **context},
        )
        self.source = source
        self.original_error = original_error

    def _get_suggestion(self, source: str, error: Exception | None) -> str:
        """Return source-type-specific troubleshooting hints as a bulleted string."""
        suggestions = []

        if source.endswith(".csv"):
            suggestions.append("Verify the CSV file exists and is readable")
            suggestions.append("Check file permissions")
        elif source.endswith(".parquet"):
            suggestions.append("Verify the Parquet file exists and is not corrupted")
            suggestions.append("Try: pip install pyarrow")
        elif "postgres" in source:
            # "postgres" is a substring of "postgresql", so this single test covers
            # both scheme spellings; the previous `or "postgresql" in source` was dead code.
            suggestions.append("Verify PostgreSQL connection string format: postgresql://user:pass@host:port/db")
            suggestions.append("Check if the database server is running")
        elif "mysql" in source:
            suggestions.append("Verify MySQL connection string format: mysql://user:pass@host:port/db")
        elif "s3://" in source:
            suggestions.append("Verify AWS credentials are configured")
            suggestions.append("Check S3 bucket permissions")
        else:
            suggestions.append("Verify the data source path or connection string")

        if error:
            suggestions.append(f"Original error: {error}")

        # Leading "" makes every item get the "\n - " separator; strip() then
        # removes the leading newline (first bullet ends up unindented — kept as-is).
        return "\n - ".join([""] + suggestions).strip()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class FileNotFoundError(DuckGuardError):
    """File not found error with helpful context.

    NOTE(review): intentionally shadows the builtin ``FileNotFoundError``
    within this module; it is not a subclass of the builtin one.
    """

    def __init__(self, path: str, **context: Any):
        # Local import keeps the module import surface unchanged.
        import os

        current_dir = os.getcwd()
        super().__init__(
            message=f"File not found: {path}",
            suggestion=f"Check if the file exists. Current directory: {current_dir}",
            docs_url=f"{DOCS_BASE_URL}#file-connectors",
            context={"path": path, "cwd": current_dir, **context},
        )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class ColumnNotFoundError(DuckGuardError):
    """Column not found in dataset."""

    def __init__(self, column: str, available_columns: list[str], **context: Any):
        # Offer a likely intended column name when one looks close.
        similar = self._find_similar(column, available_columns)

        shown = ", ".join(available_columns[:10])
        suggestion = f"Available columns: {shown}"
        if len(available_columns) > 10:
            suggestion += f" (and {len(available_columns) - 10} more)"
        if similar:
            suggestion = f"Did you mean: {similar}?\n\n{suggestion}"

        super().__init__(
            message=f"Column '{column}' not found in dataset",
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#working-with-columns",
            context={"column": column, "similar": similar, **context},
        )

    def _find_similar(self, target: str, candidates: list[str]) -> str | None:
        """Return the first candidate resembling *target*, or None.

        Tiers are tried in order over all candidates: exact case-insensitive
        match, then prefix match (either direction), then substring match.
        """
        needle = target.lower()
        tiers = (
            lambda cand: cand == needle,
            lambda cand: cand.startswith(needle) or needle.startswith(cand),
            lambda cand: needle in cand or cand in needle,
        )

        for matches in tiers:
            for candidate in candidates:
                if matches(candidate.lower()):
                    return candidate

        return None
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class ValidationError(DuckGuardError):
    """Validation check failed with detailed information."""

    def __init__(
        self,
        check_name: str,
        column: str | None = None,
        actual_value: Any = None,
        expected_value: Any = None,
        failed_rows: list | None = None,
        **context: Any,
    ):
        where = f" for column '{column}'" if column else ""

        # Build the suggestion text from whichever details were supplied.
        detail_lines: list[str] = []
        if actual_value is not None and expected_value is not None:
            detail_lines.append(f"Expected: {expected_value}, Got: {actual_value}")
        if failed_rows:
            detail_lines.append(f"Sample failing values: {failed_rows[:3]}")
            if len(failed_rows) > 3:
                detail_lines.append(f"({len(failed_rows)} total failures)")

        super().__init__(
            message=f"Validation check '{check_name}' failed{where}",
            suggestion="\n".join(detail_lines) if detail_lines else None,
            docs_url=f"{DOCS_BASE_URL}#validation-methods",
            context={
                "check_name": check_name,
                "column": column,
                "actual_value": actual_value,
                "expected_value": expected_value,
                **context,
            },
        )
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class RuleParseError(DuckGuardError):
    """Error parsing validation rules."""

    def __init__(
        self,
        message: str,
        file_path: str | None = None,
        line_number: int | None = None,
        **context: Any,
    ):
        # Optional "<in file> <at line>" location fragment for the message.
        where = ""
        if file_path:
            where = f" in {file_path}"
        if line_number:
            where += f" at line {line_number}"

        example = """
columns:
  order_id:
    checks:
      - type: not_null
      - type: unique
  amount:
    checks:
      - type: between
        value: [0, 10000]
"""
        suggestion = (
            "Check your YAML syntax and rule format.\n"
            "Example valid rule:\n" + example
        )

        super().__init__(
            message=f"Failed to parse rules{where}: {message}",
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#yaml-rules",
            context={"file_path": file_path, "line_number": line_number, **context},
        )
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class ContractViolationError(DuckGuardError):
    """Data contract was violated."""

    def __init__(
        self,
        violations: list[str],
        contract_path: str | None = None,
        **context: Any,
    ):
        headline = f"Data contract violated with {len(violations)} issue(s)"
        if contract_path:
            headline += f" (contract: {contract_path})"

        # Show at most five violations inline; summarize the remainder.
        shown = violations[:5]
        suggestion = "Violations:\n - " + "\n - ".join(shown)
        remaining = len(violations) - 5
        if remaining > 0:
            suggestion += f"\n ... and {remaining} more"
        suggestion += "\n\nConsider updating the contract or fixing the data issues."

        super().__init__(
            message=headline,
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#data-contracts",
            context={"violations": violations, "contract_path": contract_path, **context},
        )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class UnsupportedConnectorError(DuckGuardError):
    """No connector available for the data source."""

    def __init__(self, source: str, **context: Any):
        known_formats = (
            "CSV (.csv)",
            "Parquet (.parquet, .pq)",
            "JSON (.json, .jsonl, .ndjson)",
            "PostgreSQL (postgres://, postgresql://)",
            "MySQL (mysql://)",
            "SQLite (sqlite://)",
            "S3 (s3://)",
            "Snowflake (snowflake://)",
            "BigQuery (bigquery://)",
        )

        super().__init__(
            message=f"No connector found for: {source}",
            suggestion="Supported formats:\n - " + "\n - ".join(known_formats),
            docs_url=f"{DOCS_BASE_URL}#supported-connectors",
            context={"source": source, **context},
        )
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# Error formatting utilities
|
|
294
|
+
|
|
295
|
+
def format_validation_failure(
    check_name: str,
    column: str | None,
    actual: Any,
    expected: Any,
    failed_rows: list | None = None,
) -> str:
    """Format a validation failure message with context.

    Args:
        check_name: Name of the failed check
        column: Column name (if column-level)
        actual: Actual value found
        expected: Expected value
        failed_rows: Sample of failing rows

    Returns:
        Formatted error message
    """
    target = f" for column '{column}'" if column else ""
    lines = [
        f"Check '{check_name}' failed{target}",
        f"  Expected: {expected}",
        f"  Actual: {actual}",
    ]

    if failed_rows:
        lines.append("")
        lines.append("  Sample failing rows:")
        for row in failed_rows[:5]:
            # Rows may be structured objects (with .row_index/.value) or raw values.
            if hasattr(row, "value"):
                lines.append(f"    Row {row.row_index}: {row.value}")
            else:
                lines.append(f"    {row}")
        overflow = len(failed_rows) - 5
        if overflow > 0:
            lines.append(f"    ... and {overflow} more")

    return "\n".join(lines)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def format_multiple_failures(failures: list) -> str:
    """Format multiple validation failures into a summary.

    Args:
        failures: List of failure objects

    Returns:
        Formatted summary string
    """
    if not failures:
        return "All checks passed!"

    lines = [f"{len(failures)} validation check(s) failed:", ""]

    for idx, failure in enumerate(failures[:10], start=1):
        # Failures may be rich objects (with .column/.message) or plain values.
        has_column = hasattr(failure, "column") and failure.column
        label = f"[{failure.column}]" if has_column else "[table]"
        text = failure.message if hasattr(failure, "message") else str(failure)
        lines.append(f"  {idx}. {label} {text}")

    overflow = len(failures) - 10
    if overflow > 0:
        lines.append(f"  ... and {overflow} more failures")

    return "\n".join(lines)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Freshness monitoring for DuckGuard.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to check data freshness by monitoring
|
|
4
|
+
file modification times and timestamp columns.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from duckguard.freshness import FreshnessMonitor, FreshnessResult
|
|
8
|
+
from datetime import timedelta
|
|
9
|
+
|
|
10
|
+
# Check file freshness
|
|
11
|
+
monitor = FreshnessMonitor(threshold=timedelta(hours=24))
|
|
12
|
+
result = monitor.check("data.csv")
|
|
13
|
+
|
|
14
|
+
if not result.is_fresh:
|
|
15
|
+
print(f"Data is stale! Last updated: {result.age_human}")
|
|
16
|
+
|
|
17
|
+
# Check column freshness
|
|
18
|
+
from duckguard import connect
|
|
19
|
+
data = connect("data.csv")
|
|
20
|
+
result = monitor.check_column_timestamp(data, "updated_at")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from duckguard.freshness.monitor import (
|
|
24
|
+
FreshnessMethod,
|
|
25
|
+
FreshnessMonitor,
|
|
26
|
+
FreshnessResult,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"FreshnessMonitor",
|
|
31
|
+
"FreshnessResult",
|
|
32
|
+
"FreshnessMethod",
|
|
33
|
+
]
|