duckguard 2.0.0-py3-none-any.whl → 2.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/errors.py ADDED
@@ -0,0 +1,362 @@
+ """Enhanced error classes for DuckGuard with helpful suggestions.
+
+ Provides user-friendly error messages with:
+ - Clear descriptions of what went wrong
+ - Suggestions for how to fix the issue
+ - Links to relevant documentation
+ - Context about the data being validated
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ # Documentation base URL
+ DOCS_BASE_URL = "https://github.com/XDataHubAI/duckguard"
+
+
+ class DuckGuardError(Exception):
+     """Base exception for all DuckGuard errors.
+
+     Attributes:
+         message: Human-readable error description
+         suggestion: Helpful suggestion for fixing the issue
+         docs_url: Link to relevant documentation
+         context: Additional context about the error
+     """
+
+     def __init__(
+         self,
+         message: str,
+         suggestion: str | None = None,
+         docs_url: str | None = None,
+         context: dict[str, Any] | None = None,
+     ):
+         self.message = message
+         self.suggestion = suggestion
+         self.docs_url = docs_url
+         self.context = context or {}
+         super().__init__(self._format_message())
+
+     def _format_message(self) -> str:
+         """Format the full error message with suggestions."""
+         parts = [self.message]
+
+         if self.suggestion:
+             parts.append(f"\n\nSuggestion: {self.suggestion}")
+
+         if self.docs_url:
+             parts.append(f"\n\nDocs: {self.docs_url}")
+
+         if self.context:
+             context_str = "\n".join(f" {k}: {v}" for k, v in self.context.items())
+             parts.append(f"\n\nContext:\n{context_str}")
+
+         return "".join(parts)
+
+
+ class ConnectionError(DuckGuardError):
+     """Error connecting to a data source."""
+
+     def __init__(
+         self,
+         source: str,
+         original_error: Exception | None = None,
+         **context: Any,
+     ):
+         super().__init__(
+             message=f"Failed to connect to data source: {source}",
+             suggestion=self._get_suggestion(source, original_error),
+             docs_url=f"{DOCS_BASE_URL}#connectors",
+             context={"source": source, **context},
+         )
+         self.source = source
+         self.original_error = original_error
+
+     def _get_suggestion(self, source: str, error: Exception | None) -> str:
+         """Get a helpful suggestion based on the source type."""
+         suggestions = []
+
+         if source.endswith(".csv"):
+             suggestions.append("Verify the CSV file exists and is readable")
+             suggestions.append("Check file permissions")
+         elif source.endswith(".parquet"):
+             suggestions.append("Verify the Parquet file exists and is not corrupted")
+             suggestions.append("Try: pip install pyarrow")
+         elif "postgres" in source or "postgresql" in source:
+             suggestions.append("Verify PostgreSQL connection string format: postgresql://user:pass@host:port/db")
+             suggestions.append("Check if the database server is running")
+         elif "mysql" in source:
+             suggestions.append("Verify MySQL connection string format: mysql://user:pass@host:port/db")
+         elif "s3://" in source:
+             suggestions.append("Verify AWS credentials are configured")
+             suggestions.append("Check S3 bucket permissions")
+         else:
+             suggestions.append("Verify the data source path or connection string")
+
+         if error:
+             suggestions.append(f"Original error: {error}")
+
+         return "\n - ".join([""] + suggestions).strip()
+
+
+ class FileNotFoundError(DuckGuardError):
+     """File not found error with helpful context."""
+
+     def __init__(self, path: str, **context: Any):
+         import os
+
+         cwd = os.getcwd()
+         super().__init__(
+             message=f"File not found: {path}",
+             suggestion=f"Check if the file exists. Current directory: {cwd}",
+             docs_url=f"{DOCS_BASE_URL}#file-connectors",
+             context={"path": path, "cwd": cwd, **context},
+         )
+
+
+ class ColumnNotFoundError(DuckGuardError):
+     """Column not found in dataset."""
+
+     def __init__(self, column: str, available_columns: list[str], **context: Any):
+         # Find similar column names
+         similar = self._find_similar(column, available_columns)
+
+         suggestion = "Available columns: " + ", ".join(available_columns[:10])
+         if len(available_columns) > 10:
+             suggestion += f" (and {len(available_columns) - 10} more)"
+
+         if similar:
+             suggestion = f"Did you mean: {similar}?\n\n{suggestion}"
+
+         super().__init__(
+             message=f"Column '{column}' not found in dataset",
+             suggestion=suggestion,
+             docs_url=f"{DOCS_BASE_URL}#working-with-columns",
+             context={"column": column, "similar": similar, **context},
+         )
+
+     def _find_similar(self, target: str, candidates: list[str]) -> str | None:
+         """Find a similar column name using simple string matching."""
+         target_lower = target.lower()
+
+         # Exact match ignoring case
+         for c in candidates:
+             if c.lower() == target_lower:
+                 return c
+
+         # Prefix match
+         for c in candidates:
+             if c.lower().startswith(target_lower) or target_lower.startswith(c.lower()):
+                 return c
+
+         # Contains match
+         for c in candidates:
+             if target_lower in c.lower() or c.lower() in target_lower:
+                 return c
+
+         return None
+
+
+ class ValidationError(DuckGuardError):
+     """Validation check failed with detailed information."""
+
+     def __init__(
+         self,
+         check_name: str,
+         column: str | None = None,
+         actual_value: Any = None,
+         expected_value: Any = None,
+         failed_rows: list | None = None,
+         **context: Any,
+     ):
+         col_str = f" for column '{column}'" if column else ""
+         message = f"Validation check '{check_name}' failed{col_str}"
+
+         suggestion_parts = []
+         if actual_value is not None and expected_value is not None:
+             suggestion_parts.append(f"Expected: {expected_value}, Got: {actual_value}")
+
+         if failed_rows:
+             sample = failed_rows[:3]
+             suggestion_parts.append(f"Sample failing values: {sample}")
+             if len(failed_rows) > 3:
+                 suggestion_parts.append(f"({len(failed_rows)} total failures)")
+
+         suggestion = "\n".join(suggestion_parts) if suggestion_parts else None
+
+         super().__init__(
+             message=message,
+             suggestion=suggestion,
+             docs_url=f"{DOCS_BASE_URL}#validation-methods",
+             context={
+                 "check_name": check_name,
+                 "column": column,
+                 "actual_value": actual_value,
+                 "expected_value": expected_value,
+                 **context,
+             },
+         )
+
+
+ class RuleParseError(DuckGuardError):
+     """Error parsing validation rules."""
+
+     def __init__(
+         self,
+         message: str,
+         file_path: str | None = None,
+         line_number: int | None = None,
+         **context: Any,
+     ):
+         location = ""
+         if file_path:
+             location = f" in {file_path}"
+         if line_number:
+             location += f" at line {line_number}"
+
+         suggestion = "Check your YAML syntax and rule format.\n"
+         suggestion += "Example valid rule:\n"
+         suggestion += """
+ columns:
+   order_id:
+     checks:
+       - type: not_null
+       - type: unique
+   amount:
+     checks:
+       - type: between
+         value: [0, 10000]
+ """
+
+         super().__init__(
+             message=f"Failed to parse rules{location}: {message}",
+             suggestion=suggestion,
+             docs_url=f"{DOCS_BASE_URL}#yaml-rules",
+             context={"file_path": file_path, "line_number": line_number, **context},
+         )
+
+
+ class ContractViolationError(DuckGuardError):
+     """Data contract was violated."""
+
+     def __init__(
+         self,
+         violations: list[str],
+         contract_path: str | None = None,
+         **context: Any,
+     ):
+         message = f"Data contract violated with {len(violations)} issue(s)"
+         if contract_path:
+             message += f" (contract: {contract_path})"
+
+         suggestion = "Violations:\n - " + "\n - ".join(violations[:5])
+         if len(violations) > 5:
+             suggestion += f"\n ... and {len(violations) - 5} more"
+
+         suggestion += "\n\nConsider updating the contract or fixing the data issues."
+
+         super().__init__(
+             message=message,
+             suggestion=suggestion,
+             docs_url=f"{DOCS_BASE_URL}#data-contracts",
+             context={"violations": violations, "contract_path": contract_path, **context},
+         )
+
+
+ class UnsupportedConnectorError(DuckGuardError):
+     """No connector available for the data source."""
+
+     def __init__(self, source: str, **context: Any):
+         supported = [
+             "CSV (.csv)",
+             "Parquet (.parquet, .pq)",
+             "JSON (.json, .jsonl, .ndjson)",
+             "PostgreSQL (postgres://, postgresql://)",
+             "MySQL (mysql://)",
+             "SQLite (sqlite://)",
+             "S3 (s3://)",
+             "Snowflake (snowflake://)",
+             "BigQuery (bigquery://)",
+         ]
+
+         suggestion = "Supported formats:\n - " + "\n - ".join(supported)
+
+         super().__init__(
+             message=f"No connector found for: {source}",
+             suggestion=suggestion,
+             docs_url=f"{DOCS_BASE_URL}#supported-connectors",
+             context={"source": source, **context},
+         )
+
+
+ # Error formatting utilities
+
+ def format_validation_failure(
+     check_name: str,
+     column: str | None,
+     actual: Any,
+     expected: Any,
+     failed_rows: list | None = None,
+ ) -> str:
+     """Format a validation failure message with context.
+
+     Args:
+         check_name: Name of the failed check
+         column: Column name (if column-level)
+         actual: Actual value found
+         expected: Expected value
+         failed_rows: Sample of failing rows
+
+     Returns:
+         Formatted error message
+     """
+     parts = []
+
+     if column:
+         parts.append(f"Check '{check_name}' failed for column '{column}'")
+     else:
+         parts.append(f"Check '{check_name}' failed")
+
+     parts.append(f" Expected: {expected}")
+     parts.append(f" Actual: {actual}")
+
+     if failed_rows:
+         parts.append("")
+         parts.append(" Sample failing rows:")
+         for row in failed_rows[:5]:
+             if hasattr(row, "value"):
+                 parts.append(f" Row {row.row_index}: {row.value}")
+             else:
+                 parts.append(f" {row}")
+
+         if len(failed_rows) > 5:
+             parts.append(f" ... and {len(failed_rows) - 5} more")
+
+     return "\n".join(parts)
+
+
+ def format_multiple_failures(failures: list) -> str:
+     """Format multiple validation failures into a summary.
+
+     Args:
+         failures: List of failure objects
+
+     Returns:
+         Formatted summary string
+     """
+     if not failures:
+         return "All checks passed!"
+
+     parts = [f"{len(failures)} validation check(s) failed:"]
+     parts.append("")
+
+     for i, failure in enumerate(failures[:10], 1):
+         col = f"[{failure.column}]" if hasattr(failure, "column") and failure.column else "[table]"
+         msg = failure.message if hasattr(failure, "message") else str(failure)
+         parts.append(f" {i}. {col} {msg}")
+
+     if len(failures) > 10:
+         parts.append(f" ... and {len(failures) - 10} more failures")
+
+     return "\n".join(parts)
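Taken together, these classes attach a message, an optional suggestion, a docs link, and a context dict to every failure, all folded into str(error) by _format_message(). A minimal sketch of what that looks like to a caller, constructed directly against the classes in this file (how the library itself raises them from datasets or connectors is not shown in this diff, so the raise site here is illustrative only):

    from duckguard.errors import ColumnNotFoundError, DuckGuardError

    try:
        # Illustrative: a misspelled column name triggers the fuzzy-match suggestion
        raise ColumnNotFoundError("order_idd", available_columns=["order_id", "amount", "created_at"])
    except DuckGuardError as err:
        print(err)                     # message + "Suggestion: Did you mean: order_id? ..." + Docs + Context
        print(err.docs_url)            # https://github.com/XDataHubAI/duckguard#working-with-columns
        print(err.context["similar"])  # "order_id", found by _find_similar()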
duckguard/freshness/__init__.py ADDED
@@ -0,0 +1,33 @@
+ """Freshness monitoring for DuckGuard.
+
+ This module provides functionality to check data freshness by monitoring
+ file modification times and timestamp columns.
+
+ Usage:
+     from duckguard.freshness import FreshnessMonitor, FreshnessResult
+     from datetime import timedelta
+
+     # Check file freshness
+     monitor = FreshnessMonitor(threshold=timedelta(hours=24))
+     result = monitor.check("data.csv")
+
+     if not result.is_fresh:
+         print(f"Data is stale! Last updated: {result.age_human}")
+
+     # Check column freshness
+     from duckguard import connect
+     data = connect("data.csv")
+     result = monitor.check_column_timestamp(data, "updated_at")
+ """
+
+ from duckguard.freshness.monitor import (
+     FreshnessMethod,
+     FreshnessMonitor,
+     FreshnessResult,
+ )
+
+ __all__ = [
+     "FreshnessMonitor",
+     "FreshnessResult",
+     "FreshnessMethod",
+ ]
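The module docstring above is the only API surface visible in this diff. As a minimal sketch, assuming FreshnessMonitor.check() and the is_fresh/age_human fields behave as that docstring describes, a scheduled job could gate on staleness like this:

    import sys
    from datetime import timedelta

    from duckguard.freshness import FreshnessMonitor

    # Same construction as the docstring example: 24-hour freshness threshold
    monitor = FreshnessMonitor(threshold=timedelta(hours=24))
    result = monitor.check("data.csv")  # file modification time check per the docstring

    if not result.is_fresh:
        print(f"Data is stale! Last updated: {result.age_human}")
        sys.exit(1)  # fail the cron/CI step when the data is stale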