duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,387 @@
1
+ """Apache Airflow integration for DuckGuard.
2
+
3
+ Provides a DuckGuardOperator for running data quality checks in Airflow DAGs.
4
+
5
+ Usage:
6
+ from duckguard.integrations.airflow import DuckGuardOperator
7
+
8
+ check_orders = DuckGuardOperator(
9
+ task_id="check_orders_quality",
10
+ source="s3://bucket/orders.parquet",
11
+ rules="duckguard.yaml",
12
+ fail_on_warning=False,
13
+ )
14
+
15
+ Note:
16
+ Requires apache-airflow to be installed:
17
+ pip install duckguard[airflow]
18
+ """
19
+
20
from __future__ import annotations

from collections.abc import Sequence
from typing import Any

# Try to import Airflow - it's an optional dependency
try:
    from airflow.models import BaseOperator
    from airflow.sensors.base import BaseSensorOperator
    from airflow.utils.context import Context

    AIRFLOW_AVAILABLE = True
except ImportError:
    # Create stubs when Airflow is not installed
    AIRFLOW_AVAILABLE = False

    class BaseOperator:  # type: ignore[no-redef]
        """Stub BaseOperator when Airflow is not installed."""

        def __init__(self, **kwargs: Any) -> None:
            pass

    class BaseSensorOperator(BaseOperator):  # type: ignore[no-redef]
        """Stub BaseSensorOperator when Airflow is not installed."""

    Context = dict  # type: ignore[misc,assignment]
42
+
43
+
44
class DuckGuardOperator(BaseOperator):
    """Airflow operator for running DuckGuard data quality checks.

    This operator runs DuckGuard validation rules against a data source
    and optionally fails the task if quality checks don't pass.

    Args:
        source: Data source path or connection string (supports Jinja templating)
        rules: Path to duckguard.yaml rules file (supports Jinja templating).
            When omitted, rules are generated from the connected dataset.
        table: Table name for database connections (supports Jinja templating)
        fail_on_warning: Whether to fail task on warnings (default: False)
        fail_on_error: Whether to fail task on errors (default: True)
        notify_slack: Slack webhook URL for notifications (supports Jinja templating)
        notify_teams: Teams webhook URL for notifications (supports Jinja templating)
        store_history: Whether to store results in history database (default: False)
        history_db: Path to history database. ``None`` is passed through to
            ``HistoryStorage``, which then chooses its own default location.
        **kwargs: Additional BaseOperator arguments (task_id, dag, etc.)

    Returns (via XCom):
        Dict with keys:
            - passed: bool - Whether all checks passed
            - quality_score: float - Overall quality score (0-100)
            - total_checks: int - Total number of checks
            - passed_count: int - Number of passing checks
            - failed_count: int - Number of failing checks
            - warning_count: int - Number of warnings
            - source: The source reported by the validation result
            - failures: list[dict] - Details of failed checks (column,
              check_type, message, actual_value, expected_value)

        Nothing is returned when the task fails, because ``execute`` raises
        instead of returning.

    Example:
        from datetime import datetime

        from airflow import DAG
        from duckguard.integrations.airflow import DuckGuardOperator

        with DAG("data_quality", start_date=datetime(2024, 1, 1)) as dag:
            check_orders = DuckGuardOperator(
                task_id="check_orders",
                source="s3://bucket/orders/{{ ds }}.parquet",
                rules="dags/rules/orders.yaml",
                fail_on_warning=False,
                notify_slack="{{ var.value.slack_webhook }}",
            )

            process_orders = SomeOtherOperator(task_id="process_orders")
            check_orders >> process_orders

    Raises:
        ImportError: If Apache Airflow is not installed
        AirflowException: If checks fail and fail_on_* is True
    """

    # Template fields for Airflow variable substitution
    template_fields: Sequence[str] = (
        "source",
        "rules",
        "table",
        "notify_slack",
        "notify_teams",
    )

    # Operator UI color (DuckGuard green)
    ui_color = "#00D26A"
    ui_fgcolor = "#FFFFFF"

    def __init__(
        self,
        *,
        source: str,
        rules: str | None = None,
        table: str | None = None,
        fail_on_warning: bool = False,
        fail_on_error: bool = True,
        notify_slack: str | None = None,
        notify_teams: str | None = None,
        store_history: bool = False,
        history_db: str | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the DuckGuard operator.

        Raises:
            ImportError: If Apache Airflow is not installed.
        """
        if not AIRFLOW_AVAILABLE:
            raise ImportError(
                "Apache Airflow is required for DuckGuardOperator. "
                "Install with: pip install duckguard[airflow]"
            )

        super().__init__(**kwargs)

        self.source = source
        self.rules = rules
        self.table = table
        self.fail_on_warning = fail_on_warning
        self.fail_on_error = fail_on_error
        self.notify_slack = notify_slack
        self.notify_teams = notify_teams
        self.store_history = store_history
        self.history_db = history_db

    def execute(self, context: Context) -> dict[str, Any]:
        """Execute the DuckGuard checks.

        Connects to the source, loads (or generates) the ruleset, runs it,
        sends any configured notifications, optionally records history, and
        finally either raises or returns the result summary.

        Args:
            context: Airflow context dictionary

        Returns:
            Dict with execution results (also pushed to XCom)

        Raises:
            AirflowException: If checks fail and fail_on_* is True
        """
        # Imported lazily so the module stays importable without Airflow.
        from airflow.exceptions import AirflowException

        from duckguard import connect
        from duckguard.rules import execute_rules, generate_rules, load_rules

        self.log.info(f"Running DuckGuard checks on: {self.source}")

        # Connect to data source
        dataset = connect(self.source, table=self.table)
        self.log.info(
            f"Connected: {dataset.row_count:,} rows, {dataset.column_count} columns"
        )

        # Load or generate rules
        if self.rules:
            self.log.info(f"Loading rules from: {self.rules}")
            ruleset = load_rules(self.rules)
        else:
            self.log.info("No rules file specified, generating rules from data profile")
            ruleset = generate_rules(dataset, as_yaml=False)

        # Execute validation
        result = execute_rules(ruleset, dataset=dataset)

        # Fetch the failures once; they feed both the log and the XCom payload.
        failures = result.get_failures()

        # Log results
        self.log.info(f"Quality Score: {result.quality_score:.1f}%")
        self.log.info(
            f"Checks: {result.passed_count}/{result.total_checks} passed"
        )

        if result.failed_count > 0:
            self.log.warning(f"Failures: {result.failed_count}")
            for failure in failures[:10]:  # Limit log output
                self.log.warning(f"  - [{failure.column}] {failure.message}")

        if result.warning_count > 0:
            self.log.warning(f"Warnings: {result.warning_count}")

        # Notifications and history are recorded before the pass/fail decision
        # so that failing runs are reported and stored as well.
        self._send_notifications(result)

        if self.store_history:
            self._store_history(result, context)

        # Build XCom return value
        xcom_result = {
            "passed": result.passed,
            "quality_score": result.quality_score,
            "total_checks": result.total_checks,
            "passed_count": result.passed_count,
            "failed_count": result.failed_count,
            "warning_count": result.warning_count,
            "source": result.source,
            "failures": [
                {
                    "column": f.column,
                    "check_type": f.check.type.value,
                    "message": f.message,
                    "actual_value": str(f.actual_value),
                    "expected_value": str(f.expected_value),
                }
                for f in failures
            ],
        }

        # Determine if we should fail the task
        fail_reason = self._failure_reason(result)
        if fail_reason:
            raise AirflowException(
                f"DuckGuard validation failed: {fail_reason}. "
                f"Quality score: {result.quality_score:.1f}%"
            )

        if result.failed_count > 0 or result.warning_count > 0:
            # The task is allowed to continue by configuration, but the log
            # must not claim that validation passed.
            self.log.warning(
                f"DuckGuard validation finished with {result.failed_count} "
                f"failure(s) and {result.warning_count} warning(s); task not "
                "failed because of fail_on_error/fail_on_warning settings"
            )
        else:
            self.log.info("DuckGuard validation passed!")
        return xcom_result

    def _failure_reason(self, result: Any) -> str:
        """Return why the task must fail, or an empty string if it must not."""
        reasons = []
        if self.fail_on_error and result.failed_count > 0:
            reasons.append(f"{result.failed_count} check(s) failed")
        if self.fail_on_warning and result.warning_count > 0:
            reasons.append(f"{result.warning_count} warning(s)")
        return ", ".join(reasons)

    def _send_notifications(self, result: Any) -> None:
        """Send notifications if configured.

        Delivery is best-effort: any error is logged as a warning and never
        propagates, so a broken webhook cannot fail the task.
        """
        if self.notify_slack:
            try:
                from duckguard.notifications import SlackNotifier

                notifier = SlackNotifier(webhook_url=self.notify_slack)
                notifier.send_results(result)
                self.log.info("Sent Slack notification")
            except Exception as e:
                self.log.warning(f"Failed to send Slack notification: {e}")

        if self.notify_teams:
            try:
                from duckguard.notifications import TeamsNotifier

                notifier = TeamsNotifier(webhook_url=self.notify_teams)
                notifier.send_results(result)
                self.log.info("Sent Teams notification")
            except Exception as e:
                self.log.warning(f"Failed to send Teams notification: {e}")

    def _store_history(self, result: Any, context: Context) -> None:
        """Store results in history database.

        Storage is best-effort: any error is logged as a warning and never
        propagates.
        """
        try:
            from duckguard.history import HistoryStorage

            storage = HistoryStorage(db_path=self.history_db)

            dag = context.get("dag")
            ti = context.get("ti")

            # Newer Airflow versions expose the run timestamp as
            # "logical_date"; "execution_date" is the older name. Prefer the
            # new key and fall back, so the value is never the string "None".
            run_date = context.get("logical_date")
            if run_date is None:
                run_date = context.get("execution_date")

            # Include Airflow context as metadata
            metadata = {
                "dag_id": dag.dag_id if dag else None,
                "task_id": self.task_id,
                "run_id": context.get("run_id"),
                "execution_date": str(run_date) if run_date is not None else None,
                "try_number": ti.try_number if ti else None,
            }

            run_id = storage.store(result, metadata=metadata)
            self.log.info(f"Stored results in history: {run_id}")
        except Exception as e:
            self.log.warning(f"Failed to store history: {e}")
285
+
286
+
287
class DuckGuardSensor(BaseSensorOperator):
    """Airflow sensor that waits for data quality to meet threshold.

    This sensor repeatedly checks data quality until it meets
    a minimum quality score threshold. The repetition, interval and
    timeout handling come from Airflow's ``BaseSensorOperator``, which
    drives ``poke``.

    Args:
        source: Data source path or connection string (supports Jinja templating)
        rules: Path to duckguard.yaml rules file (supports Jinja templating).
            When omitted, rules are generated from the connected dataset.
        min_quality_score: Minimum quality score to pass (0-100, default: 90.0)
        table: Table name for database connections (supports Jinja templating)
        poke_interval: Seconds between checks (default: 300)
        timeout: Total seconds before timing out (default: 3600)
        **kwargs: Additional BaseSensorOperator arguments

    Example:
        wait_for_quality = DuckGuardSensor(
            task_id="wait_for_quality",
            source="s3://bucket/data.parquet",
            min_quality_score=95.0,
            poke_interval=300,
            timeout=3600,
        )

    Raises:
        ImportError: If Apache Airflow is not installed
    """

    # "table" is templated here as well, matching DuckGuardOperator.
    template_fields: Sequence[str] = ("source", "rules", "table")
    ui_color = "#00D26A"
    ui_fgcolor = "#FFFFFF"

    def __init__(
        self,
        *,
        source: str,
        rules: str | None = None,
        min_quality_score: float = 90.0,
        table: str | None = None,
        poke_interval: float = 300,
        timeout: float = 3600,
        **kwargs: Any,
    ) -> None:
        """Initialize the DuckGuard sensor.

        Raises:
            ImportError: If Apache Airflow is not installed.
        """
        if not AIRFLOW_AVAILABLE:
            raise ImportError(
                "Apache Airflow is required for DuckGuardSensor. "
                "Install with: pip install duckguard[airflow]"
            )

        # Forward the documented defaults explicitly so the sensor behaves as
        # its docstring promises rather than using the base-class defaults.
        super().__init__(poke_interval=poke_interval, timeout=timeout, **kwargs)

        self.source = source
        self.rules = rules
        self.min_quality_score = min_quality_score
        self.table = table

    def poke(self, context: Context) -> bool:
        """Check if data quality meets threshold.

        Each call connects to the source, loads (or generates) the ruleset
        and runs it afresh.

        Args:
            context: Airflow context dictionary (not used by the check itself)

        Returns:
            True if quality score >= min_quality_score, otherwise False
        """
        # Imported lazily so the module stays importable without Airflow.
        from duckguard import connect
        from duckguard.rules import execute_rules, generate_rules, load_rules

        self.log.info(f"Checking quality for: {self.source}")
        self.log.info(f"Minimum score required: {self.min_quality_score}")

        dataset = connect(self.source, table=self.table)

        if self.rules:
            ruleset = load_rules(self.rules)
        else:
            ruleset = generate_rules(dataset, as_yaml=False)

        result = execute_rules(ruleset, dataset=dataset)

        self.log.info(f"Current quality score: {result.quality_score:.1f}%")

        if result.quality_score >= self.min_quality_score:
            self.log.info("Quality threshold met!")
            return True

        self.log.info(
            f"Quality score {result.quality_score:.1f}% "
            f"below threshold {self.min_quality_score}%"
        )
        return False
370
+
371
+
372
def duckguard_check(
    task_id: str,
    source: str,
    **kwargs: Any,
) -> DuckGuardOperator:
    """Build a DuckGuardOperator from a task ID and a data source.

    Shorthand factory: the two required values may be given positionally,
    and everything else is forwarded to the operator unchanged.

    Args:
        task_id: Airflow task ID
        source: Data source path or connection string
        **kwargs: Additional DuckGuardOperator arguments

    Returns:
        DuckGuardOperator instance
    """
    # task_id and source are named parameters, so they can never appear in
    # kwargs; merging them in cannot overwrite a caller-supplied value.
    operator_kwargs = dict(kwargs, task_id=task_id, source=source)
    return DuckGuardOperator(**operator_kwargs)