duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""Apache Airflow integration for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Provides a DuckGuardOperator for running data quality checks in Airflow DAGs.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
from duckguard.integrations.airflow import DuckGuardOperator
|
|
7
|
+
|
|
8
|
+
check_orders = DuckGuardOperator(
|
|
9
|
+
task_id="check_orders_quality",
|
|
10
|
+
source="s3://bucket/orders.parquet",
|
|
11
|
+
rules="duckguard.yaml",
|
|
12
|
+
fail_on_warning=False,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
Note:
|
|
16
|
+
Requires apache-airflow to be installed:
|
|
17
|
+
pip install duckguard[airflow]
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from collections.abc import Sequence
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
# Try to import Airflow - it's an optional dependency. The module must stay
# importable even without Airflow so `import duckguard.integrations.airflow`
# never fails at collection time; instantiating the operators is what raises.
try:
    from airflow.models import BaseOperator
    from airflow.utils.context import Context

    # Checked in the operators' __init__ to produce a clear install hint.
    AIRFLOW_AVAILABLE = True
except ImportError:
    # Create stubs when Airflow is not installed
    AIRFLOW_AVAILABLE = False

    class BaseOperator:  # type: ignore[no-redef]
        """Stub BaseOperator when Airflow is not installed."""

        def __init__(self, **kwargs: Any) -> None:
            # Accept and ignore any operator kwargs (task_id, dag, ...) so the
            # subclass definitions below remain importable.
            pass

    # Stand-in so `Context` annotations in the classes below still resolve.
    Context = dict  # type: ignore[misc,assignment]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DuckGuardOperator(BaseOperator):
    """Airflow operator for running DuckGuard data quality checks.

    This operator runs DuckGuard validation rules against a data source
    and optionally fails the task if quality checks don't pass.

    Args:
        source: Data source path or connection string (supports Jinja templating)
        rules: Path to duckguard.yaml rules file (supports Jinja templating)
        table: Table name for database connections (supports Jinja templating)
        fail_on_warning: Whether to fail task on warnings (default: False)
        fail_on_error: Whether to fail task on errors (default: True)
        notify_slack: Slack webhook URL for notifications (supports Jinja templating)
        notify_teams: Teams webhook URL for notifications (supports Jinja templating)
        store_history: Whether to store results in history database (default: False)
        history_db: Path to history database (default: ~/.duckguard/history.db)
        **kwargs: Additional BaseOperator arguments (task_id, dag, etc.)

    Returns (via XCom):
        Dict with keys:
        - passed: bool - Whether all checks passed
        - quality_score: float - Overall quality score (0-100)
        - total_checks: int - Total number of checks
        - passed_count: int - Number of passing checks
        - failed_count: int - Number of failing checks
        - warning_count: int - Number of warnings
        - failures: list[dict] - Details of failed checks

    Example:
        from airflow import DAG
        from airflow.utils.dates import days_ago
        from duckguard.integrations.airflow import DuckGuardOperator

        with DAG("data_quality", start_date=days_ago(1)) as dag:
            check_orders = DuckGuardOperator(
                task_id="check_orders",
                source="s3://bucket/orders/{{ ds }}.parquet",
                rules="dags/rules/orders.yaml",
                fail_on_warning=False,
                notify_slack="{{ var.value.slack_webhook }}",
            )

            process_orders = SomeOtherOperator(task_id="process_orders")
            check_orders >> process_orders

    Raises:
        ImportError: If Apache Airflow is not installed
        AirflowException: If checks fail and fail_on_* is True
    """

    # Template fields for Airflow variable substitution
    template_fields: Sequence[str] = (
        "source",
        "rules",
        "table",
        "notify_slack",
        "notify_teams",
    )

    # Operator UI color (DuckGuard green)
    ui_color = "#00D26A"
    ui_fgcolor = "#FFFFFF"

    def __init__(
        self,
        *,
        source: str,
        rules: str | None = None,
        table: str | None = None,
        fail_on_warning: bool = False,
        fail_on_error: bool = True,
        notify_slack: str | None = None,
        notify_teams: str | None = None,
        store_history: bool = False,
        history_db: str | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the DuckGuard operator."""
        # Fail fast with an actionable message rather than erroring later
        # against the stub BaseOperator.
        if not AIRFLOW_AVAILABLE:
            raise ImportError(
                "Apache Airflow is required for DuckGuardOperator. "
                "Install with: pip install duckguard[airflow]"
            )

        super().__init__(**kwargs)

        self.source = source
        self.rules = rules
        self.table = table
        self.fail_on_warning = fail_on_warning
        self.fail_on_error = fail_on_error
        self.notify_slack = notify_slack
        self.notify_teams = notify_teams
        self.store_history = store_history
        self.history_db = history_db

    def execute(self, context: Context) -> dict[str, Any]:
        """Execute the DuckGuard checks.

        Args:
            context: Airflow context dictionary

        Returns:
            Dict with execution results (also pushed to XCom)

        Raises:
            AirflowException: If checks fail and fail_on_* is True
        """
        # Imported lazily so the module can be imported without Airflow and
        # without paying DuckGuard's import cost at DAG-parse time.
        from airflow.exceptions import AirflowException

        from duckguard import connect
        from duckguard.rules import execute_rules, generate_rules, load_rules

        self.log.info(f"Running DuckGuard checks on: {self.source}")

        # Connect to data source
        dataset = connect(self.source, table=self.table)
        self.log.info(
            f"Connected: {dataset.row_count:,} rows, {dataset.column_count} columns"
        )

        # Load or generate rules
        if self.rules:
            self.log.info(f"Loading rules from: {self.rules}")
            ruleset = load_rules(self.rules)
        else:
            self.log.info("No rules file specified, generating rules from data profile")
            ruleset = generate_rules(dataset, as_yaml=False)

        # Execute validation
        result = execute_rules(ruleset, dataset=dataset)

        # Log results
        self.log.info(f"Quality Score: {result.quality_score:.1f}%")
        self.log.info(
            f"Checks: {result.passed_count}/{result.total_checks} passed"
        )

        if result.failed_count > 0:
            self.log.warning(f"Failures: {result.failed_count}")
            for failure in result.get_failures()[:10]:  # Limit log output
                self.log.warning(f" - [{failure.column}] {failure.message}")

        if result.warning_count > 0:
            self.log.warning(f"Warnings: {result.warning_count}")

        # Send notifications (best-effort; never fails the task by itself)
        self._send_notifications(result)

        # Store in history (best-effort as well)
        if self.store_history:
            self._store_history(result, context)

        # Build XCom return value
        xcom_result = {
            "passed": result.passed,
            "quality_score": result.quality_score,
            "total_checks": result.total_checks,
            "passed_count": result.passed_count,
            "failed_count": result.failed_count,
            "warning_count": result.warning_count,
            "source": result.source,
            "failures": [
                {
                    "column": f.column,
                    "check_type": f.check.type.value,
                    "message": f.message,
                    # Stringified so the payload stays XCom-serializable.
                    "actual_value": str(f.actual_value),
                    "expected_value": str(f.expected_value),
                }
                for f in result.get_failures()
            ],
        }

        # Determine if we should fail the task
        should_fail = False
        fail_reason = ""

        if self.fail_on_error and result.failed_count > 0:
            should_fail = True
            fail_reason = f"{result.failed_count} check(s) failed"

        if self.fail_on_warning and result.warning_count > 0:
            should_fail = True
            if fail_reason:
                fail_reason += f", {result.warning_count} warning(s)"
            else:
                fail_reason = f"{result.warning_count} warning(s)"

        if should_fail:
            raise AirflowException(
                f"DuckGuard validation failed: {fail_reason}. "
                f"Quality score: {result.quality_score:.1f}%"
            )

        self.log.info("DuckGuard validation passed!")
        return xcom_result

    def _send_notifications(self, result: Any) -> None:
        """Send notifications if configured.

        Failures are logged and swallowed: a broken webhook should not
        change the outcome of the quality check itself.
        """
        if self.notify_slack:
            try:
                from duckguard.notifications import SlackNotifier

                notifier = SlackNotifier(webhook_url=self.notify_slack)
                notifier.send_results(result)
                self.log.info("Sent Slack notification")
            except Exception as e:
                self.log.warning(f"Failed to send Slack notification: {e}")

        if self.notify_teams:
            try:
                from duckguard.notifications import TeamsNotifier

                notifier = TeamsNotifier(webhook_url=self.notify_teams)
                notifier.send_results(result)
                self.log.info("Sent Teams notification")
            except Exception as e:
                self.log.warning(f"Failed to send Teams notification: {e}")

    def _store_history(self, result: Any, context: Context) -> None:
        """Store results in history database.

        Failures are logged and swallowed so history storage stays
        best-effort and never fails the task.
        """
        try:
            from duckguard.history import HistoryStorage

            storage = HistoryStorage(db_path=self.history_db)

            # Include Airflow context as metadata. Look each entry up once:
            # the original fetched "ti" twice in a single expression.
            dag = context.get("dag")
            ti = context.get("ti")
            metadata = {
                "dag_id": dag.dag_id if dag else None,
                "task_id": self.task_id,
                "run_id": context.get("run_id"),
                "execution_date": str(context.get("execution_date")),
                "try_number": ti.try_number if ti else None,
            }

            run_id = storage.store(result, metadata=metadata)
            self.log.info(f"Stored results in history: {run_id}")
        except Exception as e:
            self.log.warning(f"Failed to store history: {e}")
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class DuckGuardSensor(BaseOperator):
    """Quality-gate task that reports whether data meets a score threshold.

    Runs the configured (or auto-generated) DuckGuard rules against the
    source and compares the resulting quality score to ``min_quality_score``.

    Args:
        source: Data source path or connection string
        rules: Path to duckguard.yaml rules file (auto-generated when omitted)
        min_quality_score: Minimum quality score to pass (0-100)
        table: Table name for database connections
        **kwargs: Additional BaseOperator arguments (task_id, dag, ...)

    Example:
        wait_for_quality = DuckGuardSensor(
            task_id="wait_for_quality",
            source="s3://bucket/data.parquet",
            min_quality_score=95.0,
            poke_interval=300,
            timeout=3600,
        )

    NOTE(review): this subclasses BaseOperator, not BaseSensorOperator, so
    ``poke_interval``/``timeout`` re-poll semantics presumably do not apply;
    ``execute`` runs once and returns a bool — confirm intended behavior.
    """

    template_fields: Sequence[str] = ("source", "rules")
    ui_color = "#00D26A"
    ui_fgcolor = "#FFFFFF"

    def __init__(
        self,
        *,
        source: str,
        rules: str | None = None,
        min_quality_score: float = 90.0,
        table: str | None = None,
        **kwargs: Any,
    ) -> None:
        """Record the check configuration after verifying Airflow is present."""
        if not AIRFLOW_AVAILABLE:
            raise ImportError(
                "Apache Airflow is required for DuckGuardSensor. "
                "Install with: pip install duckguard[airflow]"
            )

        super().__init__(**kwargs)

        self.source = source
        self.rules = rules
        self.min_quality_score = min_quality_score
        self.table = table

    def execute(self, context: Context) -> bool:
        """Run the checks once and report whether the threshold was met.

        Returns:
            True if quality score >= min_quality_score
        """
        # Lazy imports keep DAG parsing cheap and Airflow-optional.
        from duckguard import connect
        from duckguard.rules import execute_rules, generate_rules, load_rules

        self.log.info(f"Checking quality for: {self.source}")
        self.log.info(f"Minimum score required: {self.min_quality_score}")

        ds = connect(self.source, table=self.table)

        # Explicit rules file wins; otherwise derive rules from the data.
        ruleset = (
            load_rules(self.rules)
            if self.rules
            else generate_rules(ds, as_yaml=False)
        )

        outcome = execute_rules(ruleset, dataset=ds)
        self.log.info(f"Current quality score: {outcome.quality_score:.1f}%")

        # Guard clause: report and bail when the score falls short.
        if outcome.quality_score < self.min_quality_score:
            self.log.info(
                f"Quality score {outcome.quality_score:.1f}% "
                f"below threshold {self.min_quality_score}%"
            )
            return False

        self.log.info("Quality threshold met!")
        return True
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def duckguard_check(
    task_id: str,
    source: str,
    **kwargs: Any,
) -> DuckGuardOperator:
    """Build a DuckGuardOperator with minimal boilerplate.

    Args:
        task_id: Airflow task ID
        source: Data source path or connection string
        **kwargs: Additional DuckGuardOperator arguments

    Returns:
        DuckGuardOperator instance
    """
    # Merge the positional conveniences back into keyword form; the
    # operator itself is keyword-only.
    operator_kwargs: dict[str, Any] = {"task_id": task_id, "source": source}
    operator_kwargs.update(kwargs)
    return DuckGuardOperator(**operator_kwargs)
|