duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,458 @@
1
+ """dbt Integration for DuckGuard.
2
+
3
+ Export DuckGuard validation rules as dbt tests and schema.yml configurations.
4
+
5
+ Usage:
6
+ from duckguard import load_rules
7
+ from duckguard.integrations import dbt
8
+
9
+ # Load existing DuckGuard rules
10
+ rules = load_rules("duckguard.yaml")
11
+
12
+ # Export to dbt schema.yml format
13
+ dbt.export_to_schema(rules, "models/schema.yml")
14
+
15
+ # Generate dbt singular tests
16
+ dbt.generate_singular_tests(rules, "tests/")
17
+
18
+ # Convert rules to dbt test format
19
+ tests = dbt.rules_to_dbt_tests(rules)
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ import yaml
28
+
29
+ from duckguard.rules.loader import load_rules
30
+ from duckguard.rules.schema import Check, CheckType, RuleSet
31
+
32
# Lookup table: DuckGuard check type -> name of the dbt generic test that
# expresses it. Grouped by dbt test name because several check types can
# share one dbt test.
DBT_TEST_MAPPING = {
    check_type: dbt_test_name
    for dbt_test_name, check_types in (
        ("not_null", (CheckType.NOT_NULL,)),
        ("unique", (CheckType.UNIQUE, CheckType.NO_DUPLICATES)),
        ("accepted_values", (CheckType.ALLOWED_VALUES, CheckType.ISIN)),
    )
    for check_type in check_types
}
40
+
41
def _no_test_arguments(check):
    """Return the empty argument dict used by dbt tests that take no arguments."""
    return {}


def _accepted_values_arguments(check):
    """Return the ``values`` argument, wrapping a non-list ``check.value`` in a list."""
    allowed = check.value
    if not isinstance(allowed, list):
        allowed = [allowed]
    return {"values": allowed}


# Argument builders keyed by dbt test name. Each builder receives the DuckGuard
# check and returns the argument dict for that dbt test (empty when the test
# needs no arguments).
DBT_TEST_CONFIGS = {
    "not_null": _no_test_arguments,
    "unique": _no_test_arguments,
    "accepted_values": _accepted_values_arguments,
}
47
+
48
+
49
def rules_to_dbt_tests(ruleset: RuleSet) -> dict[str, Any]:
    """Translate a DuckGuard RuleSet into a dbt ``schema.yml`` structure.

    Disabled checks are skipped, and so are checks that have no dbt
    equivalent. A column only appears in the output when at least one of
    its checks was converted.

    Args:
        ruleset: DuckGuard RuleSet

    Returns:
        ``{"models": [model]}`` where ``model`` carries the name,
        description, per-column tests and (when present) model-level tests.
    """
    column_entries = []
    for column_name, column_rules in ruleset.columns.items():
        converted = (
            _check_to_dbt_test(rule) for rule in column_rules.checks if rule.enabled
        )
        dbt_tests = [entry for entry in converted if entry]
        if not dbt_tests:
            continue
        column_entries.append(
            {
                "name": column_name,
                "description": column_rules.description or "",
                "tests": dbt_tests,
            }
        )

    model_entry = {
        "name": ruleset.name or "validated_model",
        "description": ruleset.description or "Model validated by DuckGuard",
        "columns": column_entries,
    }

    # Only row-count checks are exported at table level.
    # NOTE(review): dbt_utils.expression_is_true is normally evaluated per
    # row; confirm that an aggregate such as count(*) is valid here.
    table_level_tests = [
        {
            "dbt_utils.expression_is_true": {
                "expression": f"count(*) {rule.operator} {rule.value}"
            }
        }
        for rule in ruleset.table.checks
        if rule.enabled and rule.type == CheckType.ROW_COUNT
    ]
    if table_level_tests:
        model_entry["tests"] = table_level_tests

    return {"models": [model_entry]}
103
+
104
+
105
def _check_to_dbt_test(check) -> dict[str, Any] | str | None:
    """Convert a single DuckGuard check to a dbt test.

    Checks listed in ``DBT_TEST_MAPPING`` become dbt generic tests. Numeric,
    length and pattern checks become ``dbt_utils.expression_is_true`` tests
    whose expression refers to the column via ``{{ column_name }}``.

    Args:
        check: DuckGuard Check object

    Returns:
        The test name (string) for tests without arguments, a
        ``{test_name: arguments}`` dict for configured tests, or ``None``
        when the check has no generic-test equivalent (for example null
        percentage checks, which need a singular test).
    """
    # Literal Jinja placeholder emitted into the generated expressions.
    column = "{{ column_name }}"

    def expression_test(expression: str) -> dict[str, Any]:
        return {"dbt_utils.expression_is_true": {"expression": expression}}

    def is_pair(value) -> bool:
        return isinstance(value, (list, tuple)) and len(value) == 2

    check_type = check.type

    # Simple mapping for basic tests
    if check_type in DBT_TEST_MAPPING:
        test_name = DBT_TEST_MAPPING[check_type]
        config_fn = DBT_TEST_CONFIGS.get(test_name)
        config = config_fn(check) if config_fn else None
        return {test_name: config} if config else test_name

    # Range checks need a [min, max] pair; anything else falls through and
    # ends up as None.
    if check_type in (CheckType.BETWEEN, CheckType.RANGE) and is_pair(check.value):
        min_val, max_val = check.value
        return expression_test(f"{column} >= {min_val} and {column} <= {max_val}")

    if check_type == CheckType.MIN:
        return expression_test(f"{column} >= {check.value}")

    if check_type == CheckType.MAX:
        return expression_test(f"{column} <= {check.value}")

    if check_type == CheckType.POSITIVE:
        return expression_test(f"{column} > 0")

    if check_type == CheckType.NON_NEGATIVE:
        return expression_test(f"{column} >= 0")

    if check_type == CheckType.NEGATIVE:
        return expression_test(f"{column} < 0")

    if check_type == CheckType.PATTERN:
        # The pattern is embedded in a single-quoted SQL string literal, so
        # any single quote inside it must be doubled to keep the SQL valid.
        pattern = str(check.value).replace("'", "''")
        return expression_test(f"REGEXP_MATCHES({column}, '{pattern}')")

    if check_type == CheckType.LENGTH and is_pair(check.value):
        min_len, max_len = check.value
        return expression_test(
            f"LENGTH({column}) >= {min_len} AND LENGTH({column}) <= {max_len}"
        )

    if check_type == CheckType.MIN_LENGTH:
        return expression_test(f"LENGTH({column}) >= {check.value}")

    if check_type == CheckType.MAX_LENGTH:
        return expression_test(f"LENGTH({column}) <= {check.value}")

    # Everything else (including NULL_PERCENT, which requires a singular
    # test) has no generic-test representation.
    return None
210
+
211
+
212
def export_to_schema(
    rules: RuleSet | str,
    output_path: str | Path,
    merge: bool = True
) -> Path:
    """Export DuckGuard rules to a dbt schema.yml file.

    When merging, the existing file is the starting point: top-level keys
    other than ``models`` (for example ``version`` or ``sources``) are kept
    as they are, models and columns are matched by name, and tests are
    combined with ``_merge_tests`` so existing tests are never removed.

    Args:
        rules: RuleSet or path to duckguard.yaml file
        output_path: Path to output schema.yml file
        merge: If True, merge with existing file (default: True)

    Returns:
        Path to created schema.yml file
    """
    if isinstance(rules, (str, Path)):
        rules = load_rules(str(rules))

    output_path = Path(output_path)
    dbt_config = rules_to_dbt_tests(rules)

    # Merge with existing file if it exists
    if merge and output_path.exists():
        with open(output_path, encoding="utf-8") as f:
            existing = yaml.safe_load(f) or {}

        if isinstance(existing, dict):
            # Start from a copy of the existing document so unrelated keys
            # survive the rewrite; only "models" is replaced.
            merged_config = dict(existing)
            merged_config["models"] = _merge_models(
                existing.get("models") or [], dbt_config["models"]
            )
            dbt_config = merged_config

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(dbt_config, f, default_flow_style=False, sort_keys=False)

    return output_path


def _merge_models(existing_models: list, new_models: list) -> list:
    """Merge generated model entries into the model entries already on disk.

    Existing entries keep their position and any keys this module does not
    manage. Entries without a ``name`` cannot be matched and are kept as is.
    """
    merged = list(existing_models)
    models_by_name = {
        entry["name"]: entry
        for entry in merged
        if isinstance(entry, dict) and "name" in entry
    }

    for model in new_models:
        target = models_by_name.get(model["name"])
        if target is None:
            merged.append(model)
            models_by_name[model["name"]] = model
            continue

        # "columns:" may be present but null in hand-written files.
        columns = list(target.get("columns") or [])
        columns_by_name = {
            col["name"]: col
            for col in columns
            if isinstance(col, dict) and "name" in col
        }
        for col in model.get("columns", []):
            current = columns_by_name.get(col["name"])
            if current is None:
                columns.append(col)
                columns_by_name[col["name"]] = col
            else:
                current["tests"] = _merge_tests(
                    current.get("tests") or [], col.get("tests", [])
                )
        target["columns"] = columns

        # Model-level tests (e.g. row count) must not be lost just because
        # the model already exists in the file.
        if model.get("tests"):
            target["tests"] = _merge_tests(target.get("tests") or [], model["tests"])

    return merged
265
+
266
+
267
+ def _merge_tests(existing: list, new: list) -> list:
268
+ """Merge test lists, avoiding duplicates."""
269
+ result = list(existing)
270
+ existing_names = set()
271
+
272
+ for test in existing:
273
+ if isinstance(test, str):
274
+ existing_names.add(test)
275
+ elif isinstance(test, dict):
276
+ existing_names.update(test.keys())
277
+
278
+ for test in new:
279
+ if isinstance(test, str):
280
+ if test not in existing_names:
281
+ result.append(test)
282
+ elif isinstance(test, dict):
283
+ test_name = list(test.keys())[0]
284
+ if test_name not in existing_names:
285
+ result.append(test)
286
+
287
+ return result
288
+
289
+
290
def generate_singular_tests(
    rules: RuleSet | str,
    output_dir: str | Path,
    table_name: str | None = None
) -> list[Path]:
    """Generate dbt singular test files from DuckGuard rules.

    Singular tests are good for complex validations that can't be expressed
    as generic tests. One ``.sql`` file is written per column for each
    enabled null-percentage or unique-percentage check.

    Args:
        rules: RuleSet or path to duckguard.yaml file
        output_dir: Directory to write test files (created if missing)
        table_name: Table name to use in tests (defaults to rules.name)

    Returns:
        List of created test file paths, without duplicates
    """
    if isinstance(rules, (str, Path)):
        rules = load_rules(str(rules))

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    table = table_name or rules.name or "source_table"

    # (check type, file-name suffix, SQL generator) for every check type
    # that is exported as a singular test.
    generators = (
        (CheckType.NULL_PERCENT, "null_percent", _generate_null_percent_test),
        (CheckType.UNIQUE_PERCENT, "unique_percent", _generate_unique_percent_test),
    )

    created_files: list[Path] = []
    for col_name, col_rules in rules.columns.items():
        for check in col_rules.checks:
            if not check.enabled:
                continue

            for check_type, suffix, build_sql in generators:
                if not check.type == check_type:
                    continue

                test_sql = build_sql(table, col_name, check)
                if not test_sql:
                    continue

                test_path = output_dir / f"test_{table}_{col_name}_{suffix}.sql"
                # Explicit encoding so non-ASCII column names are written the
                # same way on every platform.
                test_path.write_text(test_sql, encoding="utf-8")
                # A second check of the same type on the same column rewrites
                # the same file; report the path only once.
                if test_path not in created_files:
                    created_files.append(test_path)

    return created_files
343
+
344
+
345
def _generate_null_percent_test(table: str, column: str, check) -> str:
    """Generate SQL for null percentage test.

    The query returns a row (which dbt treats as a failure) when the null
    percentage of ``column`` violates ``check.operator`` / ``check.value``.
    The operator defaults to ``<=`` when the check does not specify one.

    Args:
        table: Model name used in the dbt ``ref()`` call
        column: Column whose null percentage is tested
        check: Check providing ``operator`` and ``value`` (threshold in percent)

    Returns:
        SQL text of the singular test
    """
    operator = check.operator or "<="
    threshold = check.value
    failure_operator = _invert_operator(operator)

    # NULLIF guards against division by zero on an empty table: the
    # percentage becomes NULL, the HAVING comparison is not true, and the
    # test returns no rows instead of raising a SQL error.
    null_pct = (
        f'COUNT(*) FILTER (WHERE "{column}" IS NULL) * 100.0 / NULLIF(COUNT(*), 0)'
    )

    return f"""-- Test that {column} null percentage is {operator} {threshold}%
-- Generated by DuckGuard

SELECT
    {null_pct} as null_pct
FROM {{{{ ref('{table}') }}}}
HAVING {null_pct} {failure_operator} {threshold}
"""
358
+
359
+
360
def _generate_unique_percent_test(table: str, column: str, check) -> str:
    """Generate SQL for unique percentage test.

    The query returns a row (which dbt treats as a failure) when the
    percentage of distinct values among the non-null values of ``column``
    violates ``check.operator`` / ``check.value``. The operator defaults to
    ``>=`` when the check does not specify one.

    Args:
        table: Model name used in the dbt ``ref()`` call
        column: Column whose unique percentage is tested
        check: Check providing ``operator`` and ``value`` (threshold in percent)

    Returns:
        SQL text of the singular test
    """
    operator = check.operator or ">="
    threshold = check.value
    failure_operator = _invert_operator(operator)

    # Rows with NULL in the column are filtered out first, so COUNT(*) is
    # zero for an empty table or an all-NULL column. NULLIF turns that
    # division by zero into NULL, which makes the HAVING comparison not true
    # and the test return no rows instead of raising a SQL error.
    unique_pct = f'COUNT(DISTINCT "{column}") * 100.0 / NULLIF(COUNT(*), 0)'

    return f"""-- Test that {column} unique percentage is {operator} {threshold}%
-- Generated by DuckGuard

SELECT
    {unique_pct} as unique_pct
FROM {{{{ ref('{table}') }}}}
WHERE "{column}" IS NOT NULL
HAVING {unique_pct} {failure_operator} {threshold}
"""
374
+
375
+
376
+ def _invert_operator(op: str) -> str:
377
+ """Invert comparison operator for failure condition."""
378
+ inversions = {
379
+ ">=": "<",
380
+ ">": "<=",
381
+ "<=": ">",
382
+ "<": ">=",
383
+ "=": "!=",
384
+ "==": "!=",
385
+ "!=": "=",
386
+ }
387
+ return inversions.get(op, op)
388
+
389
+
390
def import_from_dbt(schema_path: str | Path) -> RuleSet:
    """Import dbt schema.yml tests as DuckGuard rules.

    Only the first model in the file is imported. Column tests are read
    from ``tests`` or, when that is absent or empty, from ``data_tests``
    (the key name used by newer dbt versions). Tests without a DuckGuard
    equivalent are skipped, and columns left without any check are omitted.

    Args:
        schema_path: Path to dbt schema.yml file

    Returns:
        DuckGuard RuleSet

    Raises:
        ValueError: If the file is empty or defines no models
    """
    from duckguard.rules.schema import ColumnRules, RuleSet, TableRules

    with open(schema_path, encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        schema = yaml.safe_load(f) or {}

    models = schema.get("models") if isinstance(schema, dict) else None
    if not models:
        raise ValueError("No models found in schema.yml")

    # Use first model
    model = models[0]

    columns = {}
    # "columns:" / "tests:" may be present but null in hand-written files.
    for col_def in model.get("columns") or []:
        col_name = col_def["name"]
        dbt_tests = col_def.get("tests") or col_def.get("data_tests") or []

        checks = []
        for test in dbt_tests:
            check = _dbt_test_to_check(test)
            if check:
                checks.append(check)

        if checks:
            columns[col_name] = ColumnRules(
                name=col_name,
                description=col_def.get("description") or "",
                checks=checks,
            )

    return RuleSet(
        name=model.get("name", "imported_rules"),
        description=model.get("description", "Imported from dbt"),
        table=TableRules(),
        columns=columns,
    )
434
+
435
+
436
def _dbt_test_to_check(test) -> Check | None:
    """Convert a dbt test to a DuckGuard Check.

    Supports ``not_null`` and ``unique`` (as a plain string or as a
    configured dict) and ``accepted_values`` (dict form only).

    Args:
        test: dbt test entry, either a test name or a ``{name: config}`` dict

    Returns:
        The equivalent Check, or ``None`` when the test is not supported or
        the entry is malformed (for example an empty dict).
    """
    if isinstance(test, str):
        test_name, config = test, None
    elif isinstance(test, dict) and test:
        # A dbt test dict has a single key: the test name.
        test_name, config = next(iter(test.items()))
    else:
        return None

    if test_name == "not_null":
        return Check(type=CheckType.NOT_NULL)
    if test_name == "unique":
        return Check(type=CheckType.UNIQUE)
    if test_name == "accepted_values" and isinstance(test, dict):
        # "accepted_values:" with no body parses to None; treat that like a
        # config without values.
        values = config.get("values", []) if isinstance(config, dict) else []
        return Check(type=CheckType.ALLOWED_VALUES, value=values)

    return None
@@ -0,0 +1,61 @@
1
+ """
2
+ DuckGuard Notifications - Slack, Teams, and Email alerting for data quality checks.
3
+
4
+ Usage:
5
+ from duckguard.notifications import SlackNotifier, TeamsNotifier, EmailNotifier
6
+
7
+ # Slack
8
+ slack = SlackNotifier(webhook_url="https://hooks.slack.com/...")
9
+ slack.send_results(execution_result)
10
+
11
+ # Microsoft Teams
12
+ teams = TeamsNotifier(webhook_url="https://outlook.office.com/webhook/...")
13
+ teams.send_results(execution_result)
14
+
15
+ # Email
16
+ email = EmailNotifier(
17
+ smtp_host="smtp.gmail.com",
18
+ smtp_user="alerts@company.com",
19
+ smtp_password="app_password",
20
+ to_addresses=["team@company.com"],
21
+ )
22
+ email.send_results(execution_result)
23
+
24
+ # Auto-notify on failures
25
+ from duckguard import execute_rules, load_rules
26
+
27
+ rules = load_rules("duckguard.yaml")
28
+ result = execute_rules(rules, "data.csv")
29
+
30
+ if not result.passed:
31
+ slack.send_failure_alert(result)
32
+ email.send_failure_alert(result)
33
+ """
34
+
35
+ from duckguard.notifications.email import (
36
+ EmailConfig,
37
+ EmailNotifier,
38
+ )
39
+ from duckguard.notifications.formatter import (
40
+ format_results_markdown,
41
+ format_results_text,
42
+ )
43
+ from duckguard.notifications.notifiers import (
44
+ BaseNotifier,
45
+ NotificationConfig,
46
+ NotificationError,
47
+ SlackNotifier,
48
+ TeamsNotifier,
49
+ )
50
+
51
# Public names re-exported by the notifications package.
__all__ = [
    # from duckguard.notifications.notifiers
    "BaseNotifier",
    "NotificationConfig",
    "NotificationError",
    "SlackNotifier",
    "TeamsNotifier",
    # from duckguard.notifications.email
    "EmailNotifier",
    "EmailConfig",
    # from duckguard.notifications.formatter
    "format_results_text",
    "format_results_markdown",
]