duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,483 @@
+ """YAML rule loader for DuckGuard.
+
+ Parses YAML configuration files into RuleSet objects.
+ Supports a simple, readable syntax without complex DSL.
+
+ Example YAML:
+     source: data/orders.csv
+
+     checks:
+       customer_id:
+         - not_null
+         - unique
+
+       amount:
+         - positive
+         - range: [0, 10000]
+
+       email:
+         - pattern: email
+         - null_percent: < 5%
+
+       status:
+         - allowed_values: [pending, shipped, delivered]
+
+     table:
+       - row_count: "> 0"
+ """
+
+ from __future__ import annotations
+
+ import re
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+
+ from duckguard.rules.schema import (
+     RuleSet,
+     ColumnRules,
+     TableRules,
+     Check,
+     CheckType,
+     Severity,
+     BUILTIN_PATTERNS,
+ )
+
+
+ class RuleParseError(Exception):
+     """Raised when YAML rule parsing fails."""
+
+     def __init__(self, message: str, location: str | None = None):
+         self.location = location
+         full_message = f"{message}" if not location else f"{message} (at {location})"
+         super().__init__(full_message)
+
+
+ def load_rules(path: str | Path) -> RuleSet:
+     """Load rules from a YAML file.
+
+     Args:
+         path: Path to the YAML file
+
+     Returns:
+         Parsed RuleSet
+
+     Raises:
+         FileNotFoundError: If the file doesn't exist
+         RuleParseError: If the YAML is invalid
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(f"Rules file not found: {path}")
+
+     with open(path, "r", encoding="utf-8") as f:
+         content = f.read()
+
+     return load_rules_from_string(content, source_file=str(path))
+
+
+ def load_rules_from_string(content: str, source_file: str | None = None) -> RuleSet:
+     """Load rules from a YAML string.
+
+     Args:
+         content: YAML content as string
+         source_file: Optional source file path for error messages
+
+     Returns:
+         Parsed RuleSet
+     """
+     try:
+         data = yaml.safe_load(content)
+     except yaml.YAMLError as e:
+         raise RuleParseError(f"Invalid YAML: {e}", source_file)
+
+     if not data:
+         return RuleSet()
+
+     if not isinstance(data, dict):
+         raise RuleParseError("YAML root must be a mapping", source_file)
+
+     return _parse_ruleset(data, source_file)
+
+
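load_rules and load_rules_from_string are the loader's public entry points; load_rules only reads the file and delegates to load_rules_from_string, and everything below is internal parsing. A minimal usage sketch against the docstring's example rules, using only the RuleSet attributes this module itself populates (source, columns, table); the values in comments are inferred from the parsing code, not captured output:

from duckguard.rules.loader import load_rules_from_string

ruleset = load_rules_from_string("""
source: data/orders.csv

checks:
  customer_id:
    - not_null
    - unique
  amount:
    - positive
    - range: [0, 10000]

table:
  - row_count: "> 0"
""")

ruleset.source                          # "data/orders.csv"
sorted(ruleset.columns)                 # ["amount", "customer_id"]
len(ruleset.columns["amount"].checks)   # 2: "positive" plus "range" (an alias of between)
ruleset.table.checks[0].type            # CheckType.ROW_COUNT, operator ">", value 0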
+ def _parse_ruleset(data: dict[str, Any], source_file: str | None = None) -> RuleSet:
+     """Parse a dictionary into a RuleSet."""
+     # Support both "name:" and "dataset:" for the ruleset name
+     name = data.get("name") or data.get("dataset")
+
+     ruleset = RuleSet(
+         source=data.get("source"),
+         name=name,
+         version=str(data.get("version", "1.0")),
+         description=data.get("description"),
+         settings=data.get("settings", {}),
+     )
+
+     # Check for simple "rules:" list format (like Soda-style)
+     rules_data = data.get("rules", [])
+     if rules_data and isinstance(rules_data, list):
+         for rule_expr in rules_data:
+             if isinstance(rule_expr, str):
+                 ruleset.add_simple_check(rule_expr)
+                 # Also parse into structured format for execution
+                 _parse_simple_rule_expression(ruleset, rule_expr, source_file)
+
+     # Parse column checks (structured format)
+     checks_data = data.get("checks", {})
+     if isinstance(checks_data, dict):
+         for col_name, col_checks in checks_data.items():
+             column_rules = _parse_column_rules(col_name, col_checks, source_file)
+             ruleset.columns[col_name] = column_rules
+
+     # Parse table-level checks
+     table_data = data.get("table", [])
+     if table_data:
+         ruleset.table = _parse_table_rules(table_data, source_file)
+
+     return ruleset
+
+
+ def _parse_simple_rule_expression(
+     ruleset: RuleSet,
+     expr: str,
+     source_file: str | None = None
+ ) -> None:
+     """Parse a simple rule expression like 'order_id is not null' into structured checks."""
+     expr = expr.strip()
+
+     # Table-level rules
+     if expr.startswith("row_count"):
+         # Parse: "row_count > 0", "row_count < 1000000"
+         match = re.match(r"row_count\s*([<>=!]+)\s*(\d+)", expr)
+         if match:
+             operator = match.group(1)
+             value = int(match.group(2))
+             ruleset.add_table_check(CheckType.ROW_COUNT, value=value, operator=operator)
+         return
+
+     # Column-level rules - parse various patterns
+     # Pattern: "column_name is not null"
+     match = re.match(r"(\w+)\s+is\s+not\s+null", expr, re.IGNORECASE)
+     if match:
+         col_name = match.group(1)
+         ruleset.add_column_check(col_name, CheckType.NOT_NULL)
+         return
+
+     # Pattern: "column_name is unique"
+     match = re.match(r"(\w+)\s+is\s+unique", expr, re.IGNORECASE)
+     if match:
+         col_name = match.group(1)
+         ruleset.add_column_check(col_name, CheckType.UNIQUE)
+         return
+
+     # Pattern: "column_name >= value" or "column_name > value"
+     match = re.match(r"(\w+)\s*([<>=!]+)\s*(-?[\d.]+)", expr)
+     if match:
+         col_name = match.group(1)
+         operator = match.group(2)
+         value = float(match.group(3)) if "." in match.group(3) else int(match.group(3))
+
+         if operator == ">=":
+             ruleset.add_column_check(col_name, CheckType.MIN, value=value)
+         elif operator == ">":
+             ruleset.add_column_check(col_name, CheckType.MIN, value=value, operator=">")
+         elif operator == "<=":
+             ruleset.add_column_check(col_name, CheckType.MAX, value=value)
+         elif operator == "<":
+             ruleset.add_column_check(col_name, CheckType.MAX, value=value, operator="<")
+         return
+
+     # Pattern: "column_name in ['a', 'b', 'c']"
+     match = re.match(r"(\w+)\s+in\s+\[(.+)\]", expr, re.IGNORECASE)
+     if match:
+         col_name = match.group(1)
+         values_str = match.group(2)
+         # Parse the values list
+         values = [v.strip().strip("'\"") for v in values_str.split(",")]
+         ruleset.add_column_check(col_name, CheckType.ALLOWED_VALUES, value=values)
+         return
+
+     # Pattern: "column_name matches 'pattern'"
+     match = re.match(r"(\w+)\s+matches\s+['\"](.+)['\"]", expr, re.IGNORECASE)
+     if match:
+         col_name = match.group(1)
+         pattern = match.group(2)
+         ruleset.add_column_check(col_name, CheckType.PATTERN, value=pattern)
+         return
+
+     # Pattern: "column_name between min and max"
+     match = re.match(r"(\w+)\s+between\s+(-?[\d.]+)\s+and\s+(-?[\d.]+)", expr, re.IGNORECASE)
+     if match:
+         col_name = match.group(1)
+         min_val = float(match.group(2)) if "." in match.group(2) else int(match.group(2))
+         max_val = float(match.group(3)) if "." in match.group(3) else int(match.group(3))
+         ruleset.add_column_check(col_name, CheckType.BETWEEN, value=[min_val, max_val])
+         return
+
+     # Pattern: "column_name null_percent < 5"
+     match = re.match(r"(\w+)\s+null_percent\s*([<>=!]+)\s*(\d+)", expr, re.IGNORECASE)
+     if match:
+         col_name = match.group(1)
+         operator = match.group(2)
+         value = int(match.group(3))
+         ruleset.add_column_check(col_name, CheckType.NULL_PERCENT, value=value, operator=operator)
+         return
+
+
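The regex cascade above is what gives the flat "rules:" list (handled in _parse_ruleset) its meaning: every string is stored verbatim via add_simple_check, and when one of the patterns matches it is also turned into a structured check. A sketch of the mappings implied by those patterns:

from duckguard.rules.loader import load_rules_from_string

ruleset = load_rules_from_string("""
rules:
  - order_id is not null
  - customer_id is unique
  - amount between 0 and 10000
  - status in ['pending', 'shipped', 'delivered']
  - email null_percent < 5
  - row_count > 0
""")

# Structured checks implied by the patterns above:
#   "order_id is not null"       -> NOT_NULL on order_id
#   "customer_id is unique"      -> UNIQUE on customer_id
#   "amount between 0 and 10000" -> BETWEEN with value [0, 10000]
#   "status in [...]"            -> ALLOWED_VALUES with the listed strings
#   "email null_percent < 5"     -> NULL_PERCENT with operator "<" and value 5
#   "row_count > 0"              -> table-level ROW_COUNT with operator ">"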
+ def _parse_column_rules(
+     col_name: str,
+     checks: list[Any],
+     source_file: str | None = None
+ ) -> ColumnRules:
+     """Parse column-level rules."""
+     column_rules = ColumnRules(name=col_name)
+
+     if not checks:
+         return column_rules
+
+     if not isinstance(checks, list):
+         checks = [checks]
+
+     for check_item in checks:
+         check = _parse_check(check_item, f"checks.{col_name}", source_file)
+         if check:
+             column_rules.checks.append(check)
+
+     return column_rules
+
+
+ def _parse_table_rules(
+     checks: list[Any],
+     source_file: str | None = None
+ ) -> TableRules:
+     """Parse table-level rules."""
+     table_rules = TableRules()
+
+     if not isinstance(checks, list):
+         checks = [checks]
+
+     for check_item in checks:
+         check = _parse_check(check_item, "table", source_file)
+         if check:
+             table_rules.checks.append(check)
+
+     return table_rules
+
+
+ def _parse_check(
+     check_item: Any,
+     location: str,
+     source_file: str | None = None
+ ) -> Check | None:
+     """Parse a single check from various formats.
+
+     Supports:
+     - Simple string: "not_null"
+     - Dict with value: {"range": [0, 100]}
+     - Dict with operator: {"null_percent": "< 5"}
+     - Dict with params: {"pattern": {"value": "email", "severity": "warning"}}
+     """
+     if check_item is None:
+         return None
+
+     # Simple string format: "not_null"
+     if isinstance(check_item, str):
+         return _parse_simple_check(check_item)
+
+     # Dictionary format
+     if isinstance(check_item, dict):
+         return _parse_dict_check(check_item, location, source_file)
+
+     raise RuleParseError(
+         f"Invalid check format: {check_item}",
+         f"{source_file}:{location}" if source_file else location
+     )
+
+
+ def _parse_simple_check(check_str: str) -> Check:
+     """Parse a simple string check like 'not_null' or 'unique'."""
+     check_str = check_str.lower().strip()
+
+     # Handle simple check types
+     simple_checks = {
+         "not_null": CheckType.NOT_NULL,
+         "notnull": CheckType.NOT_NULL,
+         "required": CheckType.NOT_NULL,
+         "unique": CheckType.UNIQUE,
+         "no_duplicates": CheckType.NO_DUPLICATES,
+         "positive": CheckType.POSITIVE,
+         "negative": CheckType.NEGATIVE,
+         "non_negative": CheckType.NON_NEGATIVE,
+         "nonnegative": CheckType.NON_NEGATIVE,
+     }
+
+     if check_str in simple_checks:
+         return Check(type=simple_checks[check_str])
+
+     # Try to parse as CheckType
+     try:
+         check_type = CheckType(check_str)
+         return Check(type=check_type)
+     except ValueError:
+         raise RuleParseError(f"Unknown check type: {check_str}")
+
+
+ def _parse_dict_check(
+     check_dict: dict[str, Any],
+     location: str,
+     source_file: str | None = None
+ ) -> Check:
+     """Parse a dictionary check."""
+     if len(check_dict) == 0:
+         raise RuleParseError("Empty check definition", location)
+
+     # Get the check type (first key)
+     check_type_str = list(check_dict.keys())[0]
+     check_value = check_dict[check_type_str]
+
+     # Normalize check type
+     check_type_str_normalized = check_type_str.lower().replace("-", "_")
+
+     # Map common aliases
+     type_aliases = {
+         "not_null": "not_null",
+         "notnull": "not_null",
+         "required": "not_null",
+         "unique": "unique",
+         "range": "between",
+         "between": "between",
+         "min": "min",
+         "max": "max",
+         "pattern": "pattern",
+         "regex": "pattern",
+         "allowed_values": "allowed_values",
+         "isin": "allowed_values",
+         "in": "allowed_values",
+         "values": "allowed_values",
+         "length": "length",
+         "min_length": "min_length",
+         "max_length": "max_length",
+         "type": "type",
+         "semantic_type": "semantic_type",
+         "null_percent": "null_percent",
+         "unique_percent": "unique_percent",
+         "row_count": "row_count",
+         "positive": "positive",
+         "negative": "negative",
+         "non_negative": "non_negative",
+         "anomaly": "anomaly",
+         "custom_sql": "custom_sql",
+         "sql": "custom_sql",
+     }
+
+     if check_type_str_normalized in type_aliases:
+         check_type_str_normalized = type_aliases[check_type_str_normalized]
+
+     try:
+         check_type = CheckType(check_type_str_normalized)
+     except ValueError:
+         raise RuleParseError(f"Unknown check type: {check_type_str}", location)
+
+     # Parse the value and extract operator if present
+     value, operator, params = _parse_check_value(check_type, check_value)
+
+     # Extract severity if specified
+     severity = Severity.ERROR
+     message = None
+
+     if isinstance(check_value, dict):
+         if "severity" in check_value:
+             severity = Severity(check_value["severity"].lower())
+         if "message" in check_value:
+             message = check_value["message"]
+         if "params" in check_value:
+             params.update(check_value["params"])
+
+     return Check(
+         type=check_type,
+         value=value,
+         operator=operator,
+         severity=severity,
+         message=message,
+         params=params,
+     )
+
+
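_parse_dict_check also accepts the nested form shown in _parse_check's docstring, where severity, message, and extra params ride alongside the value. A sketch of that path, assuming (as the docstrings above suggest, though schema.py is not shown here) that "email" is a key in BUILTIN_PATTERNS and "warning" is a valid Severity value:

from duckguard.rules.loader import load_rules_from_string

ruleset = load_rules_from_string("""
checks:
  email:
    - regex:
        value: email
        severity: warning
        message: email must look like an address
""")

check = ruleset.columns["email"].checks[0]
# "regex" normalizes to CheckType.PATTERN via type_aliases; "email" is swapped for
# the built-in regex (with pattern_name recorded in params), severity is downgraded
# to warning, and the custom message is carried on the Check.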
+ def _parse_check_value(
+     check_type: CheckType,
+     raw_value: Any
+ ) -> tuple[Any, str, dict[str, Any]]:
+     """Parse the value portion of a check, extracting operators if present.
+
+     Returns:
+         Tuple of (value, operator, extra_params)
+     """
+     operator = "="
+     params: dict[str, Any] = {}
+
+     # Handle None
+     if raw_value is None:
+         return None, operator, params
+
+     # Handle dict with explicit value
+     if isinstance(raw_value, dict):
+         if "value" in raw_value:
+             raw_value = raw_value["value"]
+         elif "min" in raw_value and "max" in raw_value:
+             # Range specified as {min: 0, max: 100}
+             return [raw_value["min"], raw_value["max"]], operator, params
+         elif "method" in raw_value:
+             # Anomaly detection params
+             params = raw_value.copy()
+             return None, operator, params
+         else:
+             # Pass entire dict as params
+             return None, operator, raw_value
+
+     # Handle list (for range, allowed_values)
+     if isinstance(raw_value, list):
+         return raw_value, operator, params
+
+     # Handle string with operator: "< 5", "> 0", "<= 10%"
+     if isinstance(raw_value, str):
+         value_str = raw_value.strip()
+
+         # Check for percentage
+         is_percent = value_str.endswith("%")
+         if is_percent:
+             value_str = value_str[:-1].strip()
+
+         # Extract operator
+         operator_match = re.match(r"^([<>=!]+)\s*(.+)$", value_str)
+         if operator_match:
+             operator = operator_match.group(1)
+             value_str = operator_match.group(2).strip()
+
+         # Handle built-in patterns
+         if check_type == CheckType.PATTERN and value_str.lower() in BUILTIN_PATTERNS:
+             return BUILTIN_PATTERNS[value_str.lower()], operator, {"pattern_name": value_str.lower()}
+
+         # Try to parse as number
+         try:
+             if "." in value_str:
+                 value = float(value_str)
+             else:
+                 value = int(value_str)
+         except ValueError:
+             value = value_str
+
+         if is_percent:
+             params["is_percent"] = True
+
+         return value, operator, params
+
+     # Handle boolean
+     if isinstance(raw_value, bool):
+         return raw_value, operator, params
+
+     # Handle numbers directly
+     if isinstance(raw_value, (int, float)):
+         return raw_value, operator, params
+
+     return raw_value, operator, params
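_parse_check_value is where shorthand such as "null_percent: < 5%" or "range: {min: 0, max: 100}" is decomposed into a (value, operator, extra_params) tuple. A few input/output pairs traced from the branches above (illustrative, not test cases shipped with the package):

from duckguard.rules.loader import _parse_check_value
from duckguard.rules.schema import CheckType

# Operator plus percent sign: "%" is stripped and flagged in params.
_parse_check_value(CheckType.NULL_PERCENT, "< 5%")
# -> (5, "<", {"is_percent": True})

# Plain threshold string: the leading comparison operator is split off.
_parse_check_value(CheckType.ROW_COUNT, "> 0")
# -> (0, ">", {})

# A {min, max} mapping collapses to a two-element list.
_parse_check_value(CheckType.BETWEEN, {"min": 0, "max": 100})
# -> ([0, 100], "=", {})

# Lists pass through untouched (range, allowed_values).
_parse_check_value(CheckType.ALLOWED_VALUES, ["pending", "shipped"])
# -> (["pending", "shipped"], "=", {})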