duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,616 @@
1
+ """Rule executor for DuckGuard.
2
+
3
+ Executes validation rules against datasets and collects results.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from typing import Any
11
+
12
+ from duckguard.core.dataset import Dataset
13
+ from duckguard.core.result import ValidationResult
14
+ from duckguard.connectors import connect
15
+ from duckguard.rules.schema import (
16
+ RuleSet,
17
+ Check,
18
+ CheckType,
19
+ Severity,
20
+ BUILTIN_PATTERNS,
21
+ )
22
+
23
+
24
@dataclass
class CheckResult:
    """Outcome of running one check against a table or a column.

    Attributes:
        check: The check definition that produced this result.
        column: Name of the column the check ran on; None for table-level
            checks.
        passed: True when the check succeeded.
        actual_value: Value observed in the data.
        expected_value: Value or threshold the check required.
        message: Human-readable description of the outcome.
        severity: Severity attached to the check (defaults to ERROR).
        details: Optional extra information about the outcome.
    """

    check: Check
    column: str | None
    passed: bool
    actual_value: Any
    expected_value: Any
    message: str
    severity: Severity = Severity.ERROR
    details: dict[str, Any] = field(default_factory=dict)

    @property
    def status(self) -> str:
        """Return "PASSED", "WARNING", "INFO" or "FAILED" for this result."""
        if self.passed:
            return "PASSED"

        # A non-passing result is labelled after its severity; anything that
        # is neither WARNING nor INFO counts as a failure.
        labels = (
            (Severity.WARNING, "WARNING"),
            (Severity.INFO, "INFO"),
        )
        for level, label in labels:
            if self.severity == level:
                return label
        return "FAILED"

    @property
    def is_failure(self) -> bool:
        """Return True only for a non-passing check with ERROR severity."""
        if self.passed:
            return False
        return self.severity == Severity.ERROR
63
+
64
+
65
@dataclass
class ExecutionResult:
    """Aggregated outcome of running a whole rule set.

    Attributes:
        ruleset: The rule set that was executed.
        source: The data source that was validated.
        results: One entry per executed check.
        started_at: Timestamp taken when this object is created.
        finished_at: Timestamp set once execution completes (None until then).
    """

    ruleset: RuleSet
    source: str
    results: list[CheckResult] = field(default_factory=list)
    started_at: datetime = field(default_factory=datetime.now)
    finished_at: datetime | None = None

    @property
    def passed(self) -> bool:
        """True when no result is a hard (error-level) failure."""
        return all(not entry.is_failure for entry in self.results)

    @property
    def total_checks(self) -> int:
        """Number of recorded check results."""
        return len(self.results)

    @property
    def passed_count(self) -> int:
        """Number of results whose check passed."""
        return len([entry for entry in self.results if entry.passed])

    @property
    def failed_count(self) -> int:
        """Number of hard (error-level) failures."""
        return len(self.get_failures())

    @property
    def warning_count(self) -> int:
        """Number of non-passing results with WARNING severity."""
        return len(self.get_warnings())

    @property
    def quality_score(self) -> float:
        """Percentage (0-100) of checks that passed; 100.0 when none ran."""
        total = self.total_checks
        if total == 0:
            return 100.0
        return (self.passed_count / total) * 100

    def get_failures(self) -> list[CheckResult]:
        """Return every result that is a hard failure."""
        failures = []
        for entry in self.results:
            if entry.is_failure:
                failures.append(entry)
        return failures

    def get_warnings(self) -> list[CheckResult]:
        """Return every non-passing result with WARNING severity."""
        return [
            entry
            for entry in self.results
            if not entry.passed and entry.severity == Severity.WARNING
        ]
118
+
119
+
120
class RuleExecutor:
    """Executes validation rules against datasets.

    Each enabled check in a rule set is dispatched to a ``_check_*`` handler
    and turned into a ``CheckResult``. Exceptions raised while running a
    single check are captured as a failed result so that one broken check
    does not abort the whole run.
    """

    def __init__(self, dataset: Dataset | None = None):
        """Initialize the executor.

        Args:
            dataset: Optional pre-loaded dataset, used by ``execute`` only
                when no other data source is given.
        """
        self._dataset = dataset

    def execute(
        self,
        ruleset: RuleSet,
        source: str | None = None,
        dataset: Dataset | None = None
    ) -> ExecutionResult:
        """Execute a rule set against a data source.

        Args:
            ruleset: The rules to execute
            source: Data source path (overrides ruleset.source)
            dataset: Pre-loaded dataset (overrides source)

        Returns:
            ExecutionResult with all check results

        Raises:
            ValueError: If no data source can be determined.
        """
        ds, source_str = self._resolve_dataset(ruleset, source, dataset)
        result = ExecutionResult(ruleset=ruleset, source=source_str)

        # Table-level checks
        for check in ruleset.table.checks:
            if check.enabled:
                result.results.append(self._execute_table_check(ds, check))

        # Column-level checks
        for col_name, col_rules in ruleset.columns.items():
            if col_name not in ds.columns:
                # The column is missing: record one error for the column
                # instead of running its checks. A placeholder NOT_NULL
                # check is attached because CheckResult requires a check.
                result.results.append(CheckResult(
                    check=Check(type=CheckType.NOT_NULL),
                    column=col_name,
                    passed=False,
                    actual_value=None,
                    expected_value="column exists",
                    message=f"Column '{col_name}' not found in dataset",
                    severity=Severity.ERROR,
                ))
                continue

            for check in col_rules.checks:
                if check.enabled:
                    result.results.append(
                        self._execute_column_check(ds, col_name, check)
                    )

        result.finished_at = datetime.now()
        return result

    def _resolve_dataset(
        self,
        ruleset: RuleSet,
        source: str | None,
        dataset: Dataset | None,
    ) -> tuple[Dataset, str]:
        """Pick the dataset to validate and the source string to report.

        Precedence: explicit ``dataset``, explicit ``source``,
        ``ruleset.source``, then the dataset given to the constructor.

        Raises:
            ValueError: If none of the four is available.
        """
        if dataset is not None:
            return dataset, dataset.source
        if source is not None:
            return connect(source), source
        if ruleset.source is not None:
            return connect(ruleset.source), ruleset.source
        if self._dataset is not None:
            return self._dataset, self._dataset.source
        raise ValueError("No data source specified")

    @staticmethod
    def _error_result(
        check: Check,
        column: str | None,
        message: str,
        expected_value: Any = None,
    ) -> CheckResult:
        """Build a failed, ERROR-severity result with no actual value."""
        return CheckResult(
            check=check,
            column=column,
            passed=False,
            actual_value=None,
            expected_value=expected_value,
            message=message,
            severity=Severity.ERROR,
        )

    @staticmethod
    def _delegated_result(
        col,
        check: Check,
        outcome,
        expected_value: Any,
        *,
        message: str | None = None,
        with_details: bool = False,
    ) -> CheckResult:
        """Wrap the outcome object returned by a column method.

        Args:
            col: Column the check ran on (its ``name`` is reported).
            check: The check being executed.
            outcome: Object exposing ``passed``, ``actual_value``, ``message``
                and (when ``with_details`` is set) ``details``.
            expected_value: Expected value to report.
            message: Overrides ``outcome.message`` when given.
            with_details: Copy ``outcome.details`` into the result.
        """
        return CheckResult(
            check=check,
            column=col.name,
            passed=outcome.passed,
            actual_value=outcome.actual_value,
            expected_value=expected_value,
            message=outcome.message if message is None else message,
            severity=check.severity,
            details=(outcome.details or {}) if with_details else {},
        )

    def _execute_table_check(self, dataset: Dataset, check: Check) -> CheckResult:
        """Execute a table-level check.

        Only ROW_COUNT is supported; any other type, or any exception raised
        while running the check, yields a failed ERROR result.
        """
        try:
            if check.type == CheckType.ROW_COUNT:
                return self._check_row_count(dataset, check)
            return self._error_result(
                check, None, f"Unsupported table check type: {check.type.value}"
            )
        except Exception as e:
            # Deliberate catch-all: a failing check must be reported, not
            # abort the remaining checks.
            return self._error_result(check, None, f"Error executing check: {e}")

    def _execute_column_check(
        self,
        dataset: Dataset,
        col_name: str,
        check: Check
    ) -> CheckResult:
        """Execute a column-level check.

        Looks the column up in the dataset, dispatches on ``check.type`` and
        converts unsupported types or raised exceptions into a failed ERROR
        result.
        """
        try:
            col = dataset[col_name]

            check_handlers = {
                CheckType.NOT_NULL: self._check_not_null,
                CheckType.NULL_PERCENT: self._check_null_percent,
                CheckType.UNIQUE: self._check_unique,
                CheckType.UNIQUE_PERCENT: self._check_unique_percent,
                CheckType.NO_DUPLICATES: self._check_no_duplicates,
                CheckType.BETWEEN: self._check_between,
                CheckType.RANGE: self._check_between,
                CheckType.MIN: self._check_min,
                CheckType.MAX: self._check_max,
                CheckType.POSITIVE: self._check_positive,
                CheckType.NEGATIVE: self._check_negative,
                CheckType.NON_NEGATIVE: self._check_non_negative,
                CheckType.PATTERN: self._check_pattern,
                CheckType.LENGTH: self._check_length,
                CheckType.MIN_LENGTH: self._check_min_length,
                CheckType.MAX_LENGTH: self._check_max_length,
                CheckType.ALLOWED_VALUES: self._check_allowed_values,
                CheckType.ISIN: self._check_allowed_values,
            }

            handler = check_handlers.get(check.type)
            if handler:
                return handler(col, check)
            return self._error_result(
                check, col_name, f"Unsupported check type: {check.type.value}"
            )

        except Exception as e:
            # Deliberate catch-all: a failing check must be reported, not
            # abort the remaining checks.
            return self._error_result(
                check, col_name, f"Error executing check: {e}"
            )

    def _check_row_count(self, dataset: Dataset, check: Check) -> CheckResult:
        """Check row count against threshold."""
        actual = dataset.row_count
        expected = check.value

        return CheckResult(
            check=check,
            column=None,
            passed=self._compare(actual, expected, check.operator),
            actual_value=actual,
            expected_value=f"{check.operator} {expected}",
            message=f"Row count is {actual:,} (expected {check.operator} {expected})",
            severity=check.severity,
        )

    def _check_not_null(self, col, check: Check) -> CheckResult:
        """Check that column has no nulls."""
        null_count = col.null_count
        passed = null_count == 0

        if passed:
            message = f"Column '{col.name}' has no null values"
        else:
            message = f"Column '{col.name}' has {null_count} null values"

        return CheckResult(
            check=check,
            column=col.name,
            passed=passed,
            actual_value=null_count,
            expected_value=0,
            message=message,
            severity=check.severity,
            details={"null_percent": col.null_percent},
        )

    def _check_null_percent(self, col, check: Check) -> CheckResult:
        """Check null percentage against threshold."""
        actual = col.null_percent
        expected = check.value

        return CheckResult(
            check=check,
            column=col.name,
            passed=self._compare(actual, expected, check.operator),
            actual_value=actual,
            expected_value=f"{check.operator} {expected}%",
            message=f"Column '{col.name}' null_percent is {actual:.2f}% (expected {check.operator} {expected}%)",
            severity=check.severity,
        )

    def _check_unique(self, col, check: Check) -> CheckResult:
        """Check that column values are unique."""
        unique_pct = col.unique_percent
        passed = unique_pct == 100

        if passed:
            message = f"Column '{col.name}' is 100% unique"
        else:
            message = f"Column '{col.name}' is {unique_pct:.2f}% unique"

        # Read each statistic once; the duplicate count is derived from them.
        unique_count = col.unique_count
        total_count = col.total_count

        return CheckResult(
            check=check,
            column=col.name,
            passed=passed,
            actual_value=unique_pct,
            expected_value=100,
            message=message,
            severity=check.severity,
            details={
                "unique_count": unique_count,
                "total_count": total_count,
                "duplicate_count": total_count - unique_count,
            },
        )

    def _check_unique_percent(self, col, check: Check) -> CheckResult:
        """Check unique percentage against threshold."""
        actual = col.unique_percent
        expected = check.value

        return CheckResult(
            check=check,
            column=col.name,
            passed=self._compare(actual, expected, check.operator),
            actual_value=actual,
            expected_value=f"{check.operator} {expected}%",
            message=f"Column '{col.name}' unique_percent is {actual:.2f}% (expected {check.operator} {expected}%)",
            severity=check.severity,
        )

    def _check_no_duplicates(self, col, check: Check) -> CheckResult:
        """Check that column has no duplicate values."""
        return self._delegated_result(col, check, col.has_no_duplicates(), 0)

    def _check_between(self, col, check: Check) -> CheckResult:
        """Check that values are within a range.

        ``check.value`` must be a two-element list or tuple ``[min, max]``;
        otherwise a failed ERROR result is returned.
        """
        if not isinstance(check.value, (list, tuple)) or len(check.value) != 2:
            return self._error_result(
                check,
                col.name,
                f"Range check requires [min, max], got: {check.value}",
                expected_value=check.value,
            )

        min_val, max_val = check.value
        return self._delegated_result(
            col,
            check,
            col.between(min_val, max_val),
            f"[{min_val}, {max_val}]",
            with_details=True,
        )

    def _check_min(self, col, check: Check) -> CheckResult:
        """Check that all values are >= min."""
        actual_min = col.min
        expected_min = check.value

        return CheckResult(
            check=check,
            column=col.name,
            # A column without a minimum (None) never passes.
            passed=actual_min is not None and actual_min >= expected_min,
            actual_value=actual_min,
            expected_value=f">= {expected_min}",
            message=f"Column '{col.name}' min is {actual_min} (expected >= {expected_min})",
            severity=check.severity,
        )

    def _check_max(self, col, check: Check) -> CheckResult:
        """Check that all values are <= max."""
        actual_max = col.max
        expected_max = check.value

        return CheckResult(
            check=check,
            column=col.name,
            # A column without a maximum (None) never passes.
            passed=actual_max is not None and actual_max <= expected_max,
            actual_value=actual_max,
            expected_value=f"<= {expected_max}",
            message=f"Column '{col.name}' max is {actual_max} (expected <= {expected_max})",
            severity=check.severity,
        )

    def _check_positive(self, col, check: Check) -> CheckResult:
        """Check that all values are positive (> 0)."""
        outcome = col.greater_than(0)
        message = (
            f"Column '{col.name}' has all positive values"
            if outcome.passed
            else outcome.message
        )
        return self._delegated_result(col, check, outcome, "> 0", message=message)

    def _check_negative(self, col, check: Check) -> CheckResult:
        """Check that all values are negative (< 0)."""
        outcome = col.less_than(0)
        message = (
            f"Column '{col.name}' has all negative values"
            if outcome.passed
            else outcome.message
        )
        return self._delegated_result(col, check, outcome, "< 0", message=message)

    def _check_non_negative(self, col, check: Check) -> CheckResult:
        """Check that all values are non-negative (>= 0)."""
        actual_min = col.min

        return CheckResult(
            check=check,
            column=col.name,
            # A column without a minimum (None) never passes.
            passed=actual_min is not None and actual_min >= 0,
            actual_value=actual_min,
            expected_value=">= 0",
            message=f"Column '{col.name}' min is {actual_min} (expected >= 0)",
            severity=check.severity,
        )

    @staticmethod
    def _resolve_pattern(check: Check) -> tuple[Any, Any]:
        """Return ``(pattern, pattern_name)`` for a pattern check.

        ``check.value`` is the pattern unless ``check.params`` names a
        built-in pattern, in which case the built-in is used and
        ``check.value`` is the fallback for unknown names.
        """
        pattern = check.value
        pattern_name = check.params.get("pattern_name")
        if pattern_name:
            pattern = BUILTIN_PATTERNS.get(pattern_name, pattern)
        return pattern, pattern_name

    def _check_pattern(self, col, check: Check) -> CheckResult:
        """Check that values match a regex pattern.

        Returns a failed ERROR result when no pattern can be resolved,
        instead of handing ``None`` to the column's matcher.
        """
        pattern, pattern_name = self._resolve_pattern(check)

        if pattern is None:
            return self._error_result(
                check,
                col.name,
                "Pattern check requires a regex pattern or a known "
                f"pattern_name, got: {pattern_name!r}",
            )

        return self._delegated_result(
            col,
            check,
            col.matches(pattern),
            f"matches '{pattern_name or pattern}'",
            with_details=True,
        )

    def _check_length(self, col, check: Check) -> CheckResult:
        """Check that string lengths are within range.

        ``check.value`` must be a two-element list or tuple ``[min, max]``;
        otherwise a failed ERROR result is returned.
        """
        if not isinstance(check.value, (list, tuple)) or len(check.value) != 2:
            return self._error_result(
                check,
                col.name,
                f"Length check requires [min, max], got: {check.value}",
                expected_value=check.value,
            )

        min_len, max_len = check.value
        return self._delegated_result(
            col,
            check,
            col.value_lengths_between(min_len, max_len),
            f"length in [{min_len}, {max_len}]",
        )

    def _check_min_length(self, col, check: Check) -> CheckResult:
        """Check that string lengths are >= min."""
        min_len = check.value
        # 1000000 is passed as the upper bound of the length range.
        # NOTE(review): presumably values longer than that would be reported
        # as out of range - confirm against value_lengths_between.
        outcome = col.value_lengths_between(min_len, 1000000)
        return self._delegated_result(col, check, outcome, f"length >= {min_len}")

    def _check_max_length(self, col, check: Check) -> CheckResult:
        """Check that string lengths are <= max."""
        max_len = check.value
        outcome = col.value_lengths_between(0, max_len)
        return self._delegated_result(col, check, outcome, f"length <= {max_len}")

    @staticmethod
    def _normalize_allowed(value: Any) -> list:
        """Return the allowed values as a list.

        Lists are returned unchanged, tuples and sets are converted to a
        list, and any other value (including a string) is treated as the
        single allowed value.
        """
        if isinstance(value, list):
            return value
        if isinstance(value, (tuple, set, frozenset)):
            return list(value)
        return [value]

    def _check_allowed_values(self, col, check: Check) -> CheckResult:
        """Check that values are in allowed set."""
        allowed = self._normalize_allowed(check.value)
        return self._delegated_result(
            col,
            check,
            col.isin(allowed),
            f"in {allowed}",
            with_details=True,
        )

    def _compare(self, actual: Any, expected: Any, operator: str) -> bool:
        """Compare actual value to expected using operator.

        Supported operators: ``=``, ``==``, ``!=``, ``<>``, ``<``, ``>``,
        ``<=``, ``>=``. An unrecognised operator falls back to equality.
        Returns False when either operand is None.
        """
        if actual is None or expected is None:
            return False

        comparisons = {
            "=": lambda a, e: a == e,
            "==": lambda a, e: a == e,
            "!=": lambda a, e: a != e,
            "<>": lambda a, e: a != e,
            "<": lambda a, e: a < e,
            ">": lambda a, e: a > e,
            "<=": lambda a, e: a <= e,
            ">=": lambda a, e: a >= e,
        }

        compare_fn = comparisons.get(operator, comparisons["="])
        return compare_fn(actual, expected)
598
+
599
+
600
def execute_rules(
    ruleset: RuleSet,
    source: str | None = None,
    dataset: Dataset | None = None
) -> ExecutionResult:
    """Run a rule set with a throwaway executor.

    Shortcut for creating a ``RuleExecutor`` without a preset dataset and
    calling its ``execute`` method.

    Args:
        ruleset: The rules to execute
        source: Data source path
        dataset: Pre-loaded dataset

    Returns:
        ExecutionResult
    """
    return RuleExecutor().execute(ruleset, source=source, dataset=dataset)