duckguard-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/core/scoring.py
@@ -0,0 +1,508 @@
+ """Data Quality Scoring System.
+
+ Implements industry-standard DQ dimensions (ISO 8000, DAMA DMBOK):
+ - Completeness: Are all required values present?
+ - Uniqueness: Are values appropriately unique?
+ - Validity: Do values conform to expected formats/ranges?
+ - Consistency: Are values consistent across columns/datasets?
+ - Timeliness: Is data fresh enough? (optional)
+ - Accuracy: Does data reflect reality? (requires reference data)
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any, TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from duckguard.core.dataset import Dataset
+
+
+ class QualityDimension(Enum):
+     """Standard data quality dimensions."""
+
+     COMPLETENESS = "completeness"
+     UNIQUENESS = "uniqueness"
+     VALIDITY = "validity"
+     CONSISTENCY = "consistency"
+     TIMELINESS = "timeliness"
+
+
+ @dataclass
+ class DimensionScore:
+     """Score for a single quality dimension."""
+
+     dimension: QualityDimension
+     score: float  # 0-100
+     weight: float  # 0-1
+     checks_run: int
+     checks_passed: int
+     details: list[CheckScore] = field(default_factory=list)
+
+     @property
+     def weighted_score(self) -> float:
+         """Calculate weighted contribution to overall score."""
+         return self.score * self.weight
+
+     @property
+     def pass_rate(self) -> float:
+         """Percentage of checks that passed."""
+         if self.checks_run == 0:
+             return 100.0
+         return (self.checks_passed / self.checks_run) * 100
+
+
+ @dataclass
+ class CheckScore:
+     """Score for a single check within a dimension."""
+
+     name: str
+     column: str | None
+     passed: bool
+     score: float  # 0-100
+     message: str
+     severity: str = "medium"  # low, medium, high, critical
+
+
+ @dataclass
+ class ColumnScore:
+     """Quality score for a single column."""
+
+     name: str
+     overall_score: float
+     completeness_score: float
+     uniqueness_score: float
+     validity_score: float
+     checks_run: int
+     checks_passed: int
+     issues: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class QualityScore:
+     """Complete data quality score for a dataset."""
+
+     source: str
+     overall: float  # 0-100 weighted average
+     grade: str  # A, B, C, D, F
+
+     # Dimension scores
+     completeness: float
+     uniqueness: float
+     validity: float
+     consistency: float
+
+     # Detailed breakdowns
+     dimensions: dict[str, DimensionScore] = field(default_factory=dict)
+     columns: dict[str, ColumnScore] = field(default_factory=dict)
+
+     # Summary stats
+     total_checks: int = 0
+     passed_checks: int = 0
+     failed_checks: int = 0
+
+     # Metadata
+     timestamp: datetime = field(default_factory=datetime.now)
+
+     @property
+     def pass_rate(self) -> float:
+         """Overall pass rate as percentage."""
+         if self.total_checks == 0:
+             return 100.0
+         return (self.passed_checks / self.total_checks) * 100
+
+     def __repr__(self) -> str:
+         return (
+             f"QualityScore(overall={self.overall:.1f}, grade='{self.grade}', "
+             f"completeness={self.completeness:.1f}, uniqueness={self.uniqueness:.1f}, "
+             f"validity={self.validity:.1f}, consistency={self.consistency:.1f})"
+         )
+
+
+ class QualityScorer:
+     """
+     Calculates data quality scores across multiple dimensions.
+
+     Example:
+         scorer = QualityScorer()
+         score = scorer.score(dataset)
+
+         print(score.overall)  # 87.5
+         print(score.grade)  # 'B'
+         print(score.completeness)  # 95.0
+         print(score.columns['email'].overall_score)  # 76.2
+     """
+
+     # Default weights for each dimension (must sum to 1.0)
+     DEFAULT_WEIGHTS = {
+         QualityDimension.COMPLETENESS: 0.30,
+         QualityDimension.UNIQUENESS: 0.20,
+         QualityDimension.VALIDITY: 0.30,
+         QualityDimension.CONSISTENCY: 0.20,
+     }
+
+     # Grade thresholds
+     GRADE_THRESHOLDS = [
+         (90, "A"),
+         (80, "B"),
+         (70, "C"),
+         (60, "D"),
+         (0, "F"),
+     ]
+
+     # Severity multipliers (how much a failure affects the score)
+     SEVERITY_MULTIPLIERS = {
+         "low": 0.5,
+         "medium": 1.0,
+         "high": 1.5,
+         "critical": 2.0,
+     }
+
+     def __init__(
+         self,
+         weights: dict[QualityDimension, float] | None = None,
+         include_timeliness: bool = False,
+     ):
+         """
+         Initialize the scorer.
+
+         Args:
+             weights: Custom weights for each dimension (must sum to 1.0)
+             include_timeliness: Whether to include timeliness checks
+         """
+         self.weights = weights or self.DEFAULT_WEIGHTS.copy()
+         self.include_timeliness = include_timeliness
+
+         # Normalize weights to sum to 1.0
+         total = sum(self.weights.values())
+         if total != 1.0:
+             self.weights = {k: v / total for k, v in self.weights.items()}
+
+     def score(self, dataset: Dataset) -> QualityScore:
+         """
+         Calculate comprehensive quality score for a dataset.
+
+         Args:
+             dataset: Dataset to score
+
+         Returns:
+             QualityScore with detailed breakdown
+         """
+         dimension_scores: dict[str, DimensionScore] = {}
+         column_scores: dict[str, ColumnScore] = {}
+         all_checks: list[CheckScore] = []
+
+         # Score each column
+         for col_name in dataset.columns:
+             col = dataset[col_name]
+             col_checks = self._score_column(col)
+             all_checks.extend(col_checks)
+
+             # Calculate column-level scores
+             col_score = self._calculate_column_score(col_name, col_checks)
+             column_scores[col_name] = col_score
+
+         # Aggregate into dimension scores
+         dimension_scores = self._aggregate_dimension_scores(all_checks)
+
+         # Calculate overall score
+         overall = self._calculate_overall_score(dimension_scores)
+         grade = self._calculate_grade(overall)
+
+         # Count totals
+         total_checks = len(all_checks)
+         passed_checks = sum(1 for c in all_checks if c.passed)
+         failed_checks = total_checks - passed_checks
+
+         return QualityScore(
+             source=dataset.source,
+             overall=overall,
+             grade=grade,
+             completeness=dimension_scores.get(
+                 QualityDimension.COMPLETENESS.value,
+                 DimensionScore(QualityDimension.COMPLETENESS, 100, 0.3, 0, 0)
+             ).score,
+             uniqueness=dimension_scores.get(
+                 QualityDimension.UNIQUENESS.value,
+                 DimensionScore(QualityDimension.UNIQUENESS, 100, 0.2, 0, 0)
+             ).score,
+             validity=dimension_scores.get(
+                 QualityDimension.VALIDITY.value,
+                 DimensionScore(QualityDimension.VALIDITY, 100, 0.3, 0, 0)
+             ).score,
+             consistency=dimension_scores.get(
+                 QualityDimension.CONSISTENCY.value,
+                 DimensionScore(QualityDimension.CONSISTENCY, 100, 0.2, 0, 0)
+             ).score,
+             dimensions=dimension_scores,
+             columns=column_scores,
+             total_checks=total_checks,
+             passed_checks=passed_checks,
+             failed_checks=failed_checks,
+         )
+
+     def _score_column(self, col) -> list[CheckScore]:
+         """Score a single column across all dimensions."""
+         checks = []
+         col_name = col.name
+
+         # Get column statistics
+         stats = col._get_stats()
+         numeric_stats = col._get_numeric_stats()
+
+         # === COMPLETENESS CHECKS ===
+         null_pct = stats.get("null_percent", 0.0)
+         completeness_score = 100 - null_pct
+
+         checks.append(CheckScore(
+             name="not_null",
+             column=col_name,
+             passed=null_pct == 0,
+             score=completeness_score,
+             message=f"{col_name}: {null_pct:.1f}% null values",
+             severity="high" if null_pct > 50 else "medium" if null_pct > 10 else "low",
+         ))
+
+         # === UNIQUENESS CHECKS ===
+         unique_pct = stats.get("unique_percent", 0.0)
+         total_count = stats.get("total_count", 0)
+         unique_count = stats.get("unique_count", 0)
+
+         # For likely ID columns, check for duplicates
+         is_likely_id = any(
+             pattern in col_name.lower()
+             for pattern in ["id", "key", "code", "uuid", "guid"]
+         )
+
+         if is_likely_id:
+             uniqueness_score = unique_pct
+             has_duplicates = unique_count < total_count
+             checks.append(CheckScore(
+                 name="unique",
+                 column=col_name,
+                 passed=not has_duplicates,
+                 score=uniqueness_score,
+                 message=f"{col_name}: {unique_pct:.1f}% unique ({total_count - unique_count} duplicates)",
+                 severity="critical" if has_duplicates else "low",
+             ))
+         else:
+             # For non-ID columns, just track cardinality
+             checks.append(CheckScore(
+                 name="cardinality",
+                 column=col_name,
+                 passed=True,
+                 score=100,
+                 message=f"{col_name}: {unique_count} distinct values",
+                 severity="low",
+             ))
+
+         # === VALIDITY CHECKS ===
+         # Check for reasonable ranges on numeric columns
+         if numeric_stats.get("mean") is not None:
+             min_val = stats.get("min_value")
+             max_val = stats.get("max_value")
+
+             # Check for negative values in likely positive-only columns
+             is_likely_positive = any(
+                 pattern in col_name.lower()
+                 for pattern in ["amount", "price", "quantity", "count", "age", "size"]
+             )
+
+             if is_likely_positive and min_val is not None:
+                 is_positive = min_val >= 0
+                 checks.append(CheckScore(
+                     name="positive_values",
+                     column=col_name,
+                     passed=is_positive,
+                     score=100 if is_positive else 0,
+                     message=f"{col_name}: min={min_val} (should be >= 0)",
+                     severity="high" if not is_positive else "low",
+                 ))
+
+         # Check for common patterns in string columns
+         sample_values = col.get_distinct_values(limit=100)
+         string_values = [v for v in sample_values if isinstance(v, str)]
+
+         if string_values and len(string_values) >= 10:
+             pattern_score = self._check_pattern_consistency(col_name, string_values)
+             if pattern_score is not None:
+                 checks.append(pattern_score)
+
+         return checks
+
+     def _check_pattern_consistency(self, col_name: str, values: list[str]) -> CheckScore | None:
+         """Check if string values follow consistent patterns."""
+         import re
+
+         # Common patterns to detect
+         patterns = {
+             "email": (r"^[\w\.-]+@[\w\.-]+\.\w+$", ["email", "mail"]),
+             "phone": (r"^\+?[\d\s\-\(\)]{10,}$", ["phone", "tel", "mobile"]),
+             "url": (r"^https?://", ["url", "link", "website"]),
+             "uuid": (r"^[0-9a-f]{8}-[0-9a-f]{4}-", ["uuid", "guid"]),
+         }
+
+         col_lower = col_name.lower()
+
+         for pattern_name, (regex, keywords) in patterns.items():
+             # Check if column name suggests this pattern
+             if any(kw in col_lower for kw in keywords):
+                 matches = sum(1 for v in values if re.match(regex, str(v), re.IGNORECASE))
+                 match_rate = (matches / len(values)) * 100
+
+                 return CheckScore(
+                     name=f"pattern_{pattern_name}",
+                     column=col_name,
+                     passed=match_rate >= 90,
+                     score=match_rate,
+                     message=f"{col_name}: {match_rate:.1f}% match {pattern_name} pattern",
+                     severity="medium" if match_rate < 90 else "low",
+                 )
+
+         return None
+
+     def _calculate_column_score(self, col_name: str, checks: list[CheckScore]) -> ColumnScore:
+         """Calculate aggregate score for a column."""
+         if not checks:
+             return ColumnScore(
+                 name=col_name,
+                 overall_score=100,
+                 completeness_score=100,
+                 uniqueness_score=100,
+                 validity_score=100,
+                 checks_run=0,
+                 checks_passed=0,
+             )
+
+         # Group checks by type
+         completeness_checks = [c for c in checks if c.name == "not_null"]
+         uniqueness_checks = [c for c in checks if c.name in ("unique", "cardinality")]
+         validity_checks = [c for c in checks if c.name not in ("not_null", "unique", "cardinality")]
+
+         def avg_score(check_list: list[CheckScore]) -> float:
+             if not check_list:
+                 return 100.0
+             return sum(c.score for c in check_list) / len(check_list)
+
+         completeness = avg_score(completeness_checks)
+         uniqueness = avg_score(uniqueness_checks)
+         validity = avg_score(validity_checks)
+
+         # Calculate overall using weights
+         overall = (
+             completeness * 0.35 +
+             uniqueness * 0.25 +
+             validity * 0.40
+         )
+
+         issues = [c.message for c in checks if not c.passed]
+
+         return ColumnScore(
+             name=col_name,
+             overall_score=overall,
+             completeness_score=completeness,
+             uniqueness_score=uniqueness,
+             validity_score=validity,
+             checks_run=len(checks),
+             checks_passed=sum(1 for c in checks if c.passed),
+             issues=issues,
+         )
+
+     def _aggregate_dimension_scores(
+         self, checks: list[CheckScore]
+     ) -> dict[str, DimensionScore]:
+         """Aggregate check scores into dimension scores."""
+         # Map check names to dimensions
+         dimension_mapping = {
+             "not_null": QualityDimension.COMPLETENESS,
+             "unique": QualityDimension.UNIQUENESS,
+             "cardinality": QualityDimension.UNIQUENESS,
+             "positive_values": QualityDimension.VALIDITY,
+             "pattern_email": QualityDimension.VALIDITY,
+             "pattern_phone": QualityDimension.VALIDITY,
+             "pattern_url": QualityDimension.VALIDITY,
+             "pattern_uuid": QualityDimension.VALIDITY,
+         }
+
+         # Group checks by dimension
+         dimension_checks: dict[QualityDimension, list[CheckScore]] = {
+             QualityDimension.COMPLETENESS: [],
+             QualityDimension.UNIQUENESS: [],
+             QualityDimension.VALIDITY: [],
+             QualityDimension.CONSISTENCY: [],
+         }
+
+         for check in checks:
+             dimension = dimension_mapping.get(check.name, QualityDimension.VALIDITY)
+             dimension_checks[dimension].append(check)
+
+         # Calculate scores per dimension
+         result = {}
+         for dimension, dim_checks in dimension_checks.items():
+             if not dim_checks:
+                 score = 100.0
+                 passed = 0
+                 run = 0
+             else:
+                 # Weight by severity
+                 total_weight = 0
+                 weighted_score = 0
+                 for check in dim_checks:
+                     weight = self.SEVERITY_MULTIPLIERS.get(check.severity, 1.0)
+                     weighted_score += check.score * weight
+                     total_weight += weight
+
+                 score = weighted_score / total_weight if total_weight > 0 else 100.0
+                 passed = sum(1 for c in dim_checks if c.passed)
+                 run = len(dim_checks)
+
+             result[dimension.value] = DimensionScore(
+                 dimension=dimension,
+                 score=score,
+                 weight=self.weights.get(dimension, 0.25),
+                 checks_run=run,
+                 checks_passed=passed,
+                 details=dim_checks,
+             )
+
+         return result
+
+     def _calculate_overall_score(self, dimension_scores: dict[str, DimensionScore]) -> float:
+         """Calculate weighted overall score."""
+         total = 0.0
+         for dim_score in dimension_scores.values():
+             total += dim_score.weighted_score
+         return total
+
+     def _calculate_grade(self, score: float) -> str:
+         """Convert numeric score to letter grade."""
+         for threshold, grade in self.GRADE_THRESHOLDS:
+             if score >= threshold:
+                 return grade
+         return "F"
+
+
+ def score(dataset: Dataset, **kwargs) -> QualityScore:
+     """
+     Convenience function to score a dataset.
+
+     Args:
+         dataset: Dataset to score
+         **kwargs: Arguments passed to QualityScorer
+
+     Returns:
+         QualityScore
+
+     Example:
+         from duckguard import connect, score
+
+         orders = connect("data/orders.csv")
+         result = score(orders)
+
+         print(result.overall)  # 87.5
+         print(result.grade)  # 'B'
+     """
+     scorer = QualityScorer(**kwargs)
+     return scorer.score(dataset)
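
Note on the scoring math above: the overall score is the weight-normalized sum of the dimension scores. Under the default weights, a dataset scoring completeness 95, uniqueness 90, validity 80, and consistency 100 comes out at 95*0.30 + 90*0.20 + 80*0.30 + 100*0.20 = 90.5, which crosses the 90 threshold for grade "A". Below is a minimal usage sketch against this module, not part of the package itself; `connect` is the entry point named in the `score` docstring, and the CSV path is illustrative:

    # Sketch only: exercises QualityScorer with custom weights, which
    # __init__ renormalizes to sum to 1.0. The data path is hypothetical.
    from duckguard import connect
    from duckguard.core.scoring import QualityDimension, QualityScorer

    orders = connect("data/orders.csv")  # illustrative path

    # Emphasize validity over uniqueness relative to the defaults
    scorer = QualityScorer(
        weights={
            QualityDimension.COMPLETENESS: 0.30,
            QualityDimension.UNIQUENESS: 0.10,
            QualityDimension.VALIDITY: 0.40,
            QualityDimension.CONSISTENCY: 0.20,
        }
    )
    result = scorer.score(orders)

    print(result.overall, result.grade)  # e.g. 87.5 B
    for name, col in result.columns.items():
        if col.issues:  # messages from failed checks on this column
            print(name, col.issues)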
duckguard/profiler/__init__.py
@@ -0,0 +1,5 @@
+ """Profiler module for auto-profiling and rule suggestions."""
+
+ from duckguard.profiler.auto_profile import AutoProfiler, profile
+
+ __all__ = ["AutoProfiler", "profile"]
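
For the re-exports above, a hedged sketch only: this diff confirms the names `AutoProfiler` and `profile`, but not their signatures, which live in duckguard/profiler/auto_profile.py (not shown in this section). The call below assumes `profile` mirrors the `score(dataset)` convenience pattern from scoring.py:

    # Sketch only: profile()'s signature is an assumption modeled on score();
    # only the imported names are confirmed by this file.
    from duckguard import connect
    from duckguard.profiler import profile

    orders = connect("data/orders.csv")  # illustrative path
    report = profile(orders)             # assumed convenience wrapper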