duckguard 3.0.1__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckguard/core/result.py CHANGED
@@ -130,28 +130,36 @@ class ValidationResult:
             if not self.failed_rows:
                 return pd.DataFrame(columns=["row_index", "column", "value", "expected", "reason"])

-            return pd.DataFrame([
-                {
-                    "row_index": row.row_index,
-                    "column": row.column,
-                    "value": row.value,
-                    "expected": row.expected,
-                    "reason": row.reason,
-                    **row.context,
-                }
-                for row in self.failed_rows
-            ])
+            return pd.DataFrame(
+                [
+                    {
+                        "row_index": row.row_index,
+                        "column": row.column,
+                        "value": row.value,
+                        "expected": row.expected,
+                        "reason": row.reason,
+                        **row.context,
+                    }
+                    for row in self.failed_rows
+                ]
+            )
         except ImportError:
-            raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas")
+            raise ImportError(
+                "pandas is required for to_dataframe(). Install with: pip install pandas"
+            )

     def summary(self) -> str:
         """Get a summary of the validation result with sample failures."""
         lines = [self.message]

         if self.failed_rows:
-            lines.append(f"\nSample of {len(self.failed_rows)} failing rows (total: {self.total_failures}):")
+            lines.append(
+                f"\nSample of {len(self.failed_rows)} failing rows (total: {self.total_failures}):"
+            )
             for row in self.failed_rows[:5]:
-                lines.append(f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}")
+                lines.append(
+                    f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}"
+                )

         if self.total_failures > 5:
             lines.append(f" ... and {self.total_failures - 5} more failures")
@@ -169,6 +177,8 @@ class ProfileResult:
     columns: list[ColumnProfile]
     suggested_rules: list[str] = field(default_factory=list)
     timestamp: datetime = field(default_factory=datetime.now)
+    overall_quality_score: float | None = None
+    overall_quality_grade: str | None = None


 @dataclass
@@ -185,8 +195,19 @@ class ColumnProfile:
     max_value: Any | None = None
     mean_value: float | None = None
     stddev_value: float | None = None
+    median_value: float | None = None
+    p25_value: float | None = None
+    p75_value: float | None = None
     sample_values: list[Any] = field(default_factory=list)
     suggested_rules: list[str] = field(default_factory=list)
+    quality_score: float | None = None
+    quality_grade: str | None = None
+    distribution_type: str | None = None
+    skewness: float | None = None
+    kurtosis: float | None = None
+    is_normal: bool | None = None
+    outlier_count: int | None = None
+    outlier_percentage: float | None = None


 @dataclass
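Every field added to ProfileResult and ColumnProfile defaults to None, so downstream code must treat them as optional: quality_score and quality_grade are populated only when numpy is available, and the distribution/outlier fields only when deep profiling runs (see the profiler changes below). A hedged sketch of defensive consumption, with `profile_result` as an illustrative ProfileResult:

# Sketch: reading the new optional quality fields
for col in profile_result.columns:
    if col.quality_score is not None:
        print(f"{col.name}: {col.quality_score:.1f} ({col.quality_grade})")
    if col.outlier_count is not None:  # set only by deep profiling of numeric columns
        print(f"  outliers: {col.outlier_count} ({col.outlier_percentage:.2f}%)")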
@@ -2,13 +2,30 @@

 from __future__ import annotations

-import re
 from dataclasses import dataclass
 from typing import Any

 from duckguard.core.dataset import Dataset
 from duckguard.core.result import ColumnProfile, ProfileResult

+# Grade thresholds (shared with QualityScorer for consistency)
+_GRADE_THRESHOLDS = {"A": 90.0, "B": 80.0, "C": 70.0, "D": 60.0}
+
+# Mapping from inferred dtype to QualityScorer expected_type
+_DTYPE_TO_EXPECTED_TYPE: dict[str, str] = {
+    "integer": "int",
+    "float": "float",
+    "string": "string",
+}
+
+
+def _score_to_grade(score: float) -> str:
+    """Convert a numeric score (0-100) to a letter grade."""
+    for grade, threshold in _GRADE_THRESHOLDS.items():
+        if score >= threshold:
+            return grade
+    return "F"
+

 @dataclass
 class RuleSuggestion:
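Note that _score_to_grade depends on _GRADE_THRESHOLDS iterating in descending-threshold insertion order, which Python dicts guarantee since 3.7. The boundaries it implies, shown as assertions:

# Grade boundaries implied by _GRADE_THRESHOLDS (a boundary score earns the higher grade)
assert _score_to_grade(95.0) == "A"  # >= 90
assert _score_to_grade(80.0) == "B"  # >= 80
assert _score_to_grade(69.9) == "D"  # >= 60 but < 70
assert _score_to_grade(59.9) == "F"  # below every threshold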
@@ -26,33 +43,35 @@ class AutoProfiler:

     The profiler analyzes data patterns and generates Python assertions
     that can be used directly in test files.
-    """

-    # Thresholds for rule generation
-    NULL_THRESHOLD_SUGGEST = 1.0  # Suggest not_null if nulls < 1%
-    UNIQUE_THRESHOLD_SUGGEST = 99.0  # Suggest unique if > 99% unique
-    ENUM_MAX_VALUES = 20  # Max distinct values to suggest enum check
-    PATTERN_SAMPLE_SIZE = 1000  # Sample size for pattern detection
-
-    # Common patterns to detect
-    PATTERNS = {
-        "email": r"^[\w\.-]+@[\w\.-]+\.\w+$",
-        "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
-        "phone": r"^\+?[\d\s\-\(\)]{10,}$",
-        "url": r"^https?://[\w\.-]+",
-        "ip_address": r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
-        "date_iso": r"^\d{4}-\d{2}-\d{2}$",
-        "datetime_iso": r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
-    }
-
-    def __init__(self, dataset_var_name: str = "data"):
-        """
-        Initialize the profiler.
+    Args:
+        dataset_var_name: Variable name to use in generated rules.
+        deep: Enable deep profiling (distribution analysis, outlier detection).
+            Requires scipy for distribution fitting. Default is False.
+        null_threshold: Suggest not_null rule if null percentage is below this value.
+        unique_threshold: Suggest unique rule if unique percentage is above this value.
+        enum_max_values: Maximum distinct values for enum check suggestion.
+        pattern_sample_size: Number of sample values for pattern detection.
+        pattern_min_confidence: Minimum confidence (0-100) for pattern match reporting.
+    """

-        Args:
-            dataset_var_name: Variable name to use in generated rules
-        """
+    def __init__(
+        self,
+        dataset_var_name: str = "data",
+        deep: bool = False,
+        null_threshold: float = 1.0,
+        unique_threshold: float = 99.0,
+        enum_max_values: int = 20,
+        pattern_sample_size: int = 1000,
+        pattern_min_confidence: float = 90.0,
+    ) -> None:
         self.dataset_var_name = dataset_var_name
+        self.deep = deep
+        self.null_threshold = null_threshold
+        self.unique_threshold = unique_threshold
+        self.enum_max_values = enum_max_values
+        self.pattern_sample_size = pattern_sample_size
+        self.pattern_min_confidence = pattern_min_confidence

     def profile(self, dataset: Dataset) -> ProfileResult:
         """
@@ -73,29 +92,62 @@ class AutoProfiler:
             column_profiles.append(col_profile)
             all_suggestions.extend(col_profile.suggested_rules)

+        # Compute aggregate quality score
+        scored_columns = [c for c in column_profiles if c.quality_score is not None]
+        overall_score: float | None = None
+        overall_grade: str | None = None
+        if scored_columns:
+            overall_score = sum(c.quality_score for c in scored_columns) / len(scored_columns)  # type: ignore[misc]
+            overall_grade = _score_to_grade(overall_score)
+
         return ProfileResult(
             source=dataset.source,
             row_count=dataset.row_count,
             column_count=dataset.column_count,
             columns=column_profiles,
             suggested_rules=all_suggestions,
+            overall_quality_score=overall_score,
+            overall_quality_grade=overall_grade,
         )

-    def _profile_column(self, col) -> ColumnProfile:
+    def _profile_column(self, col: Any) -> ColumnProfile:
         """Profile a single column."""
         # Get basic stats
         stats = col._get_stats()
         numeric_stats = col._get_numeric_stats()

         # Get sample values for pattern detection
-        sample_values = col.get_distinct_values(limit=self.PATTERN_SAMPLE_SIZE)
+        sample_values = col.get_distinct_values(limit=self.pattern_sample_size)

         # Generate suggestions
         suggestions = self._generate_suggestions(col, stats, numeric_stats, sample_values)

+        # Infer data type
+        inferred_dtype = self._infer_dtype(stats, sample_values)
+
+        # Quality scoring (requires numpy)
+        quality_score, quality_grade = self._compute_quality(sample_values, inferred_dtype)
+
+        # Deep profiling: distribution + outlier analysis (numeric columns only)
+        distribution_type = None
+        skewness = None
+        kurtosis = None
+        is_normal = None
+        outlier_count = None
+        outlier_percentage = None
+
+        if self.deep and numeric_stats.get("mean") is not None:
+            deep_results = self._deep_profile_numeric(col)
+            distribution_type = deep_results.get("distribution_type")
+            skewness = deep_results.get("skewness")
+            kurtosis = deep_results.get("kurtosis")
+            is_normal = deep_results.get("is_normal")
+            outlier_count = deep_results.get("outlier_count")
+            outlier_percentage = deep_results.get("outlier_percentage")
+
         return ColumnProfile(
             name=col.name,
-            dtype=self._infer_dtype(stats, sample_values),
+            dtype=inferred_dtype,
             null_count=stats.get("null_count", 0),
             null_percent=stats.get("null_percent", 0.0),
             unique_count=stats.get("unique_count", 0),
@@ -104,13 +156,85 @@ class AutoProfiler:
             max_value=stats.get("max_value"),
             mean_value=numeric_stats.get("mean"),
             stddev_value=numeric_stats.get("stddev"),
+            median_value=numeric_stats.get("median"),
+            p25_value=numeric_stats.get("p25"),
+            p75_value=numeric_stats.get("p75"),
             sample_values=sample_values[:10],
             suggested_rules=[s.rule for s in suggestions],
+            quality_score=quality_score,
+            quality_grade=quality_grade,
+            distribution_type=distribution_type,
+            skewness=skewness,
+            kurtosis=kurtosis,
+            is_normal=is_normal,
+            outlier_count=outlier_count,
+            outlier_percentage=outlier_percentage,
         )

+    def _compute_quality(
+        self, sample_values: list[Any], inferred_dtype: str
+    ) -> tuple[float | None, str | None]:
+        """Compute quality score and grade for a column using QualityScorer."""
+        try:
+            import numpy as np
+
+            from duckguard.profiler.quality_scorer import QualityScorer
+
+            if not sample_values:
+                return None, None
+
+            scorer = QualityScorer()
+            values_array = np.array(sample_values, dtype=object)
+            expected_type = _DTYPE_TO_EXPECTED_TYPE.get(inferred_dtype)
+            dimensions = scorer.calculate(values_array, expected_type=expected_type)
+            return dimensions.overall, dimensions.grade
+        except ImportError:
+            return None, None
+
+    def _deep_profile_numeric(self, col: Any) -> dict[str, Any]:
+        """Run deep profiling (distribution + outlier detection) on a numeric column."""
+        results: dict[str, Any] = {}
+        try:
+            import numpy as np
+
+            numeric_values = col._get_numeric_values(limit=10000)
+            if len(numeric_values) < 30:
+                return results
+
+            values_array = np.array(numeric_values, dtype=float)
+
+            # Distribution analysis (requires scipy)
+            try:
+                from duckguard.profiler.distribution_analyzer import DistributionAnalyzer
+
+                analyzer = DistributionAnalyzer()
+                analysis = analyzer.analyze(values_array)
+                results["distribution_type"] = analysis.best_fit_distribution
+                results["skewness"] = float(analysis.skewness)
+                results["kurtosis"] = float(analysis.kurtosis)
+                results["is_normal"] = analysis.is_normal
+            except (ImportError, ValueError):
+                pass
+
+            # Outlier detection (IQR method; works without scipy)
+            try:
+                from duckguard.profiler.outlier_detector import OutlierDetector
+
+                detector = OutlierDetector()
+                outlier_analysis = detector.detect(values_array, method="iqr")
+                results["outlier_count"] = outlier_analysis.outlier_count
+                results["outlier_percentage"] = outlier_analysis.outlier_percentage
+            except (ImportError, ValueError):
+                pass
+
+        except ImportError:
+            pass  # numpy not available
+
+        return results
+
     def _generate_suggestions(
         self,
-        col,
+        col: Any,
         stats: dict[str, Any],
         numeric_stats: dict[str, Any],
         sample_values: list[Any],
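Both new helpers follow a tiered optional-dependency pattern: each capability sits in its own try/except ImportError, so a missing numpy disables all deep statistics while a missing scipy only disables distribution fitting and leaves IQR outlier detection intact. A sketch of probing that tiering from user code (the helper name is hypothetical, not part of duckguard):

# Sketch: which deep-profiling capabilities are importable (hypothetical helper)
def deep_profiling_support() -> str:
    try:
        import numpy  # noqa: F401
    except ImportError:
        return "none (install numpy)"
    try:
        import scipy  # noqa: F401
    except ImportError:
        return "outliers only (install scipy for distribution fitting)"
    return "full (distribution fitting + outlier detection)"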
@@ -131,7 +255,7 @@ class AutoProfiler:
                     category="null",
                 )
             )
-        elif null_pct < self.NULL_THRESHOLD_SUGGEST:
+        elif null_pct < self.null_threshold:
             threshold = max(1, round(null_pct * 2))  # 2x buffer
             suggestions.append(
                 RuleSuggestion(
@@ -153,7 +277,7 @@ class AutoProfiler:
                     category="unique",
                 )
             )
-        elif unique_pct > self.UNIQUE_THRESHOLD_SUGGEST:
+        elif unique_pct > self.unique_threshold:
             suggestions.append(
                 RuleSuggestion(
                     rule=f"assert {var}.{col_name}.unique_percent > 99",
@@ -168,7 +292,12 @@ class AutoProfiler:
         min_val = stats.get("min_value")
         max_val = stats.get("max_value")

-        if min_val is not None and max_val is not None:
+        if (
+            min_val is not None
+            and max_val is not None
+            and isinstance(min_val, (int, float))
+            and isinstance(max_val, (int, float))
+        ):
             # Add buffer for range
             range_size = max_val - min_val
             buffer = range_size * 0.1 if range_size > 0 else 1
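The new isinstance guards are a genuine bug fix: min_value and max_value come straight from column stats, and for string or date-like columns the old code's max_val - min_val would fail. A two-line illustration of the failure mode (values made up):

# Why the guard matters: non-numeric columns can report string min/max
min_val, max_val = "2023-01-01", "2024-12-31"  # e.g. a date-like column stored as text
# max_val - min_val  # TypeError: unsupported operand type(s) for -: 'str' and 'str'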
@@ -186,7 +315,7 @@ class AutoProfiler:
             )

         # Non-negative check
-        if min_val is not None and min_val >= 0:
+        if min_val is not None and isinstance(min_val, (int, float)) and min_val >= 0:
             suggestions.append(
                 RuleSuggestion(
                     rule=f"assert {var}.{col_name}.min >= 0",
@@ -200,10 +329,10 @@ class AutoProfiler:
         unique_count = stats.get("unique_count", 0)
         total_count = stats.get("total_count", 0)

-        if 0 < unique_count <= self.ENUM_MAX_VALUES and total_count > unique_count * 2:
+        if 0 < unique_count <= self.enum_max_values and total_count > unique_count * 2:
             # Get all distinct values
-            distinct_values = col.get_distinct_values(limit=self.ENUM_MAX_VALUES + 1)
-            if len(distinct_values) <= self.ENUM_MAX_VALUES:
+            distinct_values = col.get_distinct_values(limit=self.enum_max_values + 1)
+            if len(distinct_values) <= self.enum_max_values:
                 # Format values for Python code
                 formatted_values = self._format_values(distinct_values)
                 suggestions.append(
@@ -215,39 +344,46 @@ class AutoProfiler:
                 )
             )

-        # 5. Pattern suggestions for string columns
+        # 5. Pattern suggestions for string columns (using PatternMatcher)
         string_values = [v for v in sample_values if isinstance(v, str)]
         if string_values:
-            detected_pattern = self._detect_pattern(string_values)
-            if detected_pattern:
-                pattern_name, pattern = detected_pattern
-                suggestions.append(
-                    RuleSuggestion(
-                        rule=f'assert {var}.{col_name}.matches(r"{pattern}")',
-                        confidence=0.75,
-                        reason=f"Values appear to be {pattern_name}",
-                        category="pattern",
-                    )
-                )
+            pattern_suggestion = self._detect_pattern_with_matcher(col_name, string_values)
+            if pattern_suggestion:
+                suggestions.append(pattern_suggestion)

         return suggestions

-    def _detect_pattern(self, values: list[str]) -> tuple[str, str] | None:
-        """Detect common patterns in string values."""
-        if not values:
-            return None
+    def _detect_pattern_with_matcher(
+        self, col_name: str, string_values: list[str]
+    ) -> RuleSuggestion | None:
+        """Detect patterns using the full PatternMatcher (25+ patterns)."""
+        var = self.dataset_var_name
+        try:
+            import numpy as np
+
+            from duckguard.profiler.pattern_matcher import PatternMatcher

-        # Sample for pattern detection
-        sample = values[: min(100, len(values))]
+            matcher = PatternMatcher()
+            values_array = np.array(string_values, dtype=object)
+            matches = matcher.detect_patterns(
+                values_array, min_confidence=self.pattern_min_confidence
+            )

-        for pattern_name, pattern in self.PATTERNS.items():
-            matches = sum(1 for v in sample if re.match(pattern, str(v), re.IGNORECASE))
-            match_rate = matches / len(sample)
+            if not matches:
+                return None

-            if match_rate > 0.9:  # 90% match threshold
-                return pattern_name, pattern
+            best_match = matches[0]
+            semantic_type = matcher.suggest_semantic_type(matches)
+            label = semantic_type or best_match.pattern_type

-        return None
+            return RuleSuggestion(
+                rule=f'assert {var}.{col_name}.matches(r"{best_match.pattern_regex}")',
+                confidence=best_match.confidence / 100.0,
+                reason=f"Values appear to be {label} ({best_match.confidence:.0f}% match)",
+                category="pattern",
+            )
+        except ImportError:
+            return None

     def _infer_dtype(self, stats: dict[str, Any], sample_values: list[Any]) -> str:
         """Infer the data type from statistics and samples."""
@@ -308,7 +444,7 @@ class AutoProfiler:
             Python code string for a test file
         """
         self.dataset_var_name = output_var
-        profile = self.profile(dataset)
+        result = self.profile(dataset)

         lines = [
             '"""Auto-generated data quality tests by DuckGuard."""',
@@ -325,7 +461,7 @@ class AutoProfiler:
         ]

         # Group suggestions by column
-        for col_profile in profile.columns:
+        for col_profile in result.columns:
             if col_profile.suggested_rules:
                 lines.append(f" # {col_profile.name} validations")
                 for rule in col_profile.suggested_rules:
@@ -335,16 +471,33 @@ class AutoProfiler:
         return "\n".join(lines)


-def profile(dataset: Dataset, dataset_var_name: str = "data") -> ProfileResult:
+def profile(
+    dataset: Dataset,
+    dataset_var_name: str = "data",
+    deep: bool = False,
+    null_threshold: float = 1.0,
+    unique_threshold: float = 99.0,
+    pattern_min_confidence: float = 90.0,
+) -> ProfileResult:
     """
     Convenience function to profile a dataset.

     Args:
         dataset: Dataset to profile
         dataset_var_name: Variable name for generated rules
+        deep: Enable deep profiling (distribution, outlier detection)
+        null_threshold: Suggest not_null rule if null percentage is below this
+        unique_threshold: Suggest unique rule if unique percentage is above this
+        pattern_min_confidence: Minimum confidence (0-100) for pattern matches

     Returns:
         ProfileResult
     """
-    profiler = AutoProfiler(dataset_var_name=dataset_var_name)
+    profiler = AutoProfiler(
+        dataset_var_name=dataset_var_name,
+        deep=deep,
+        null_threshold=null_threshold,
+        unique_threshold=unique_threshold,
+        pattern_min_confidence=pattern_min_confidence,
+    )
     return profiler.profile(dataset)
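End to end, the module-level profile() now forwards the same knobs, so a single call yields the new scoring and deep statistics. A closing usage sketch (`ds` is an illustrative Dataset; the printed values are made up):

# Sketch: one-call profiling with the 3.1.0 options
result = profile(ds, dataset_var_name="ds", deep=True, null_threshold=0.5)
print(result.overall_quality_score, result.overall_quality_grade)  # e.g. 92.3 A
for rule in result.suggested_rules[:5]:
    print(rule)  # ready-to-paste assert statements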