duckguard 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +372 -0
- duckguard/core/dataset.py +330 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/rules/executor.py +642 -0
- duckguard/rules/schema.py +31 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/METADATA +120 -1
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +21 -12
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data quality scoring for enhanced profiling in DuckGuard 3.0.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive data quality assessment across multiple dimensions:
|
|
5
|
+
- Completeness: Percentage of non-null values
|
|
6
|
+
- Validity: Conformance to expected patterns/types
|
|
7
|
+
- Consistency: Internal consistency and duplicate detection
|
|
8
|
+
- Accuracy: Statistical measures of correctness
|
|
9
|
+
- Overall quality score: Weighted combination of dimensions
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
>>> from duckguard.profiler.quality_scorer import QualityScorer
|
|
13
|
+
>>> scorer = QualityScorer()
|
|
14
|
+
>>> score = scorer.calculate(column_profile)
|
|
15
|
+
>>> print(f"Overall quality: {score.overall_score}/100")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class QualityDimensions:
    """Quality scores for one column across the four quality dimensions.

    Every dimension score and the overall score is a percentage in the
    range 0-100. Instances are produced by ``QualityScorer.calculate``;
    the ``*_details`` dicts carry the raw counts/rates that each
    dimension score was derived from.
    """

    completeness: float  # 0-100; share of non-null values
    validity: float  # 0-100; conformance to expected type/pattern
    consistency: float  # 0-100; uniqueness/duplicate-based score
    accuracy: float  # 0-100; outlier-based statistical score
    overall: float  # 0-100; weighted combination of the four dimensions

    # Detailed breakdowns (raw counts and rates backing each score)
    completeness_details: dict
    validity_details: dict
    consistency_details: dict
    accuracy_details: dict

    # Grade
    grade: str  # A, B, C, D, F — letter grade derived from `overall`
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class QualityScorer:
    """
    Calculates data quality scores across multiple dimensions.

    Each dimension (completeness, validity, consistency, accuracy) is
    scored on a 0-100 scale, combined into a weighted overall score, and
    mapped to a letter grade. Use :meth:`calculate` for the full
    assessment and :meth:`get_improvement_suggestions` to turn a result
    into actionable advice.
    """

    # Dimension weights (must sum to 1.0)
    WEIGHTS = {
        'completeness': 0.30,
        'validity': 0.30,
        'consistency': 0.20,
        'accuracy': 0.20,
    }

    # Grade thresholds: minimum overall score required for each letter grade
    GRADE_THRESHOLDS = {
        'A': 90.0,
        'B': 80.0,
        'C': 70.0,
        'D': 60.0,
        'F': 0.0,
    }

    def calculate(
        self,
        values: np.ndarray,
        expected_type: str | None = None,
        expected_pattern: str | None = None,
        allow_nulls: bool = True
    ) -> QualityDimensions:
        """
        Calculate comprehensive quality scores for a column.

        Args:
            values: Array of values to score
            expected_type: Expected data type ('int', 'float', 'string', 'date')
            expected_pattern: Expected regex pattern for string values
            allow_nulls: Whether nulls are acceptable

        Returns:
            QualityDimensions with all scores and details
        """
        # Calculate individual dimensions
        completeness_score, completeness_details = self._calculate_completeness(
            values, allow_nulls
        )
        validity_score, validity_details = self._calculate_validity(
            values, expected_type, expected_pattern
        )
        consistency_score, consistency_details = self._calculate_consistency(values)
        accuracy_score, accuracy_details = self._calculate_accuracy(values)

        # Weighted combination of the four dimension scores.
        overall_score = (
            completeness_score * self.WEIGHTS['completeness'] +
            validity_score * self.WEIGHTS['validity'] +
            consistency_score * self.WEIGHTS['consistency'] +
            accuracy_score * self.WEIGHTS['accuracy']
        )

        grade = self._calculate_grade(overall_score)

        return QualityDimensions(
            completeness=completeness_score,
            validity=validity_score,
            consistency=consistency_score,
            accuracy=accuracy_score,
            overall=overall_score,
            completeness_details=completeness_details,
            validity_details=validity_details,
            consistency_details=consistency_details,
            accuracy_details=accuracy_details,
            grade=grade
        )

    @staticmethod
    def _is_null(value) -> bool:
        """Return True when value counts as missing (None or a float NaN)."""
        return value is None or (isinstance(value, float) and np.isnan(value))

    def _drop_nulls(self, values) -> list:
        """Return the input values with None/NaN entries removed."""
        return [v for v in values if not self._is_null(v)]

    def _calculate_completeness(
        self,
        values: np.ndarray,
        allow_nulls: bool
    ) -> tuple[float, dict]:
        """
        Calculate completeness score.

        Completeness measures the percentage of non-null values.

        Args:
            values: Array of values
            allow_nulls: Whether nulls are acceptable

        Returns:
            Tuple of (score, details)
        """
        total_count = len(values)
        null_count = sum(1 for v in values if self._is_null(v))
        non_null_count = total_count - null_count
        null_percentage = (null_count / total_count) * 100 if total_count > 0 else 0

        if allow_nulls:
            # Nulls allowed: score is simply the non-null percentage.
            score = (non_null_count / total_count) * 100 if total_count > 0 else 0
        elif null_count == 0:
            score = 100.0
        else:
            # Nulls violate the contract here, so each percent of nulls
            # costs two points (double penalty), floored at zero.
            score = max(0, 100 - (null_percentage * 2))

        details = {
            'total_count': total_count,
            'non_null_count': non_null_count,
            'null_count': null_count,
            'null_percentage': null_percentage,
            'nulls_allowed': allow_nulls,
        }

        return score, details

    def _calculate_validity(
        self,
        values: np.ndarray,
        expected_type: str | None,
        expected_pattern: str | None
    ) -> tuple[float, dict]:
        """
        Calculate validity score.

        Validity measures conformance to expected types and patterns.
        When both a type and a pattern are supplied, a value must satisfy
        both to count as conforming.

        Args:
            values: Array of values
            expected_type: Expected data type
            expected_pattern: Expected regex pattern

        Returns:
            Tuple of (score, details)
        """
        if expected_type is None and expected_pattern is None:
            # No expectations defined, assume valid
            return 100.0, {'note': 'No expectations defined'}

        # BUG FIX: the previous implementation only stripped nulls when the
        # input exposed an `.isna` attribute (never true for numpy arrays),
        # so None/NaN leaked into the conformance loop while being excluded
        # from the denominator — scores above 100 were possible. Filter
        # explicitly, matching the other dimension calculations.
        valid_values = self._drop_nulls(values)
        valid_count = len(valid_values)

        if valid_count == 0:
            return 0.0, {'valid_count': 0, 'total_count': 0}

        import re
        pattern = re.compile(expected_pattern) if expected_pattern else None

        conforming_count = 0
        for v in valid_values:
            # A value conforms only if it passes every supplied expectation.
            if expected_type and not self._check_type_conformance(v, expected_type):
                continue
            if pattern and not pattern.match(str(v)):
                continue
            conforming_count += 1

        score = (conforming_count / valid_count) * 100

        details = {
            'valid_count': valid_count,
            'conforming_count': conforming_count,
            'conformance_rate': score,
            'expected_type': expected_type,
            'expected_pattern': expected_pattern,
        }

        return score, details

    def _calculate_consistency(
        self,
        values: np.ndarray
    ) -> tuple[float, dict]:
        """
        Calculate consistency score.

        Consistency measures internal consistency and duplicate detection.

        Args:
            values: Array of values

        Returns:
            Tuple of (score, details)
        """
        valid_values = self._drop_nulls(values)

        if len(valid_values) == 0:
            return 0.0, {'note': 'No valid values'}

        unique_count = len(set(valid_values))
        total_count = len(valid_values)
        uniqueness_rate = (unique_count / total_count) * 100

        duplicate_count = total_count - unique_count
        duplicate_rate = (duplicate_count / total_count) * 100

        # Few distinct values suggests categorical data, where heavy
        # repetition is expected rather than suspicious.
        if unique_count <= 10:
            score = 100.0 if duplicate_rate < 50 else 80.0
        else:
            # Continuous-looking data: penalize excessive duplicates,
            # but never below 50.
            score = max(50, 100 - duplicate_rate)

        details = {
            'total_count': total_count,
            'unique_count': unique_count,
            'duplicate_count': duplicate_count,
            'uniqueness_rate': uniqueness_rate,
            'duplicate_rate': duplicate_rate,
        }

        return score, details

    def _calculate_accuracy(
        self,
        values: np.ndarray
    ) -> tuple[float, dict]:
        """
        Calculate accuracy score.

        Accuracy measures statistical validity via IQR outlier detection.
        Non-numeric values are simply skipped.

        Args:
            values: Array of values

        Returns:
            Tuple of (score, details)
        """
        # Collect everything coercible to float for statistical analysis.
        numeric_values = []
        for v in values:
            if self._is_null(v):
                continue
            try:
                numeric_values.append(float(v))
            except (ValueError, TypeError):
                pass

        if len(numeric_values) < 3:
            # Too little numeric data to judge; don't penalize.
            return 100.0, {'note': 'Insufficient numeric data for accuracy assessment'}

        numeric_array = np.array(numeric_values)

        # Tukey's fences: anything beyond 1.5 * IQR from the quartiles
        # counts as an outlier.
        Q1 = np.percentile(numeric_array, 25)
        Q3 = np.percentile(numeric_array, 75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers = numeric_array[(numeric_array < lower_bound) | (numeric_array > upper_bound)]
        outlier_count = len(outliers)
        outlier_rate = (outlier_count / len(numeric_array)) * 100

        # Step the score down in 10-point bands: <5% outliers is excellent,
        # >20% falls to a rate-based penalty floored at 50.
        if outlier_rate < 5:
            score = 100.0
        elif outlier_rate < 10:
            score = 90.0
        elif outlier_rate < 15:
            score = 80.0
        elif outlier_rate < 20:
            score = 70.0
        else:
            score = max(50, 100 - outlier_rate * 2)

        details = {
            'numeric_count': len(numeric_array),
            'outlier_count': outlier_count,
            'outlier_rate': outlier_rate,
            'Q1': Q1,
            'Q3': Q3,
            'IQR': IQR,
        }

        return score, details

    def _check_type_conformance(self, value, expected_type: str) -> bool:
        """Check if value conforms to expected type.

        Note: 'int' and 'float' use coercion semantics — e.g. the float
        3.0 conforms to 'int', and numeric strings conform to 'float'.
        Unknown type names conform to nothing.
        """
        if expected_type == 'int':
            try:
                int(value)
                return True
            except (ValueError, TypeError):
                return False

        elif expected_type == 'float':
            try:
                float(value)
                return True
            except (ValueError, TypeError):
                return False

        elif expected_type == 'string':
            return isinstance(value, str)

        elif expected_type == 'date':
            # Basic ISO-prefix check (would need proper parsing in production)
            import re
            date_pattern = r'\d{4}-\d{2}-\d{2}'
            return bool(re.match(date_pattern, str(value)))

        return False

    def _calculate_grade(self, overall_score: float) -> str:
        """Calculate letter grade from overall score.

        Thresholds are sorted descending so correctness no longer depends
        on GRADE_THRESHOLDS' insertion order.
        """
        for grade, threshold in sorted(
            self.GRADE_THRESHOLDS.items(), key=lambda item: item[1], reverse=True
        ):
            if overall_score >= threshold:
                return grade
        return 'F'

    def get_improvement_suggestions(self, dimensions: QualityDimensions) -> list[str]:
        """
        Get suggestions for improving data quality.

        Args:
            dimensions: Quality dimensions

        Returns:
            List of improvement suggestions (never empty; a reassurance
            message is returned when every dimension scores >= 80)
        """
        suggestions = []

        # Completeness suggestions
        if dimensions.completeness < 80:
            null_pct = dimensions.completeness_details.get('null_percentage', 0)
            suggestions.append(
                f"Completeness: {null_pct:.1f}% null values detected. "
                "Review data collection process to reduce missing data."
            )

        # Validity suggestions
        if dimensions.validity < 80:
            conformance = dimensions.validity_details.get('conformance_rate', 0)
            suggestions.append(
                f"Validity: Only {conformance:.1f}% of values conform to expectations. "
                "Add validation checks at data ingestion."
            )

        # Consistency suggestions
        if dimensions.consistency < 80:
            duplicate_rate = dimensions.consistency_details.get('duplicate_rate', 0)
            if duplicate_rate > 20:
                suggestions.append(
                    f"Consistency: {duplicate_rate:.1f}% duplicate values. "
                    "Review for data entry errors or implement deduplication."
                )

        # Accuracy suggestions
        if dimensions.accuracy < 80:
            outlier_rate = dimensions.accuracy_details.get('outlier_rate', 0)
            if outlier_rate > 10:
                suggestions.append(
                    f"Accuracy: {outlier_rate:.1f}% outliers detected. "
                    "Investigate outliers and implement range checks."
                )

        if not suggestions:
            suggestions.append(
                f"Quality grade {dimensions.grade}: Data quality is acceptable. "
                "Continue monitoring for any degradation."
            )

        return suggestions
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
# Import pandas if available (gracefully handle if not)
|
|
442
|
+
try:
|
|
443
|
+
import pandas as pd
|
|
444
|
+
except ImportError:
|
|
445
|
+
pd = None
|