duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,445 @@
+ """
+ Data quality scoring for enhanced profiling in DuckGuard 3.0.
+
+ This module provides comprehensive data quality assessment across multiple dimensions:
+ - Completeness: Percentage of non-null values
+ - Validity: Conformance to expected patterns/types
+ - Consistency: Internal consistency and duplicate detection
+ - Accuracy: Statistical measures of correctness
+ - Overall quality score: Weighted combination of dimensions
+
+ Example:
+     >>> from duckguard.profiler.quality_scorer import QualityScorer
+     >>> scorer = QualityScorer()
+     >>> score = scorer.calculate(column_values)  # array of the column's values
+     >>> print(f"Overall quality: {score.overall}/100")
+ """
+
+ from dataclasses import dataclass
+
+ import numpy as np
+
+
+ @dataclass
+ class QualityDimensions:
+     """Quality scores across different dimensions."""
+
+     completeness: float  # 0-100
+     validity: float  # 0-100
+     consistency: float  # 0-100
+     accuracy: float  # 0-100
+     overall: float  # 0-100
+
+     # Detailed breakdowns
+     completeness_details: dict
+     validity_details: dict
+     consistency_details: dict
+     accuracy_details: dict
+
+     # Grade
+     grade: str  # A, B, C, D, F
+
+
+ class QualityScorer:
+     """
+     Calculates data quality scores across multiple dimensions.
+
+     Provides both overall scores and dimensional breakdowns with
+     actionable insights for improvement.
+     """
+
+     # Dimension weights (must sum to 1.0)
+     WEIGHTS = {
+         'completeness': 0.30,
+         'validity': 0.30,
+         'consistency': 0.20,
+         'accuracy': 0.20,
+     }
+
+     # Grade thresholds
+     GRADE_THRESHOLDS = {
+         'A': 90.0,
+         'B': 80.0,
+         'C': 70.0,
+         'D': 60.0,
+         'F': 0.0,
+     }
+
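For orientation, a small standalone sketch of how the weights and grade thresholds above combine into an overall score; the dimension scores here are made-up numbers, not anything computed by the package:

    # Hypothetical dimension scores, weighted the same way as WEIGHTS above
    weights = {'completeness': 0.30, 'validity': 0.30, 'consistency': 0.20, 'accuracy': 0.20}
    scores = {'completeness': 95.0, 'validity': 88.0, 'consistency': 70.0, 'accuracy': 60.0}

    overall = sum(scores[d] * w for d, w in weights.items())
    # 95*0.30 + 88*0.30 + 70*0.20 + 60*0.20 ~= 80.9

    thresholds = {'A': 90.0, 'B': 80.0, 'C': 70.0, 'D': 60.0, 'F': 0.0}
    grade = next(g for g, t in thresholds.items() if overall >= t)
    print(round(overall, 1), grade)  # 80.9 B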
+     def calculate(
+         self,
+         values: np.ndarray,
+         expected_type: str | None = None,
+         expected_pattern: str | None = None,
+         allow_nulls: bool = True
+     ) -> QualityDimensions:
+         """
+         Calculate comprehensive quality scores for a column.
+
+         Args:
+             values: Array of values to score
+             expected_type: Expected data type ('int', 'float', 'string', 'date')
+             expected_pattern: Expected regex pattern for string values
+             allow_nulls: Whether nulls are acceptable
+
+         Returns:
+             QualityDimensions with all scores and details
+         """
+         # Calculate individual dimensions
+         completeness_score, completeness_details = self._calculate_completeness(
+             values, allow_nulls
+         )
+
+         validity_score, validity_details = self._calculate_validity(
+             values, expected_type, expected_pattern
+         )
+
+         consistency_score, consistency_details = self._calculate_consistency(
+             values
+         )
+
+         accuracy_score, accuracy_details = self._calculate_accuracy(
+             values
+         )
+
+         # Calculate weighted overall score
+         overall_score = (
+             completeness_score * self.WEIGHTS['completeness'] +
+             validity_score * self.WEIGHTS['validity'] +
+             consistency_score * self.WEIGHTS['consistency'] +
+             accuracy_score * self.WEIGHTS['accuracy']
+         )
+
+         # Determine grade
+         grade = self._calculate_grade(overall_score)
+
+         return QualityDimensions(
+             completeness=completeness_score,
+             validity=validity_score,
+             consistency=consistency_score,
+             accuracy=accuracy_score,
+             overall=overall_score,
+             completeness_details=completeness_details,
+             validity_details=validity_details,
+             consistency_details=consistency_details,
+             accuracy_details=accuracy_details,
+             grade=grade
+         )
+
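Based on the signature above, a minimal usage sketch; the sample array and the expected_type choice are invented for illustration:

    import numpy as np
    from duckguard.profiler.quality_scorer import QualityScorer

    column_values = np.array([1.0, 2.0, 2.0, np.nan, 5.0], dtype=object)
    result = QualityScorer().calculate(column_values, expected_type='float', allow_nulls=True)

    # One null out of five values -> completeness 80; with these sample values
    # the weighted overall works out to 84.0, i.e. grade 'B'
    print(result.grade, round(result.overall, 1), result.completeness_details['null_count'])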
+     def _calculate_completeness(
+         self,
+         values: np.ndarray,
+         allow_nulls: bool
+     ) -> tuple[float, dict]:
+         """
+         Calculate completeness score.
+
+         Completeness measures the percentage of non-null values.
+
+         Args:
+             values: Array of values
+             allow_nulls: Whether nulls are acceptable
+
+         Returns:
+             Tuple of (score, details)
+         """
+         total_count = len(values)
+
+         # Count nulls (NaN for numeric, None for others)
+         null_count = 0
+         for v in values:
+             if v is None or (isinstance(v, float) and np.isnan(v)):
+                 null_count += 1
+
+         non_null_count = total_count - null_count
+         null_percentage = (null_count / total_count) * 100 if total_count > 0 else 0
+
+         if allow_nulls:
+             # If nulls allowed, score based on percentage present
+             # 100% = perfect, 0% = worst
+             score = (non_null_count / total_count) * 100 if total_count > 0 else 0
+         else:
+             # If nulls not allowed, any null drops score significantly
+             if null_count == 0:
+                 score = 100.0
+             else:
+                 score = max(0, 100 - (null_percentage * 2))  # Double penalty
+
+         details = {
+             'total_count': total_count,
+             'non_null_count': non_null_count,
+             'null_count': null_count,
+             'null_percentage': null_percentage,
+             'nulls_allowed': allow_nulls,
+         }
+
+         return score, details
+
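A tiny standalone illustration of the allow_nulls branching above; the counts are hypothetical:

    total, nulls = 100, 10
    null_pct = nulls / total * 100                        # 10.0

    score_nulls_allowed = (total - nulls) / total * 100   # 90.0
    score_nulls_forbidden = max(0, 100 - null_pct * 2)    # 80.0 -- the double penalty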
+     def _calculate_validity(
+         self,
+         values: np.ndarray,
+         expected_type: str | None,
+         expected_pattern: str | None
+     ) -> tuple[float, dict]:
+         """
+         Calculate validity score.
+
+         Validity measures conformance to expected types and patterns.
+
+         Args:
+             values: Array of values
+             expected_type: Expected data type
+             expected_pattern: Expected regex pattern
+
+         Returns:
+             Tuple of (score, details)
+         """
+         if expected_type is None and expected_pattern is None:
+             # No expectations defined, assume valid
+             return 100.0, {'note': 'No expectations defined'}
+
+         # pandas (when available) filters both None and NaN in one pass
+         valid_values = values[~pd.isna(values)] if pd is not None else values
+         valid_count = len([v for v in valid_values if v is not None])
+
+         if valid_count == 0:
+             return 0.0, {'valid_count': 0, 'total_count': 0}
+
+         conforming_count = 0
+
+         # Check type conformance
+         if expected_type:
+             for v in valid_values:
+                 if self._check_type_conformance(v, expected_type):
+                     conforming_count += 1
+
+         # Check pattern conformance (only when no expected_type is given)
+         elif expected_pattern:
+             import re
+             pattern = re.compile(expected_pattern)
+             for v in valid_values:
+                 if pattern.match(str(v)):
+                     conforming_count += 1
+
+         score = (conforming_count / valid_count) * 100 if valid_count > 0 else 0
+
+         details = {
+             'valid_count': valid_count,
+             'conforming_count': conforming_count,
+             'conformance_rate': score,
+             'expected_type': expected_type,
+             'expected_pattern': expected_pattern,
+         }
+
+         return score, details
+
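A small self-contained sketch of the pattern branch above; the sample IDs and the regex are invented:

    import re

    values = ['A-100', 'A-205', 'broken', None]
    pattern = re.compile(r'[A-Z]-\d{3}')

    non_null = [v for v in values if v is not None]
    conforming = sum(1 for v in non_null if pattern.match(str(v)))
    validity = conforming / len(non_null) * 100   # 2 of 3 conform -> ~66.7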
+     def _calculate_consistency(
+         self,
+         values: np.ndarray
+     ) -> tuple[float, dict]:
+         """
+         Calculate consistency score.
+
+         Consistency measures internal consistency and duplicate detection.
+
+         Args:
+             values: Array of values
+
+         Returns:
+             Tuple of (score, details)
+         """
+         # Remove nulls
+         valid_values = []
+         for v in values:
+             if v is not None and not (isinstance(v, float) and np.isnan(v)):
+                 valid_values.append(v)
+
+         if len(valid_values) == 0:
+             return 0.0, {'note': 'No valid values'}
+
+         # Calculate uniqueness rate
+         unique_count = len(set(valid_values))
+         total_count = len(valid_values)
+         uniqueness_rate = (unique_count / total_count) * 100
+
+         # Calculate duplicate rate
+         duplicate_count = total_count - unique_count
+         duplicate_rate = (duplicate_count / total_count) * 100
+
+         # Score based on uniqueness (higher uniqueness = more consistent)
+         # But also consider if low uniqueness is expected (categorical data)
+         if unique_count <= 10:
+             # Likely categorical - low uniqueness is ok
+             score = 100.0 if duplicate_rate < 50 else 80.0
+         else:
+             # Continuous data - penalize excessive duplicates
+             score = max(50, 100 - duplicate_rate)
+
+         details = {
+             'total_count': total_count,
+             'unique_count': unique_count,
+             'duplicate_count': duplicate_count,
+             'uniqueness_rate': uniqueness_rate,
+             'duplicate_rate': duplicate_rate,
+         }
+
+         return score, details
+
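A quick standalone trace of the categorical heuristic above, using made-up status values:

    values = ['active', 'inactive', 'active', 'active', 'pending']

    unique_count = len(set(values))                                     # 3 -> treated as categorical (<= 10)
    duplicate_rate = (len(values) - unique_count) / len(values) * 100   # 40.0
    score = 100.0 if duplicate_rate < 50 else 80.0                      # 100.0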
+     def _calculate_accuracy(
+         self,
+         values: np.ndarray
+     ) -> tuple[float, dict]:
+         """
+         Calculate accuracy score.
+
+         Accuracy measures statistical validity and outlier detection.
+
+         Args:
+             values: Array of values
+
+         Returns:
+             Tuple of (score, details)
+         """
+         # Try to convert to numeric for statistical analysis
+         numeric_values = []
+         for v in values:
+             try:
+                 if v is not None and not (isinstance(v, float) and np.isnan(v)):
+                     numeric_values.append(float(v))
+             except (ValueError, TypeError):
+                 pass
+
+         if len(numeric_values) < 3:
+             # Not enough numeric data
+             return 100.0, {'note': 'Insufficient numeric data for accuracy assessment'}
+
+         numeric_array = np.array(numeric_values)
+
+         # Check for outliers using IQR method
+         Q1 = np.percentile(numeric_array, 25)
+         Q3 = np.percentile(numeric_array, 75)
+         IQR = Q3 - Q1
+
+         lower_bound = Q1 - 1.5 * IQR
+         upper_bound = Q3 + 1.5 * IQR
+
+         outliers = numeric_array[(numeric_array < lower_bound) | (numeric_array > upper_bound)]
+         outlier_count = len(outliers)
+         outlier_rate = (outlier_count / len(numeric_array)) * 100
+
+         # Score based on outlier rate
+         # < 5% outliers = excellent, > 20% = poor
+         if outlier_rate < 5:
+             score = 100.0
+         elif outlier_rate < 10:
+             score = 90.0
+         elif outlier_rate < 15:
+             score = 80.0
+         elif outlier_rate < 20:
+             score = 70.0
+         else:
+             score = max(50, 100 - outlier_rate * 2)
+
+         details = {
+             'numeric_count': len(numeric_array),
+             'outlier_count': outlier_count,
+             'outlier_rate': outlier_rate,
+             'Q1': Q1,
+             'Q3': Q3,
+             'IQR': IQR,
+         }
+
+         return score, details
+
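The same IQR rule, traced on a small made-up sample:

    import numpy as np

    data = np.array([10.0, 12.0, 11.0, 13.0, 12.0, 11.0, 95.0])
    q1, q3 = np.percentile(data, [25, 75])          # 11.0, 12.5
    iqr = q3 - q1                                   # 1.5
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # 8.75, 14.75

    outliers = data[(data < lower) | (data > upper)]
    # 95.0 is flagged: 1 of 7 values (~14.3%) -> score 80.0 under the thresholds above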
+     def _check_type_conformance(self, value, expected_type: str) -> bool:
+         """Check if value conforms to expected type."""
+         if expected_type == 'int':
+             try:
+                 int(value)
+                 return True
+             except (ValueError, TypeError):
+                 return False
+
+         elif expected_type == 'float':
+             try:
+                 float(value)
+                 return True
+             except (ValueError, TypeError):
+                 return False
+
+         elif expected_type == 'string':
+             return isinstance(value, str)
+
+         elif expected_type == 'date':
+             # Basic date check (would need proper parsing in production)
+             import re
+             date_pattern = r'\d{4}-\d{2}-\d{2}'
+             return bool(re.match(date_pattern, str(value)))
+
+         return False
+
+     def _calculate_grade(self, overall_score: float) -> str:
+         """Calculate letter grade from overall score."""
+         for grade, threshold in self.GRADE_THRESHOLDS.items():
+             if overall_score >= threshold:
+                 return grade
+         return 'F'
+
+     def get_improvement_suggestions(self, dimensions: QualityDimensions) -> list[str]:
+         """
+         Get suggestions for improving data quality.
+
+         Args:
+             dimensions: Quality dimensions
+
+         Returns:
+             List of improvement suggestions
+         """
+         suggestions = []
+
+         # Completeness suggestions
+         if dimensions.completeness < 80:
+             null_pct = dimensions.completeness_details.get('null_percentage', 0)
+             suggestions.append(
+                 f"Completeness: {null_pct:.1f}% null values detected. "
+                 "Review data collection process to reduce missing data."
+             )
+
+         # Validity suggestions
+         if dimensions.validity < 80:
+             conformance = dimensions.validity_details.get('conformance_rate', 0)
+             suggestions.append(
+                 f"Validity: Only {conformance:.1f}% of values conform to expectations. "
+                 "Add validation checks at data ingestion."
+             )
+
+         # Consistency suggestions
+         if dimensions.consistency < 80:
+             duplicate_rate = dimensions.consistency_details.get('duplicate_rate', 0)
+             if duplicate_rate > 20:
+                 suggestions.append(
+                     f"Consistency: {duplicate_rate:.1f}% duplicate values. "
+                     "Review for data entry errors or implement deduplication."
+                 )
+
+         # Accuracy suggestions
+         if dimensions.accuracy < 80:
+             outlier_rate = dimensions.accuracy_details.get('outlier_rate', 0)
+             if outlier_rate > 10:
+                 suggestions.append(
+                     f"Accuracy: {outlier_rate:.1f}% outliers detected. "
+                     "Investigate outliers and implement range checks."
+                 )
+
+         if not suggestions:
+             suggestions.append(
+                 f"Quality grade {dimensions.grade}: Data quality is acceptable. "
+                 "Continue monitoring for any degradation."
+             )
+
+         return suggestions
+
+
+ # Import pandas if available (gracefully handle if not)
+ try:
+     import pandas as pd
+ except ImportError:
+     pd = None
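An end-to-end sketch combining calculate() with get_improvement_suggestions(); the sample column is invented:

    import numpy as np
    from duckguard.profiler.quality_scorer import QualityScorer

    scorer = QualityScorer()
    dims = scorer.calculate(np.array([1, None, None, 4, 4], dtype=object))
    for tip in scorer.get_improvement_suggestions(dims):
        print(f"- {tip}")
    # With 40% nulls, only the completeness suggestion fires for this sample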