duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/methods.py +47 -0
- duckguard/anomaly/ml_methods.py +146 -21
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +372 -0
- duckguard/core/dataset.py +330 -0
- duckguard/core/result.py +5 -0
- duckguard/notifications/email.py +9 -0
- duckguard/notifications/notifiers.py +39 -1
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/rules/executor.py +642 -0
- duckguard/rules/schema.py +31 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/METADATA +120 -1
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/RECORD +26 -17
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/WHEEL +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/entry_points.txt +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Distribution analysis for enhanced profiling in DuckGuard 3.0.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive distribution analysis including:
|
|
5
|
+
- Distribution fitting (normal, uniform, exponential, etc.)
|
|
6
|
+
- Histogram generation
|
|
7
|
+
- Statistical moments (kurtosis, skewness)
|
|
8
|
+
- Best-fit distribution identification
|
|
9
|
+
|
|
10
|
+
Requirements:
|
|
11
|
+
- scipy>=1.11.0 for distribution fitting and tests
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
>>> from duckguard.profiler.distribution_analyzer import DistributionAnalyzer
|
|
15
|
+
>>> analyzer = DistributionAnalyzer()
|
|
16
|
+
>>> analysis = analyzer.analyze(column_values)
|
|
17
|
+
>>> print(f"Best fit: {analysis.best_fit_distribution}")
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class DistributionAnalysis:
    """Results of distribution analysis.

    Produced by ``DistributionAnalyzer.analyze``; all statistics are computed
    over the non-null (non-NaN) samples only.
    """

    # Basic statistics
    mean: float
    std: float  # sample standard deviation (ddof=1)
    min: float
    max: float
    median: float

    # Distribution shape
    kurtosis: float  # excess kurtosis (0 for a normal distribution)
    skewness: float

    # Histogram
    histogram_bins: list[tuple[float, float, int]]  # (lower, upper, count)

    # Distribution tests (Kolmogorov-Smirnov at significance level 0.05)
    is_normal: bool
    is_uniform: bool
    normality_pvalue: float
    uniformity_pvalue: float

    # Best fit (KS statistic; lower score means a better fit)
    best_fit_distribution: str  # scipy.stats distribution name, e.g. 'norm'
    best_fit_params: dict  # fitted parameters keyed 'loc'/'scale'/'shape'
    best_fit_score: float

    # Sample info
    sample_count: int  # number of non-NaN values analyzed
    null_count: int  # number of NaN values excluded
57
|
+
|
|
58
|
+
|
|
59
|
+
class DistributionAnalyzer:
|
|
60
|
+
"""
|
|
61
|
+
Analyzes the distribution of numerical data.
|
|
62
|
+
|
|
63
|
+
Provides comprehensive statistical analysis including distribution fitting,
|
|
64
|
+
moment calculation, and hypothesis testing.
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
MIN_SAMPLES = 30
|
|
68
|
+
SUPPORTED_DISTRIBUTIONS = [
|
|
69
|
+
'norm', # Normal/Gaussian
|
|
70
|
+
'uniform', # Uniform
|
|
71
|
+
'expon', # Exponential
|
|
72
|
+
'gamma', # Gamma
|
|
73
|
+
'lognorm', # Log-normal
|
|
74
|
+
'beta', # Beta
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
def __init__(self):
|
|
78
|
+
"""Initialize the distribution analyzer."""
|
|
79
|
+
self._scipy_available = self._check_scipy()
|
|
80
|
+
|
|
81
|
+
def _check_scipy(self) -> bool:
|
|
82
|
+
"""Check if scipy is available."""
|
|
83
|
+
try:
|
|
84
|
+
import scipy.stats
|
|
85
|
+
return True
|
|
86
|
+
except ImportError:
|
|
87
|
+
return False
|
|
88
|
+
|
|
89
|
+
def analyze(self, values: np.ndarray, num_bins: int = 20) -> DistributionAnalysis:
|
|
90
|
+
"""
|
|
91
|
+
Perform comprehensive distribution analysis.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
values: Array of numeric values (may contain NaN)
|
|
95
|
+
num_bins: Number of histogram bins (default 20)
|
|
96
|
+
|
|
97
|
+
Returns:
|
|
98
|
+
DistributionAnalysis with complete statistical analysis
|
|
99
|
+
|
|
100
|
+
Raises:
|
|
101
|
+
ImportError: If scipy is not available
|
|
102
|
+
ValueError: If insufficient valid samples
|
|
103
|
+
"""
|
|
104
|
+
if not self._scipy_available:
|
|
105
|
+
raise ImportError(
|
|
106
|
+
"scipy is required for distribution analysis. "
|
|
107
|
+
"Install with: pip install 'duckguard[statistics]'"
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
import scipy.stats as stats
|
|
111
|
+
|
|
112
|
+
# Separate nulls from valid values
|
|
113
|
+
null_count = np.sum(np.isnan(values))
|
|
114
|
+
valid_values = values[~np.isnan(values)]
|
|
115
|
+
|
|
116
|
+
if len(valid_values) < self.MIN_SAMPLES:
|
|
117
|
+
raise ValueError(
|
|
118
|
+
f"Insufficient samples for distribution analysis: {len(valid_values)} "
|
|
119
|
+
f"(minimum {self.MIN_SAMPLES})"
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# Calculate basic statistics
|
|
123
|
+
mean = np.mean(valid_values)
|
|
124
|
+
std = np.std(valid_values, ddof=1)
|
|
125
|
+
min_val = np.min(valid_values)
|
|
126
|
+
max_val = np.max(valid_values)
|
|
127
|
+
median = np.median(valid_values)
|
|
128
|
+
|
|
129
|
+
# Calculate moments
|
|
130
|
+
kurtosis = stats.kurtosis(valid_values)
|
|
131
|
+
skewness = stats.skew(valid_values)
|
|
132
|
+
|
|
133
|
+
# Generate histogram
|
|
134
|
+
hist, bin_edges = np.histogram(valid_values, bins=num_bins)
|
|
135
|
+
histogram_bins = [
|
|
136
|
+
(float(bin_edges[i]), float(bin_edges[i+1]), int(hist[i]))
|
|
137
|
+
for i in range(len(hist))
|
|
138
|
+
]
|
|
139
|
+
|
|
140
|
+
# Test for normality
|
|
141
|
+
is_normal, normality_pvalue = self._test_normality(valid_values)
|
|
142
|
+
|
|
143
|
+
# Test for uniformity
|
|
144
|
+
is_uniform, uniformity_pvalue = self._test_uniformity(valid_values)
|
|
145
|
+
|
|
146
|
+
# Find best fit distribution
|
|
147
|
+
best_fit_dist, best_fit_params, best_fit_score = self._find_best_fit(valid_values)
|
|
148
|
+
|
|
149
|
+
return DistributionAnalysis(
|
|
150
|
+
mean=mean,
|
|
151
|
+
std=std,
|
|
152
|
+
min=min_val,
|
|
153
|
+
max=max_val,
|
|
154
|
+
median=median,
|
|
155
|
+
kurtosis=kurtosis,
|
|
156
|
+
skewness=skewness,
|
|
157
|
+
histogram_bins=histogram_bins,
|
|
158
|
+
is_normal=is_normal,
|
|
159
|
+
is_uniform=is_uniform,
|
|
160
|
+
normality_pvalue=normality_pvalue,
|
|
161
|
+
uniformity_pvalue=uniformity_pvalue,
|
|
162
|
+
best_fit_distribution=best_fit_dist,
|
|
163
|
+
best_fit_params=best_fit_params,
|
|
164
|
+
best_fit_score=best_fit_score,
|
|
165
|
+
sample_count=len(valid_values),
|
|
166
|
+
null_count=null_count,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
def _test_normality(self, values: np.ndarray, alpha: float = 0.05) -> tuple[bool, float]:
|
|
170
|
+
"""
|
|
171
|
+
Test if data follows normal distribution using Kolmogorov-Smirnov test.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
values: Array of values to test
|
|
175
|
+
alpha: Significance level (default 0.05)
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
Tuple of (is_normal, p_value)
|
|
179
|
+
"""
|
|
180
|
+
import scipy.stats as stats
|
|
181
|
+
|
|
182
|
+
# Normalize values
|
|
183
|
+
if np.std(values) > 0:
|
|
184
|
+
normalized = (values - np.mean(values)) / np.std(values)
|
|
185
|
+
statistic, pvalue = stats.kstest(normalized, 'norm')
|
|
186
|
+
is_normal = pvalue > alpha
|
|
187
|
+
return is_normal, pvalue
|
|
188
|
+
else:
|
|
189
|
+
# Zero variance - not normal
|
|
190
|
+
return False, 0.0
|
|
191
|
+
|
|
192
|
+
def _test_uniformity(self, values: np.ndarray, alpha: float = 0.05) -> tuple[bool, float]:
|
|
193
|
+
"""
|
|
194
|
+
Test if data follows uniform distribution using Kolmogorov-Smirnov test.
|
|
195
|
+
|
|
196
|
+
Args:
|
|
197
|
+
values: Array of values to test
|
|
198
|
+
alpha: Significance level (default 0.05)
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
Tuple of (is_uniform, p_value)
|
|
202
|
+
"""
|
|
203
|
+
import scipy.stats as stats
|
|
204
|
+
|
|
205
|
+
# Scale to [0, 1]
|
|
206
|
+
min_val = np.min(values)
|
|
207
|
+
max_val = np.max(values)
|
|
208
|
+
|
|
209
|
+
if min_val != max_val:
|
|
210
|
+
scaled = (values - min_val) / (max_val - min_val)
|
|
211
|
+
statistic, pvalue = stats.kstest(scaled, 'uniform')
|
|
212
|
+
is_uniform = pvalue > alpha
|
|
213
|
+
return is_uniform, pvalue
|
|
214
|
+
else:
|
|
215
|
+
# Constant values - not uniform
|
|
216
|
+
return False, 0.0
|
|
217
|
+
|
|
218
|
+
def _find_best_fit(self, values: np.ndarray) -> tuple[str, dict, float]:
|
|
219
|
+
"""
|
|
220
|
+
Find the best-fitting distribution from supported distributions.
|
|
221
|
+
|
|
222
|
+
Uses Kolmogorov-Smirnov test to measure goodness of fit.
|
|
223
|
+
|
|
224
|
+
Args:
|
|
225
|
+
values: Array of values to fit
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
Tuple of (distribution_name, parameters, ks_statistic)
|
|
229
|
+
"""
|
|
230
|
+
import scipy.stats as stats
|
|
231
|
+
|
|
232
|
+
best_dist = None
|
|
233
|
+
best_params = {}
|
|
234
|
+
best_score = float('inf') # Lower KS statistic is better
|
|
235
|
+
|
|
236
|
+
for dist_name in self.SUPPORTED_DISTRIBUTIONS:
|
|
237
|
+
try:
|
|
238
|
+
# Get distribution
|
|
239
|
+
dist = getattr(stats, dist_name)
|
|
240
|
+
|
|
241
|
+
# Fit distribution to data
|
|
242
|
+
if dist_name == 'norm':
|
|
243
|
+
# For normal, just use mean and std
|
|
244
|
+
params = (np.mean(values), np.std(values, ddof=1))
|
|
245
|
+
fitted_values = (values - params[0]) / params[1]
|
|
246
|
+
ks_stat, _ = stats.kstest(fitted_values, 'norm')
|
|
247
|
+
param_dict = {'loc': params[0], 'scale': params[1]}
|
|
248
|
+
|
|
249
|
+
elif dist_name == 'uniform':
|
|
250
|
+
# For uniform, use min and max
|
|
251
|
+
params = (np.min(values), np.max(values) - np.min(values))
|
|
252
|
+
scaled = (values - params[0]) / params[1]
|
|
253
|
+
ks_stat, _ = stats.kstest(scaled, 'uniform')
|
|
254
|
+
param_dict = {'loc': params[0], 'scale': params[1]}
|
|
255
|
+
|
|
256
|
+
elif dist_name == 'expon':
|
|
257
|
+
# For exponential, fit using MLE
|
|
258
|
+
params = dist.fit(values, floc=0) # Force loc=0 for exponential
|
|
259
|
+
ks_stat, _ = stats.kstest(values, dist_name, args=params)
|
|
260
|
+
param_dict = {'loc': params[0], 'scale': params[1]}
|
|
261
|
+
|
|
262
|
+
else:
|
|
263
|
+
# For other distributions, use MLE fitting
|
|
264
|
+
params = dist.fit(values)
|
|
265
|
+
ks_stat, _ = stats.kstest(values, dist_name, args=params)
|
|
266
|
+
|
|
267
|
+
# Extract param names
|
|
268
|
+
if len(params) == 2:
|
|
269
|
+
param_dict = {'loc': params[0], 'scale': params[1]}
|
|
270
|
+
elif len(params) == 3:
|
|
271
|
+
param_dict = {'shape': params[0], 'loc': params[1], 'scale': params[2]}
|
|
272
|
+
else:
|
|
273
|
+
param_dict = {f'param_{i}': p for i, p in enumerate(params)}
|
|
274
|
+
|
|
275
|
+
# Update best if this is better
|
|
276
|
+
if ks_stat < best_score:
|
|
277
|
+
best_score = ks_stat
|
|
278
|
+
best_dist = dist_name
|
|
279
|
+
best_params = param_dict
|
|
280
|
+
|
|
281
|
+
except Exception:
|
|
282
|
+
# Skip distributions that fail to fit
|
|
283
|
+
continue
|
|
284
|
+
|
|
285
|
+
# Default to normal if no fit found
|
|
286
|
+
if best_dist is None:
|
|
287
|
+
best_dist = 'norm'
|
|
288
|
+
best_params = {'loc': np.mean(values), 'scale': np.std(values, ddof=1)}
|
|
289
|
+
best_score = 1.0
|
|
290
|
+
|
|
291
|
+
return best_dist, best_params, best_score
|
|
292
|
+
|
|
293
|
+
def interpret_skewness(self, skewness: float) -> str:
|
|
294
|
+
"""
|
|
295
|
+
Interpret skewness value.
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
skewness: Skewness value
|
|
299
|
+
|
|
300
|
+
Returns:
|
|
301
|
+
Human-readable interpretation
|
|
302
|
+
"""
|
|
303
|
+
if abs(skewness) < 0.5:
|
|
304
|
+
return "approximately symmetric"
|
|
305
|
+
elif skewness > 0.5:
|
|
306
|
+
return "right-skewed (positive skew)"
|
|
307
|
+
else:
|
|
308
|
+
return "left-skewed (negative skew)"
|
|
309
|
+
|
|
310
|
+
def interpret_kurtosis(self, kurtosis: float) -> str:
|
|
311
|
+
"""
|
|
312
|
+
Interpret kurtosis value (excess kurtosis).
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
kurtosis: Excess kurtosis value
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
Human-readable interpretation
|
|
319
|
+
"""
|
|
320
|
+
if abs(kurtosis) < 1:
|
|
321
|
+
return "mesokurtic (normal-like tails)"
|
|
322
|
+
elif kurtosis > 1:
|
|
323
|
+
return "leptokurtic (heavy tails)"
|
|
324
|
+
else:
|
|
325
|
+
return "platykurtic (light tails)"
|
|
326
|
+
|
|
327
|
+
def suggest_checks(self, analysis: DistributionAnalysis) -> list[dict]:
|
|
328
|
+
"""
|
|
329
|
+
Suggest validation checks based on distribution analysis.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
analysis: Distribution analysis results
|
|
333
|
+
|
|
334
|
+
Returns:
|
|
335
|
+
List of suggested check dictionaries
|
|
336
|
+
"""
|
|
337
|
+
suggestions = []
|
|
338
|
+
|
|
339
|
+
# Suggest range check based on distribution
|
|
340
|
+
if analysis.best_fit_distribution == 'norm':
|
|
341
|
+
# For normal, suggest mean ± 3*std
|
|
342
|
+
lower = analysis.mean - 3 * analysis.std
|
|
343
|
+
upper = analysis.mean + 3 * analysis.std
|
|
344
|
+
suggestions.append({
|
|
345
|
+
'check': 'between',
|
|
346
|
+
'min_value': lower,
|
|
347
|
+
'max_value': upper,
|
|
348
|
+
'reason': 'Normal distribution: ~99.7% within 3 standard deviations'
|
|
349
|
+
})
|
|
350
|
+
|
|
351
|
+
elif analysis.best_fit_distribution == 'uniform':
|
|
352
|
+
# For uniform, use observed min/max
|
|
353
|
+
suggestions.append({
|
|
354
|
+
'check': 'between',
|
|
355
|
+
'min_value': analysis.min,
|
|
356
|
+
'max_value': analysis.max,
|
|
357
|
+
'reason': 'Uniform distribution: values bounded by observed range'
|
|
358
|
+
})
|
|
359
|
+
|
|
360
|
+
# Suggest normality check if data is normal
|
|
361
|
+
if analysis.is_normal and analysis.normality_pvalue > 0.1:
|
|
362
|
+
suggestions.append({
|
|
363
|
+
'check': 'expect_distribution_normal',
|
|
364
|
+
'significance_level': 0.05,
|
|
365
|
+
'reason': f'Data follows normal distribution (p={analysis.normality_pvalue:.3f})'
|
|
366
|
+
})
|
|
367
|
+
|
|
368
|
+
# Suggest uniformity check if data is uniform
|
|
369
|
+
if analysis.is_uniform and analysis.uniformity_pvalue > 0.1:
|
|
370
|
+
suggestions.append({
|
|
371
|
+
'check': 'expect_distribution_uniform',
|
|
372
|
+
'significance_level': 0.05,
|
|
373
|
+
'reason': f'Data follows uniform distribution (p={analysis.uniformity_pvalue:.3f})'
|
|
374
|
+
})
|
|
375
|
+
|
|
376
|
+
# Check for outliers based on IQR
|
|
377
|
+
if abs(analysis.kurtosis) > 3:
|
|
378
|
+
suggestions.append({
|
|
379
|
+
'check': 'outlier_detection',
|
|
380
|
+
'method': 'iqr',
|
|
381
|
+
'reason': f'High kurtosis ({analysis.kurtosis:.2f}) suggests potential outliers'
|
|
382
|
+
})
|
|
383
|
+
|
|
384
|
+
return suggestions
|