duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ """
2
+ Distribution analysis for enhanced profiling in DuckGuard 3.0.
3
+
4
+ This module provides comprehensive distribution analysis including:
5
+ - Distribution fitting (normal, uniform, exponential, etc.)
6
+ - Histogram generation
7
+ - Statistical moments (kurtosis, skewness)
8
+ - Best-fit distribution identification
9
+
10
+ Requirements:
11
+ - scipy>=1.11.0 for distribution fitting and tests
12
+
13
+ Example:
14
+ >>> from duckguard.profiler.distribution_analyzer import DistributionAnalyzer
15
+ >>> analyzer = DistributionAnalyzer()
16
+ >>> analysis = analyzer.analyze(column_values)
17
+ >>> print(f"Best fit: {analysis.best_fit_distribution}")
18
+ """
19
+
20
+ from dataclasses import dataclass
21
+
22
+ import numpy as np
23
+
24
+
25
@dataclass
class DistributionAnalysis:
    """Results of distribution analysis.

    A plain value object produced by DistributionAnalyzer.analyze();
    it has no behavior of its own. All statistics are computed over the
    valid (non-NaN) samples only.
    """

    # Basic statistics
    mean: float
    # Sample standard deviation (computed with ddof=1 in the analyzer).
    std: float
    min: float
    max: float
    median: float

    # Distribution shape (scipy conventions: excess kurtosis; Fisher skew)
    kurtosis: float
    skewness: float

    # Histogram
    histogram_bins: list[tuple[float, float, int]]  # (lower, upper, count)

    # Distribution tests (Kolmogorov-Smirnov; "is_*" means p-value > alpha)
    is_normal: bool
    is_uniform: bool
    normality_pvalue: float
    uniformity_pvalue: float

    # Best fit: scipy.stats distribution name, its fitted parameters
    # (e.g. 'loc'/'scale'/'shape'), and the KS statistic (lower is better).
    best_fit_distribution: str
    best_fit_params: dict[str, float]
    best_fit_score: float

    # Sample info: count of valid values analyzed, and count of NaNs seen
    sample_count: int
    null_count: int
57
+
58
+
59
+ class DistributionAnalyzer:
60
+ """
61
+ Analyzes the distribution of numerical data.
62
+
63
+ Provides comprehensive statistical analysis including distribution fitting,
64
+ moment calculation, and hypothesis testing.
65
+ """
66
+
67
+ MIN_SAMPLES = 30
68
+ SUPPORTED_DISTRIBUTIONS = [
69
+ 'norm', # Normal/Gaussian
70
+ 'uniform', # Uniform
71
+ 'expon', # Exponential
72
+ 'gamma', # Gamma
73
+ 'lognorm', # Log-normal
74
+ 'beta', # Beta
75
+ ]
76
+
77
+ def __init__(self):
78
+ """Initialize the distribution analyzer."""
79
+ self._scipy_available = self._check_scipy()
80
+
81
+ def _check_scipy(self) -> bool:
82
+ """Check if scipy is available."""
83
+ try:
84
+ import scipy.stats
85
+ return True
86
+ except ImportError:
87
+ return False
88
+
89
+ def analyze(self, values: np.ndarray, num_bins: int = 20) -> DistributionAnalysis:
90
+ """
91
+ Perform comprehensive distribution analysis.
92
+
93
+ Args:
94
+ values: Array of numeric values (may contain NaN)
95
+ num_bins: Number of histogram bins (default 20)
96
+
97
+ Returns:
98
+ DistributionAnalysis with complete statistical analysis
99
+
100
+ Raises:
101
+ ImportError: If scipy is not available
102
+ ValueError: If insufficient valid samples
103
+ """
104
+ if not self._scipy_available:
105
+ raise ImportError(
106
+ "scipy is required for distribution analysis. "
107
+ "Install with: pip install 'duckguard[statistics]'"
108
+ )
109
+
110
+ import scipy.stats as stats
111
+
112
+ # Separate nulls from valid values
113
+ null_count = np.sum(np.isnan(values))
114
+ valid_values = values[~np.isnan(values)]
115
+
116
+ if len(valid_values) < self.MIN_SAMPLES:
117
+ raise ValueError(
118
+ f"Insufficient samples for distribution analysis: {len(valid_values)} "
119
+ f"(minimum {self.MIN_SAMPLES})"
120
+ )
121
+
122
+ # Calculate basic statistics
123
+ mean = np.mean(valid_values)
124
+ std = np.std(valid_values, ddof=1)
125
+ min_val = np.min(valid_values)
126
+ max_val = np.max(valid_values)
127
+ median = np.median(valid_values)
128
+
129
+ # Calculate moments
130
+ kurtosis = stats.kurtosis(valid_values)
131
+ skewness = stats.skew(valid_values)
132
+
133
+ # Generate histogram
134
+ hist, bin_edges = np.histogram(valid_values, bins=num_bins)
135
+ histogram_bins = [
136
+ (float(bin_edges[i]), float(bin_edges[i+1]), int(hist[i]))
137
+ for i in range(len(hist))
138
+ ]
139
+
140
+ # Test for normality
141
+ is_normal, normality_pvalue = self._test_normality(valid_values)
142
+
143
+ # Test for uniformity
144
+ is_uniform, uniformity_pvalue = self._test_uniformity(valid_values)
145
+
146
+ # Find best fit distribution
147
+ best_fit_dist, best_fit_params, best_fit_score = self._find_best_fit(valid_values)
148
+
149
+ return DistributionAnalysis(
150
+ mean=mean,
151
+ std=std,
152
+ min=min_val,
153
+ max=max_val,
154
+ median=median,
155
+ kurtosis=kurtosis,
156
+ skewness=skewness,
157
+ histogram_bins=histogram_bins,
158
+ is_normal=is_normal,
159
+ is_uniform=is_uniform,
160
+ normality_pvalue=normality_pvalue,
161
+ uniformity_pvalue=uniformity_pvalue,
162
+ best_fit_distribution=best_fit_dist,
163
+ best_fit_params=best_fit_params,
164
+ best_fit_score=best_fit_score,
165
+ sample_count=len(valid_values),
166
+ null_count=null_count,
167
+ )
168
+
169
+ def _test_normality(self, values: np.ndarray, alpha: float = 0.05) -> tuple[bool, float]:
170
+ """
171
+ Test if data follows normal distribution using Kolmogorov-Smirnov test.
172
+
173
+ Args:
174
+ values: Array of values to test
175
+ alpha: Significance level (default 0.05)
176
+
177
+ Returns:
178
+ Tuple of (is_normal, p_value)
179
+ """
180
+ import scipy.stats as stats
181
+
182
+ # Normalize values
183
+ if np.std(values) > 0:
184
+ normalized = (values - np.mean(values)) / np.std(values)
185
+ statistic, pvalue = stats.kstest(normalized, 'norm')
186
+ is_normal = pvalue > alpha
187
+ return is_normal, pvalue
188
+ else:
189
+ # Zero variance - not normal
190
+ return False, 0.0
191
+
192
+ def _test_uniformity(self, values: np.ndarray, alpha: float = 0.05) -> tuple[bool, float]:
193
+ """
194
+ Test if data follows uniform distribution using Kolmogorov-Smirnov test.
195
+
196
+ Args:
197
+ values: Array of values to test
198
+ alpha: Significance level (default 0.05)
199
+
200
+ Returns:
201
+ Tuple of (is_uniform, p_value)
202
+ """
203
+ import scipy.stats as stats
204
+
205
+ # Scale to [0, 1]
206
+ min_val = np.min(values)
207
+ max_val = np.max(values)
208
+
209
+ if min_val != max_val:
210
+ scaled = (values - min_val) / (max_val - min_val)
211
+ statistic, pvalue = stats.kstest(scaled, 'uniform')
212
+ is_uniform = pvalue > alpha
213
+ return is_uniform, pvalue
214
+ else:
215
+ # Constant values - not uniform
216
+ return False, 0.0
217
+
218
+ def _find_best_fit(self, values: np.ndarray) -> tuple[str, dict, float]:
219
+ """
220
+ Find the best-fitting distribution from supported distributions.
221
+
222
+ Uses Kolmogorov-Smirnov test to measure goodness of fit.
223
+
224
+ Args:
225
+ values: Array of values to fit
226
+
227
+ Returns:
228
+ Tuple of (distribution_name, parameters, ks_statistic)
229
+ """
230
+ import scipy.stats as stats
231
+
232
+ best_dist = None
233
+ best_params = {}
234
+ best_score = float('inf') # Lower KS statistic is better
235
+
236
+ for dist_name in self.SUPPORTED_DISTRIBUTIONS:
237
+ try:
238
+ # Get distribution
239
+ dist = getattr(stats, dist_name)
240
+
241
+ # Fit distribution to data
242
+ if dist_name == 'norm':
243
+ # For normal, just use mean and std
244
+ params = (np.mean(values), np.std(values, ddof=1))
245
+ fitted_values = (values - params[0]) / params[1]
246
+ ks_stat, _ = stats.kstest(fitted_values, 'norm')
247
+ param_dict = {'loc': params[0], 'scale': params[1]}
248
+
249
+ elif dist_name == 'uniform':
250
+ # For uniform, use min and max
251
+ params = (np.min(values), np.max(values) - np.min(values))
252
+ scaled = (values - params[0]) / params[1]
253
+ ks_stat, _ = stats.kstest(scaled, 'uniform')
254
+ param_dict = {'loc': params[0], 'scale': params[1]}
255
+
256
+ elif dist_name == 'expon':
257
+ # For exponential, fit using MLE
258
+ params = dist.fit(values, floc=0) # Force loc=0 for exponential
259
+ ks_stat, _ = stats.kstest(values, dist_name, args=params)
260
+ param_dict = {'loc': params[0], 'scale': params[1]}
261
+
262
+ else:
263
+ # For other distributions, use MLE fitting
264
+ params = dist.fit(values)
265
+ ks_stat, _ = stats.kstest(values, dist_name, args=params)
266
+
267
+ # Extract param names
268
+ if len(params) == 2:
269
+ param_dict = {'loc': params[0], 'scale': params[1]}
270
+ elif len(params) == 3:
271
+ param_dict = {'shape': params[0], 'loc': params[1], 'scale': params[2]}
272
+ else:
273
+ param_dict = {f'param_{i}': p for i, p in enumerate(params)}
274
+
275
+ # Update best if this is better
276
+ if ks_stat < best_score:
277
+ best_score = ks_stat
278
+ best_dist = dist_name
279
+ best_params = param_dict
280
+
281
+ except Exception:
282
+ # Skip distributions that fail to fit
283
+ continue
284
+
285
+ # Default to normal if no fit found
286
+ if best_dist is None:
287
+ best_dist = 'norm'
288
+ best_params = {'loc': np.mean(values), 'scale': np.std(values, ddof=1)}
289
+ best_score = 1.0
290
+
291
+ return best_dist, best_params, best_score
292
+
293
+ def interpret_skewness(self, skewness: float) -> str:
294
+ """
295
+ Interpret skewness value.
296
+
297
+ Args:
298
+ skewness: Skewness value
299
+
300
+ Returns:
301
+ Human-readable interpretation
302
+ """
303
+ if abs(skewness) < 0.5:
304
+ return "approximately symmetric"
305
+ elif skewness > 0.5:
306
+ return "right-skewed (positive skew)"
307
+ else:
308
+ return "left-skewed (negative skew)"
309
+
310
+ def interpret_kurtosis(self, kurtosis: float) -> str:
311
+ """
312
+ Interpret kurtosis value (excess kurtosis).
313
+
314
+ Args:
315
+ kurtosis: Excess kurtosis value
316
+
317
+ Returns:
318
+ Human-readable interpretation
319
+ """
320
+ if abs(kurtosis) < 1:
321
+ return "mesokurtic (normal-like tails)"
322
+ elif kurtosis > 1:
323
+ return "leptokurtic (heavy tails)"
324
+ else:
325
+ return "platykurtic (light tails)"
326
+
327
+ def suggest_checks(self, analysis: DistributionAnalysis) -> list[dict]:
328
+ """
329
+ Suggest validation checks based on distribution analysis.
330
+
331
+ Args:
332
+ analysis: Distribution analysis results
333
+
334
+ Returns:
335
+ List of suggested check dictionaries
336
+ """
337
+ suggestions = []
338
+
339
+ # Suggest range check based on distribution
340
+ if analysis.best_fit_distribution == 'norm':
341
+ # For normal, suggest mean ± 3*std
342
+ lower = analysis.mean - 3 * analysis.std
343
+ upper = analysis.mean + 3 * analysis.std
344
+ suggestions.append({
345
+ 'check': 'between',
346
+ 'min_value': lower,
347
+ 'max_value': upper,
348
+ 'reason': 'Normal distribution: ~99.7% within 3 standard deviations'
349
+ })
350
+
351
+ elif analysis.best_fit_distribution == 'uniform':
352
+ # For uniform, use observed min/max
353
+ suggestions.append({
354
+ 'check': 'between',
355
+ 'min_value': analysis.min,
356
+ 'max_value': analysis.max,
357
+ 'reason': 'Uniform distribution: values bounded by observed range'
358
+ })
359
+
360
+ # Suggest normality check if data is normal
361
+ if analysis.is_normal and analysis.normality_pvalue > 0.1:
362
+ suggestions.append({
363
+ 'check': 'expect_distribution_normal',
364
+ 'significance_level': 0.05,
365
+ 'reason': f'Data follows normal distribution (p={analysis.normality_pvalue:.3f})'
366
+ })
367
+
368
+ # Suggest uniformity check if data is uniform
369
+ if analysis.is_uniform and analysis.uniformity_pvalue > 0.1:
370
+ suggestions.append({
371
+ 'check': 'expect_distribution_uniform',
372
+ 'significance_level': 0.05,
373
+ 'reason': f'Data follows uniform distribution (p={analysis.uniformity_pvalue:.3f})'
374
+ })
375
+
376
+ # Check for outliers based on IQR
377
+ if abs(analysis.kurtosis) > 3:
378
+ suggestions.append({
379
+ 'check': 'outlier_detection',
380
+ 'method': 'iqr',
381
+ 'reason': f'High kurtosis ({analysis.kurtosis:.2f}) suggests potential outliers'
382
+ })
383
+
384
+ return suggestions