duckguard-2.3.0-py3-none-any.whl → duckguard-3.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,524 @@
+"""
+Distributional checks for DuckGuard 3.0.
+
+This module provides statistical distribution validation using standard
+statistical tests like Kolmogorov-Smirnov (KS) test and chi-square test.
+
+Requirements:
+    - scipy>=1.11.0 for statistical tests
+
+Example:
+    >>> from duckguard import connect
+    >>> data = connect("measurements.csv")
+    >>> # Test if column follows normal distribution
+    >>> result = data.temperature.expect_distribution_normal(significance_level=0.05)
+    >>> assert result.passed
+"""
+
+from dataclasses import dataclass
+
+from duckguard.core.result import ValidationResult
+
+
+@dataclass
+class DistributionTestResult:
+    """Result of a distribution test."""
+
+    test_name: str
+    statistic: float
+    pvalue: float
+    is_significant: bool
+    significance_level: float
+    distribution_type: str
+
+
+class DistributionalCheckHandler:
+    """
+    Executes distributional validation checks using statistical tests.
+
+    This handler provides methods to test if data follows specific distributions
+    (normal, uniform) or to perform goodness-of-fit tests.
+    """
+
+    MIN_SAMPLES = 30  # Minimum samples required for reliable tests
+
+    def __init__(self):
+        """Initialize the distributional check handler."""
+        self._scipy_available = self._check_scipy_availability()
+
+    def _check_scipy_availability(self) -> bool:
+        """Check if scipy is available."""
+        try:
+            import scipy.stats
+            return True
+        except ImportError:
+            return False
+
+    def _ensure_scipy(self):
+        """Ensure scipy is available, raise if not."""
+        if not self._scipy_available:
+            raise ImportError(
+                "scipy is required for distributional checks. "
+                "Install with: pip install 'duckguard[statistics]'"
+            )
+
+    def execute_distribution_normal(
+        self,
+        dataset,
+        column: str,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """
+        Test if column data follows a normal distribution.
+
+        Uses Kolmogorov-Smirnov test comparing data to fitted normal distribution.
+
+        Args:
+            dataset: Dataset to test
+            column: Column name
+            significance_level: Significance level (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Example:
+            >>> data = connect("measurements.csv")
+            >>> result = data.temperature.expect_distribution_normal()
+            >>> assert result.passed  # Temperature follows normal distribution
+        """
+        self._ensure_scipy()
+        import numpy as np
+        import scipy.stats as stats
+
+        # Get column values
+        values = self._get_numeric_values(dataset, column)
+
+        # Check minimum samples
+        if len(values) < self.MIN_SAMPLES:
+            return ValidationResult(
+                passed=False,
+                actual_value=len(values),
+                expected_value=f">= {self.MIN_SAMPLES} samples",
+                message=f"Insufficient samples for distribution test: {len(values)} (minimum {self.MIN_SAMPLES})",
+                details={
+                    "column": column,
+                    "sample_count": len(values),
+                    "min_required": self.MIN_SAMPLES
+                }
+            )
+
+        # Normalize values (subtract mean, divide by std)
+        mean = np.mean(values)
+        std = np.std(values, ddof=1)
+
+        if std == 0:
+            return ValidationResult(
+                passed=False,
+                actual_value=0.0,
+                expected_value="> 0",
+                message="Zero standard deviation - cannot test distribution",
+                details={
+                    "column": column,
+                    "mean": mean,
+                    "std": std
+                }
+            )
+
+        normalized_values = (values - mean) / std
+
+        # Perform KS test against standard normal
+        statistic, pvalue = stats.kstest(normalized_values, 'norm')
+
+        # Test passes if p-value > significance level
+        # (null hypothesis: data follows normal distribution)
+        passed = bool(pvalue > significance_level)
+
+        if passed:
+            message = f"Column '{column}' follows normal distribution (p={pvalue:.4f}, alpha={significance_level})"
+        else:
+            message = f"Column '{column}' does not follow normal distribution (p={pvalue:.4f}, alpha={significance_level})"
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=pvalue,
+            expected_value=f"> {significance_level}",
+            message=message,
+            details={
+                "test": "Kolmogorov-Smirnov",
+                "distribution": "normal",
+                "statistic": statistic,
+                "pvalue": pvalue,
+                "significance_level": significance_level,
+                "sample_count": len(values),
+                "mean": mean,
+                "std": std,
+            }
+        )
+
+    def execute_distribution_uniform(
+        self,
+        dataset,
+        column: str,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """
+        Test if column data follows a uniform distribution.
+
+        Uses Kolmogorov-Smirnov test comparing data to uniform distribution.
+
+        Args:
+            dataset: Dataset to test
+            column: Column name
+            significance_level: Significance level (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Example:
+            >>> data = connect("random_numbers.csv")
+            >>> result = data.random_value.expect_distribution_uniform()
+            >>> assert result.passed  # Random values are uniformly distributed
+        """
+        self._ensure_scipy()
+        import numpy as np
+        import scipy.stats as stats
+
+        # Get column values
+        values = self._get_numeric_values(dataset, column)
+
+        # Check minimum samples
+        if len(values) < self.MIN_SAMPLES:
+            return ValidationResult(
+                passed=False,
+                actual_value=len(values),
+                expected_value=f">= {self.MIN_SAMPLES} samples",
+                message=f"Insufficient samples for distribution test: {len(values)} (minimum {self.MIN_SAMPLES})",
+                details={
+                    "column": column,
+                    "sample_count": len(values),
+                    "min_required": self.MIN_SAMPLES
+                }
+            )
+
+        # Scale values to [0, 1] range
+        min_val = np.min(values)
+        max_val = np.max(values)
+
+        if min_val == max_val:
+            return ValidationResult(
+                passed=False,
+                actual_value="constant",
+                expected_value="varying values",
+                message="All values are identical - cannot test distribution",
+                details={
+                    "column": column,
+                    "value": min_val,
+                }
+            )
+
+        scaled_values = (values - min_val) / (max_val - min_val)
+
+        # Perform KS test against uniform distribution [0, 1]
+        statistic, pvalue = stats.kstest(scaled_values, 'uniform')
+
+        # Test passes if p-value > significance level
+        passed = bool(pvalue > significance_level)
+
+        if passed:
+            message = f"Column '{column}' follows uniform distribution (p={pvalue:.4f}, alpha={significance_level})"
+        else:
+            message = f"Column '{column}' does not follow uniform distribution (p={pvalue:.4f}, alpha={significance_level})"
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=pvalue,
+            expected_value=f"> {significance_level}",
+            message=message,
+            details={
+                "test": "Kolmogorov-Smirnov",
+                "distribution": "uniform",
+                "statistic": statistic,
+                "pvalue": pvalue,
+                "significance_level": significance_level,
+                "sample_count": len(values),
+                "min": min_val,
+                "max": max_val,
+            }
+        )
+
+    def execute_ks_test(
+        self,
+        dataset,
+        column: str,
+        distribution: str = "norm",
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """
+        Perform Kolmogorov-Smirnov test for specified distribution.
+
+        Args:
+            dataset: Dataset to test
+            column: Column name
+            distribution: Distribution name ('norm', 'uniform', 'expon', etc.)
+            significance_level: Significance level (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Example:
+            >>> data = connect("data.csv")
+            >>> result = data.values.expect_ks_test(distribution='norm')
+            >>> assert result.passed
+        """
+        self._ensure_scipy()
+        import numpy as np
+        import scipy.stats as stats
+
+        # Get column values
+        values = self._get_numeric_values(dataset, column)
+
+        # Check minimum samples
+        if len(values) < self.MIN_SAMPLES:
+            return ValidationResult(
+                passed=False,
+                actual_value=len(values),
+                expected_value=f">= {self.MIN_SAMPLES} samples",
+                message=f"Insufficient samples for KS test: {len(values)} (minimum {self.MIN_SAMPLES})",
+                details={
+                    "column": column,
+                    "sample_count": len(values),
+                    "min_required": self.MIN_SAMPLES
+                }
+            )
+
+        # Normalize values based on distribution
+        if distribution == "norm":
+            # Normalize to standard normal
+            mean = np.mean(values)
+            std = np.std(values, ddof=1)
+            if std > 0:
+                values = (values - mean) / std
+        elif distribution == "uniform":
+            # Scale to [0, 1] range for uniform distribution
+            min_val = np.min(values)
+            max_val = np.max(values)
+            if max_val > min_val:
+                values = (values - min_val) / (max_val - min_val)
+
+        # Perform KS test
+        try:
+            statistic, pvalue = stats.kstest(values, distribution)
+
+            passed = bool(pvalue > significance_level)
+
+            if passed:
+                message = f"Column '{column}' follows '{distribution}' distribution (p={pvalue:.4f}, alpha={significance_level})"
+            else:
+                message = f"Column '{column}' does not follow '{distribution}' distribution (p={pvalue:.4f}, alpha={significance_level})"
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=pvalue,
+                expected_value=f"> {significance_level}",
+                message=message,
+                details={
+                    "test": "Kolmogorov-Smirnov",
+                    "distribution": distribution,
+                    "statistic": statistic,
+                    "pvalue": pvalue,
+                    "significance_level": significance_level,
+                    "sample_count": len(values),
+                }
+            )
+
+        except Exception as e:
+            return ValidationResult(
+                passed=False,
+                actual_value=None,
+                expected_value=f"{distribution} distribution",
+                message=f"KS test failed: {str(e)}",
+                details={
+                    "column": column,
+                    "distribution": distribution,
+                    "error": str(e)
+                }
+            )
+
+    def execute_chi_square_test(
+        self,
+        dataset,
+        column: str,
+        expected_frequencies: dict | None = None,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """
+        Perform chi-square goodness-of-fit test for categorical data.
+
+        Tests if observed frequencies match expected frequencies.
+
+        Args:
+            dataset: Dataset to test
+            column: Column name (categorical)
+            expected_frequencies: Dict mapping categories to expected frequencies
+                If None, assumes uniform distribution
+            significance_level: Significance level (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Example:
+            >>> data = connect("dice_rolls.csv")
+            >>> # Test if dice is fair (uniform distribution)
+            >>> result = data.roll.expect_chi_square_test(
+            ...     expected_frequencies={1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
+            ... )
+            >>> assert result.passed
+        """
+        self._ensure_scipy()
+        import numpy as np
+        import scipy.stats as stats
+
+        # Get value counts
+        value_counts = self._get_value_counts(dataset, column)
+
+        if len(value_counts) < 2:
+            return ValidationResult(
+                passed=False,
+                actual_value=len(value_counts),
+                expected_value=">= 2 categories",
+                message=f"Insufficient categories for chi-square test: {len(value_counts)}",
+                details={
+                    "column": column,
+                    "categories": len(value_counts)
+                }
+            )
+
+        # Total observations
+        total = sum(value_counts.values())
+
+        if total < self.MIN_SAMPLES:
+            return ValidationResult(
+                passed=False,
+                actual_value=total,
+                expected_value=f">= {self.MIN_SAMPLES} samples",
+                message=f"Insufficient samples for chi-square test: {total}",
+                details={
+                    "column": column,
+                    "sample_count": total,
+                    "min_required": self.MIN_SAMPLES
+                }
+            )
+
+        # Build observed and expected frequencies
+        categories = sorted(value_counts.keys())
+        observed = np.array([value_counts[cat] for cat in categories])
+
+        if expected_frequencies is None:
+            # Assume uniform distribution
+            expected = np.array([total / len(categories)] * len(categories))
+        else:
+            # Use provided expected frequencies
+            expected = np.array([
+                expected_frequencies.get(cat, 0) * total
+                for cat in categories
+            ])
+
+        # Check for zero expected frequencies
+        if np.any(expected == 0):
+            return ValidationResult(
+                passed=False,
+                actual_value="zero expected frequencies",
+                expected_value="non-zero expected frequencies",
+                message="Chi-square test requires non-zero expected frequencies",
+                details={
+                    "column": column,
+                    "categories_with_zero_expected": [
+                        cat for cat, exp in zip(categories, expected) if exp == 0
+                    ]
+                }
+            )
+
+        # Perform chi-square test
+        try:
+            statistic, pvalue = stats.chisquare(observed, expected)
+
+            passed = bool(pvalue > significance_level)
+
+            if passed:
+                message = f"Column '{column}' matches expected distribution (p={pvalue:.4f}, alpha={significance_level})"
+            else:
+                message = f"Column '{column}' does not match expected distribution (p={pvalue:.4f}, alpha={significance_level})"
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=pvalue,
+                expected_value=f"> {significance_level}",
+                message=message,
+                details={
+                    "test": "chi-square",
+                    "statistic": statistic,
+                    "pvalue": pvalue,
+                    "significance_level": significance_level,
+                    "degrees_of_freedom": len(categories) - 1,
+                    "categories": len(categories),
+                    "sample_count": total,
+                    "observed": dict(zip(categories, observed.tolist())),
+                    "expected": dict(zip(categories, expected.tolist())),
+                }
+            )
+
+        except Exception as e:
+            return ValidationResult(
+                passed=False,
+                actual_value=None,
+                expected_value="valid chi-square test",
+                message=f"Chi-square test failed: {str(e)}",
+                details={
+                    "column": column,
+                    "error": str(e)
+                }
+            )
+
+    def _get_numeric_values(self, dataset, column: str):
+        """Get numeric values from column, excluding nulls."""
+        import numpy as np
+
+        engine = dataset._engine
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        table_name = dataset._source.replace('\\', '/')
+
+        # Query to get non-null numeric values
+        query = f"""
+            SELECT "{column}"
+            FROM '{table_name}'
+            WHERE "{column}" IS NOT NULL
+        """
+
+        try:
+            result = engine.fetch_all(query)
+            values = np.array([row[0] for row in result], dtype=float)
+            return values
+        except Exception as e:
+            raise ValueError(f"Failed to get numeric values from column '{column}': {str(e)}")
+
+    def _get_value_counts(self, dataset, column: str) -> dict:
+        """Get value counts for categorical column."""
+        engine = dataset._engine
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        table_name = dataset._source.replace('\\', '/')
+
+        # Query to get value counts
+        query = f"""
+            SELECT "{column}", COUNT(*) as count
+            FROM '{table_name}'
+            WHERE "{column}" IS NOT NULL
+            GROUP BY "{column}"
+            ORDER BY "{column}"
+        """
+
+        try:
+            result = engine.fetch_all(query)
+            value_counts = {row[0]: row[1] for row in result}
+            return value_counts
+        except Exception as e:
+            raise ValueError(f"Failed to get value counts from column '{column}': {str(e)}")
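Below is a minimal usage sketch assembled from the docstring examples shipped in the new module. The file names and column names are illustrative, and the column-level expect_* wrappers are assumed to forward to the handler methods above, as the docstrings suggest.

    from duckguard import connect

    # Connect to a data source; columns are accessed as attributes.
    data = connect("measurements.csv")

    # KS test against a fitted normal distribution (passes when p > alpha).
    result = data.temperature.expect_distribution_normal(significance_level=0.05)
    assert result.passed

    # KS test against any scipy distribution name, e.g. exponential.
    result = data.temperature.expect_ks_test(distribution="expon")

    # Chi-square goodness-of-fit for categorical data (fair-die example).
    rolls = connect("dice_rolls.csv")
    result = rolls.roll.expect_chi_square_test(
        expected_frequencies={1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
    )

These checks require scipy (the module suggests pip install 'duckguard[statistics]'); without it the handler raises ImportError, and each test needs at least 30 non-null values (MIN_SAMPLES) before it will run.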