duckguard 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,497 @@
1
+ """
2
+ Outlier detection for enhanced profiling in DuckGuard 3.0.
3
+
4
+ This module provides multiple outlier detection methods:
5
+ - Z-score method (parametric)
6
+ - IQR (Interquartile Range) method (non-parametric)
7
+ - Isolation Forest (machine learning)
8
+ - Local Outlier Factor (density-based)
9
+ - Consensus outlier detection (combining multiple methods)
10
+
11
+ Requirements:
12
+ - scipy>=1.11.0 for statistical methods
13
+ - scikit-learn>=1.3.0 for ML methods (Isolation Forest, LOF)
14
+
15
+ Example:
16
+ >>> from duckguard.profiler.outlier_detector import OutlierDetector
17
+ >>> detector = OutlierDetector()
18
+ >>> result = detector.detect(column_values, method='consensus')
19
+ >>> print(f"Found {result.outlier_count} outliers")
20
+ """
21
+
22
+ from dataclasses import dataclass
23
+
24
+ import numpy as np
25
+
26
+
27
@dataclass
class OutlierAnalysis:
    """Results of outlier detection.

    Holds the union of outliers across every method that ran, the
    per-method breakdown, and (for consensus runs) the indices flagged
    by at least ``consensus_threshold`` methods.
    """

    # Overall results (union of every method that ran)
    outlier_count: int
    outlier_percentage: float
    outlier_indices: list[int]

    # Method-specific results
    method_results: dict[str, int]        # method name -> outlier count
    method_indices: dict[str, list[int]]  # method name -> outlier indices

    # Consensus results (meaningful only for method='consensus')
    consensus_outliers: list[int]  # indices flagged by >= consensus_threshold methods
    consensus_threshold: int       # minimum number of methods that must agree

    # Sample info
    sample_count: int        # number of non-NaN values analyzed
    methods_used: list[str]  # methods that actually produced results


class OutlierDetector:
    """
    Detects outliers using multiple statistical and ML methods.

    Provides both individual method results and consensus detection
    where outliers must be flagged by multiple methods.

    The statistical methods need only numpy/scipy; the ML methods
    (Isolation Forest, Local Outlier Factor) require scikit-learn.
    """

    # Below this many non-NaN samples, none of the methods here are reliable.
    MIN_SAMPLES = 30

    def __init__(self):
        """Initialize the detector, probing for optional dependencies."""
        self._scipy_available = self._check_scipy()
        self._sklearn_available = self._check_sklearn()

    def _check_scipy(self) -> bool:
        """Return True if scipy can be imported."""
        try:
            import scipy.stats  # noqa: F401
            return True
        except ImportError:
            return False

    def _check_sklearn(self) -> bool:
        """Return True if scikit-learn can be imported."""
        try:
            import sklearn  # noqa: F401
            return True
        except ImportError:
            return False

    def detect(
        self,
        values: np.ndarray,
        method: str = 'consensus',
        contamination: float = 0.05,
        consensus_threshold: int = 2
    ) -> OutlierAnalysis:
        """
        Detect outliers using specified method(s).

        Args:
            values: Array of numeric values (may contain NaN). NaNs are
                excluded; reported indices refer to positions in ``values``.
            method: Detection method - 'zscore', 'iqr', 'isolation_forest',
                'lof', 'consensus' (default)
            contamination: Expected proportion of outliers, in (0, 0.5].
                Used only by the ML-based methods.
            consensus_threshold: Min methods agreeing for consensus (default 2)

        Returns:
            OutlierAnalysis with detection results

        Raises:
            ValueError: If insufficient samples, invalid method, or
                contamination outside (0, 0.5]
            ImportError: If the chosen method's optional dependency is missing
        """
        # Drop NaNs but remember where the surviving values came from, so
        # reported indices map back to the caller's original array.
        valid_mask = ~np.isnan(values)
        valid_values = values[valid_mask]
        original_indices = np.where(valid_mask)[0]

        if len(valid_values) < self.MIN_SAMPLES:
            raise ValueError(
                f"Insufficient samples for outlier detection: {len(valid_values)} "
                f"(minimum {self.MIN_SAMPLES})"
            )

        # Fail fast with a clear message instead of letting sklearn reject
        # an out-of-range contamination deep inside fit_predict.
        if not 0.0 < contamination <= 0.5:
            raise ValueError(
                f"contamination must be in (0, 0.5], got {contamination}"
            )

        if method == 'zscore':
            return self._detect_zscore(valid_values, original_indices)

        if method == 'iqr':
            return self._detect_iqr(valid_values, original_indices)

        if method == 'isolation_forest':
            if not self._sklearn_available:
                raise ImportError(
                    "scikit-learn required for Isolation Forest. "
                    "Install with: pip install scikit-learn"
                )
            return self._detect_isolation_forest(valid_values, original_indices, contamination)

        if method == 'lof':
            if not self._sklearn_available:
                raise ImportError(
                    "scikit-learn required for Local Outlier Factor. "
                    "Install with: pip install scikit-learn"
                )
            return self._detect_lof(valid_values, original_indices, contamination)

        if method == 'consensus':
            return self._detect_consensus(
                valid_values,
                original_indices,
                contamination,
                consensus_threshold
            )

        raise ValueError(
            f"Unknown method: {method}. "
            f"Valid methods: zscore, iqr, isolation_forest, lof, consensus"
        )

    def _single_method_result(
        self,
        method: str,
        values: np.ndarray,
        original_indices: np.ndarray,
        outlier_mask: np.ndarray
    ) -> OutlierAnalysis:
        """Package one method's boolean outlier mask into an OutlierAnalysis.

        Shared by all single-method detectors so the result-building logic
        lives in exactly one place.
        """
        outlier_indices = original_indices[outlier_mask].tolist()
        outlier_count = len(outlier_indices)

        return OutlierAnalysis(
            outlier_count=outlier_count,
            outlier_percentage=(outlier_count / len(values)) * 100,
            outlier_indices=outlier_indices,
            method_results={method: outlier_count},
            method_indices={method: outlier_indices},
            consensus_outliers=[],
            consensus_threshold=1,
            sample_count=len(values),
            methods_used=[method],
        )

    def _detect_zscore(
        self,
        values: np.ndarray,
        original_indices: np.ndarray,
        threshold: float = 3.0
    ) -> OutlierAnalysis:
        """
        Detect outliers using the Z-score method.

        Points with |z-score| > threshold are considered outliers.

        Args:
            values: Array of values
            original_indices: Original indices in the dataset
            threshold: Z-score threshold (default 3.0)

        Returns:
            OutlierAnalysis

        Raises:
            ImportError: If scipy is not installed
        """
        if not self._scipy_available:
            raise ImportError("scipy required for Z-score method")

        import scipy.stats as stats

        # NOTE: if values are constant, zscore yields NaN and the
        # comparison flags nothing, which is the desired behavior.
        z_scores = np.abs(stats.zscore(values))
        return self._single_method_result(
            'zscore', values, original_indices, z_scores > threshold
        )

    def _detect_iqr(
        self,
        values: np.ndarray,
        original_indices: np.ndarray,
        multiplier: float = 1.5
    ) -> OutlierAnalysis:
        """
        Detect outliers using the IQR (Interquartile Range) method.

        Points outside [Q1 - multiplier*IQR, Q3 + multiplier*IQR] are outliers.

        Args:
            values: Array of values
            original_indices: Original indices in the dataset
            multiplier: IQR multiplier (default 1.5)

        Returns:
            OutlierAnalysis
        """
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1

        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        outlier_mask = (values < lower_bound) | (values > upper_bound)
        return self._single_method_result(
            'iqr', values, original_indices, outlier_mask
        )

    def _detect_isolation_forest(
        self,
        values: np.ndarray,
        original_indices: np.ndarray,
        contamination: float
    ) -> OutlierAnalysis:
        """
        Detect outliers using the Isolation Forest algorithm.

        Args:
            values: Array of values
            original_indices: Original indices in the dataset
            contamination: Expected proportion of outliers

        Returns:
            OutlierAnalysis
        """
        from sklearn.ensemble import IsolationForest

        # sklearn estimators expect 2-D input: (n_samples, n_features).
        X = values.reshape(-1, 1)

        # Fixed random_state keeps results reproducible across runs.
        iso = IsolationForest(contamination=contamination, random_state=42)
        predictions = iso.fit_predict(X)

        # fit_predict marks outliers with -1.
        return self._single_method_result(
            'isolation_forest', values, original_indices, predictions == -1
        )

    def _detect_lof(
        self,
        values: np.ndarray,
        original_indices: np.ndarray,
        contamination: float
    ) -> OutlierAnalysis:
        """
        Detect outliers using Local Outlier Factor.

        Args:
            values: Array of values
            original_indices: Original indices in the dataset
            contamination: Expected proportion of outliers

        Returns:
            OutlierAnalysis
        """
        from sklearn.neighbors import LocalOutlierFactor

        # sklearn estimators expect 2-D input: (n_samples, n_features).
        X = values.reshape(-1, 1)

        # n_neighbors must be strictly less than the sample count; cap the
        # default of 20 so small (but valid) samples don't crash.
        lof = LocalOutlierFactor(
            contamination=contamination,
            n_neighbors=min(20, len(values) - 1),
        )
        predictions = lof.fit_predict(X)

        # fit_predict marks outliers with -1.
        return self._single_method_result(
            'lof', values, original_indices, predictions == -1
        )

    def _detect_consensus(
        self,
        values: np.ndarray,
        original_indices: np.ndarray,
        contamination: float,
        threshold: int
    ) -> OutlierAnalysis:
        """
        Detect outliers using a consensus of multiple methods.

        An outlier must be flagged by at least 'threshold' methods. The
        overall outlier list is the union across all methods that ran.

        Args:
            values: Array of values
            original_indices: Original indices in the dataset
            contamination: Expected proportion of outliers
            threshold: Minimum methods agreeing

        Returns:
            OutlierAnalysis with consensus results
        """
        # (name, dependency available?, runner) for every candidate method.
        candidates = [
            ('zscore', self._scipy_available,
             lambda: self._detect_zscore(values, original_indices)),
            ('iqr', True,
             lambda: self._detect_iqr(values, original_indices)),
            ('isolation_forest', self._sklearn_available,
             lambda: self._detect_isolation_forest(values, original_indices, contamination)),
            ('lof', self._sklearn_available,
             lambda: self._detect_lof(values, original_indices, contamination)),
        ]

        methods_used: list[str] = []
        method_results: dict[str, int] = {}
        method_indices: dict[str, list[int]] = {}

        for name, available, run in candidates:
            if not available:
                continue
            try:
                result = run()
            except Exception:
                # Consensus is deliberately best-effort: one failing method
                # must not abort the whole analysis.
                continue
            method_results[name] = result.outlier_count
            method_indices[name] = result.outlier_indices
            methods_used.append(name)

        # Count how many methods flagged each index.
        index_counts: dict[int, int] = {}
        for indices in method_indices.values():
            for idx in indices:
                index_counts[idx] = index_counts.get(idx, 0) + 1

        # Consensus outliers: flagged by at least `threshold` methods.
        consensus_outliers = sorted(
            idx for idx, count in index_counts.items() if count >= threshold
        )

        # Overall outliers: union of all methods (i.e. every counted index).
        all_outliers = sorted(index_counts)

        return OutlierAnalysis(
            outlier_count=len(all_outliers),
            outlier_percentage=(len(all_outliers) / len(values)) * 100,
            outlier_indices=all_outliers,
            method_results=method_results,
            method_indices=method_indices,
            consensus_outliers=consensus_outliers,
            consensus_threshold=threshold,
            sample_count=len(values),
            methods_used=methods_used,
        )

    def get_outlier_stats(
        self,
        values: np.ndarray,
        outlier_indices: list[int]
    ) -> dict:
        """
        Get statistics about outliers.

        Args:
            values: Original array of values
            outlier_indices: Indices of outliers

        Returns:
            Dictionary with keys 'count', 'percentage', 'min', 'max',
            'mean'; the numeric fields are None when no valid outliers exist
        """
        # Single definition of the "nothing to report" shape.
        empty = {
            'count': 0,
            'percentage': 0.0,
            'min': None,
            'max': None,
            'mean': None,
        }

        if not outlier_indices:
            return empty

        outlier_values = values[outlier_indices]
        valid_outliers = outlier_values[~np.isnan(outlier_values)]

        if len(valid_outliers) == 0:
            return empty

        return {
            'count': len(valid_outliers),
            'percentage': (len(valid_outliers) / len(values)) * 100,
            'min': float(np.min(valid_outliers)),
            'max': float(np.max(valid_outliers)),
            'mean': float(np.mean(valid_outliers)),
        }

    def suggest_handling(self, analysis: OutlierAnalysis) -> list[str]:
        """
        Suggest how to handle detected outliers.

        Args:
            analysis: Outlier analysis results

        Returns:
            List of human-readable suggestions
        """
        suggestions = []

        # Tiered advice keyed on the overall outlier rate.
        if analysis.outlier_percentage < 1:
            suggestions.append(
                "Low outlier rate (<1%): Consider investigating and removing "
                "if confirmed as errors"
            )
        elif analysis.outlier_percentage < 5:
            suggestions.append(
                "Moderate outlier rate (1-5%): Review outliers, consider "
                "robust statistical methods"
            )
        else:
            suggestions.append(
                "High outlier rate (>5%): Data may not follow expected distribution, "
                "review data collection process"
            )

        if len(analysis.consensus_outliers) > 0:
            suggestions.append(
                f"{len(analysis.consensus_outliers)} outliers flagged by multiple methods - "
                "strong evidence of anomalies"
            )

        return suggestions