panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,529 @@
1
+ """
2
+ Outlier detection and leverage diagnostics for panel data models.
3
+
4
+ This module implements various methods for detecting outliers and high-leverage
5
+ points in panel data, including:
6
+ - Univariate methods (IQR, Z-score)
7
+ - Multivariate methods (Mahalanobis distance)
8
+ - Regression diagnostics (standardized residuals, studentized residuals)
9
+ - Leverage diagnostics (hat values)
10
+
11
+ References
12
+ ----------
13
+ Cook, R. D., & Weisberg, S. (1982). Residuals and Influence in Regression.
14
+ Chapman and Hall.
15
+ Rousseeuw, P. J., & Leroy, A. M. (1987). Robust Regression and Outlier Detection.
16
+ John Wiley & Sons.
17
+ """
18
+
19
+ from typing import Optional, Dict, Any, Tuple, List, Union
20
+ import warnings
21
+ import numpy as np
22
+ import pandas as pd
23
+ from scipy import stats
24
+ from dataclasses import dataclass
25
+
26
+ from panelbox.core.results import PanelResults
27
+
28
+
29
@dataclass
class OutlierResults:
    """
    Container for outlier detection results.

    Attributes
    ----------
    outliers : pd.DataFrame
        One row per observation, with an ``is_outlier`` flag and the
        diagnostic statistics produced by the detection method.
    method : str
        Human-readable description of the method used for detection.
    threshold : float
        Threshold used for detection.
    n_outliers : int
        Number of outliers detected.
    """
    outliers: pd.DataFrame
    method: str
    threshold: float
    n_outliers: int

    def summary(self) -> str:
        """Generate a plain-text summary of the outlier detection results.

        Returns
        -------
        str
            Multi-line report: method, threshold, counts, percentage and
            (when any outliers were found) the top 10 flagged rows.
        """
        n_obs = len(self.outliers)
        lines = []
        lines.append("Outlier Detection Results")
        lines.append("=" * 70)
        lines.append(f"Method: {self.method}")
        lines.append(f"Threshold: {self.threshold}")
        lines.append(f"Outliers detected: {self.n_outliers} / {n_obs}")
        # Guard the empty-frame case (the original raised ZeroDivisionError).
        pct = 100 * self.n_outliers / n_obs if n_obs else 0.0
        lines.append(f"Percentage: {pct:.2f}%")

        if self.n_outliers > 0:
            lines.append("")
            lines.append("Top 10 outliers:")
            lines.append("-" * 70)
            top_outliers = self.outliers[self.outliers['is_outlier']].head(10)
            lines.append(top_outliers.to_string())

        return "\n".join(lines)
68
+
69
+
70
class OutlierDetector:
    """
    Outlier and leverage diagnostics for fitted panel data models.

    Offers univariate (IQR, z-score), multivariate (Mahalanobis
    distance) and residual-based (standardized / studentized) outlier
    detection, approximate leverage diagnostics, and a standard 2x2
    diagnostic plot panel.

    Parameters
    ----------
    results : PanelResults
        Fitted model results to analyze.
    verbose : bool, default=True
        Whether to print progress information.

    Attributes
    ----------
    outlier_results_ : OutlierResults
        Results after calling detection methods.

    Examples
    --------
    >>> import panelbox as pb
    >>> import pandas as pd
    >>>
    >>> data = pd.read_csv('panel_data.csv')
    >>> fe = pb.FixedEffects("y ~ x1 + x2", data, "entity_id", "time")
    >>> results = fe.fit()
    >>>
    >>> detector = pb.OutlierDetector(results)
    >>> outliers_iqr = detector.detect_outliers_univariate(method='iqr')
    >>> outliers_mahal = detector.detect_outliers_multivariate()
    >>> outliers_resid = detector.detect_outliers_residuals(method='standardized')
    >>> leverage = detector.detect_leverage_points()
    >>> detector.plot_diagnostics()

    Notes
    -----
    - Different methods may identify different outliers.
    - Combine multiple methods for robust detection.
    - Outliers should be investigated, not automatically removed.
    """

    def __init__(
        self,
        results: PanelResults,
        verbose: bool = True
    ):
        self.results = results
        self.verbose = verbose

        # Unpack the model and data handles once so every detection
        # method can reference them directly.
        model = results._model
        self.model = model
        self.data = model.data.data

        # Panel index columns (entity and time identifiers).
        self.entity_col = model.data.entity_col
        self.time_col = model.data.time_col

        # Populated by the detect_* methods.
        self.outlier_results_: Optional[OutlierResults] = None
143
+
144
+ def detect_outliers_univariate(
145
+ self,
146
+ variable: Optional[str] = None,
147
+ method: str = 'iqr',
148
+ threshold: float = 1.5
149
+ ) -> OutlierResults:
150
+ """
151
+ Detect outliers using univariate methods.
152
+
153
+ Parameters
154
+ ----------
155
+ variable : str, optional
156
+ Variable to check for outliers. If None, uses residuals.
157
+ method : {'iqr', 'zscore'}, default='iqr'
158
+ Detection method:
159
+
160
+ - 'iqr': Interquartile range method (Q1 - k*IQR, Q3 + k*IQR)
161
+ - 'zscore': Z-score method (|z| > threshold)
162
+ threshold : float, default=1.5
163
+ Threshold parameter:
164
+
165
+ - For IQR: multiplier for IQR (typically 1.5 or 3.0)
166
+ - For Z-score: threshold for |z| (typically 2.5 or 3.0)
167
+
168
+ Returns
169
+ -------
170
+ outlier_results : OutlierResults
171
+ Outlier detection results
172
+ """
173
+ if variable is None:
174
+ # Use residuals
175
+ values = self.results.resid
176
+ var_name = 'residuals'
177
+ else:
178
+ values = self.data[variable].values
179
+ var_name = variable
180
+
181
+ if method == 'iqr':
182
+ # IQR method
183
+ Q1 = np.percentile(values, 25)
184
+ Q3 = np.percentile(values, 75)
185
+ IQR = Q3 - Q1
186
+
187
+ lower_bound = Q1 - threshold * IQR
188
+ upper_bound = Q3 + threshold * IQR
189
+
190
+ is_outlier = (values < lower_bound) | (values > upper_bound)
191
+ distance = np.minimum(
192
+ np.abs(values - lower_bound),
193
+ np.abs(values - upper_bound)
194
+ )
195
+
196
+ method_name = f"IQR (k={threshold})"
197
+
198
+ elif method == 'zscore':
199
+ # Z-score method
200
+ mean = np.mean(values)
201
+ std = np.std(values)
202
+ z_scores = (values - mean) / std
203
+
204
+ is_outlier = np.abs(z_scores) > threshold
205
+ distance = np.abs(z_scores)
206
+
207
+ method_name = f"Z-score (threshold={threshold})"
208
+
209
+ else:
210
+ raise ValueError(f"Unknown method: {method}. Use 'iqr' or 'zscore'")
211
+
212
+ # Create results DataFrame
213
+ outliers_df = pd.DataFrame({
214
+ 'entity': self.data[self.entity_col].values,
215
+ 'time': self.data[self.time_col].values,
216
+ 'value': values,
217
+ 'is_outlier': is_outlier,
218
+ 'distance': distance
219
+ })
220
+
221
+ n_outliers = is_outlier.sum()
222
+
223
+ self.outlier_results_ = OutlierResults(
224
+ outliers=outliers_df,
225
+ method=f"{method_name} on {var_name}",
226
+ threshold=threshold,
227
+ n_outliers=n_outliers
228
+ )
229
+
230
+ if self.verbose:
231
+ print(f"Detected {n_outliers} outliers using {method_name}")
232
+
233
+ return self.outlier_results_
234
+
235
+ def detect_outliers_multivariate(
236
+ self,
237
+ threshold: float = 3.0
238
+ ) -> OutlierResults:
239
+ """
240
+ Detect outliers using Mahalanobis distance.
241
+
242
+ Parameters
243
+ ----------
244
+ threshold : float, default=3.0
245
+ Threshold for Mahalanobis distance (in units of chi-square quantile)
246
+
247
+ Returns
248
+ -------
249
+ outlier_results : OutlierResults
250
+ Outlier detection results
251
+
252
+ Notes
253
+ -----
254
+ Mahalanobis distance accounts for correlations between variables
255
+ and is more appropriate for multivariate outlier detection than
256
+ univariate methods.
257
+ """
258
+ # Get design matrix (X)
259
+ from patsy import dmatrix
260
+ formula_rhs = self.results.formula.split('~')[1].strip()
261
+ X = dmatrix(formula_rhs, self.data, return_type='dataframe')
262
+
263
+ # Compute Mahalanobis distance
264
+ mean = X.mean().values
265
+ # Use covariance matrix, handling potential singularity
266
+ try:
267
+ cov = np.cov(X.values.T)
268
+ cov_inv = np.linalg.inv(cov)
269
+ except np.linalg.LinAlgError:
270
+ # Use pseudo-inverse if singular
271
+ warnings.warn("Covariance matrix is singular, using pseudo-inverse")
272
+ cov_inv = np.linalg.pinv(np.cov(X.values.T))
273
+
274
+ diff = X.values - mean
275
+ mahal_dist = np.sqrt(np.sum(diff @ cov_inv * diff, axis=1))
276
+
277
+ # Threshold based on chi-square distribution
278
+ df = X.shape[1]
279
+ chi2_threshold = stats.chi2.ppf(0.975, df) # 97.5th percentile
280
+ threshold_value = np.sqrt(chi2_threshold) * threshold
281
+
282
+ is_outlier = mahal_dist > threshold_value
283
+
284
+ # Create results DataFrame
285
+ outliers_df = pd.DataFrame({
286
+ 'entity': self.data[self.entity_col].values,
287
+ 'time': self.data[self.time_col].values,
288
+ 'mahalanobis_distance': mahal_dist,
289
+ 'is_outlier': is_outlier,
290
+ 'distance': mahal_dist
291
+ })
292
+
293
+ n_outliers = is_outlier.sum()
294
+
295
+ self.outlier_results_ = OutlierResults(
296
+ outliers=outliers_df,
297
+ method=f"Mahalanobis distance",
298
+ threshold=threshold_value,
299
+ n_outliers=n_outliers
300
+ )
301
+
302
+ if self.verbose:
303
+ print(f"Detected {n_outliers} outliers using Mahalanobis distance")
304
+
305
+ return self.outlier_results_
306
+
307
+ def detect_outliers_residuals(
308
+ self,
309
+ method: str = 'standardized',
310
+ threshold: float = 2.5
311
+ ) -> OutlierResults:
312
+ """
313
+ Detect outliers using residual-based methods.
314
+
315
+ Parameters
316
+ ----------
317
+ method : {'standardized', 'studentized'}, default='standardized'
318
+ Type of residuals:
319
+
320
+ - 'standardized': Residuals / sqrt(MSE)
321
+ - 'studentized': Residuals / sqrt(MSE * (1 - h_ii))
322
+ threshold : float, default=2.5
323
+ Threshold for absolute residual value
324
+
325
+ Returns
326
+ -------
327
+ outlier_results : OutlierResults
328
+ Outlier detection results
329
+ """
330
+ residuals = self.results.resid
331
+
332
+ if method == 'standardized':
333
+ # Standardized residuals: r / sqrt(MSE)
334
+ mse = np.sum(residuals ** 2) / self.results.df_resid
335
+ std_residuals = residuals / np.sqrt(mse)
336
+ is_outlier = np.abs(std_residuals) > threshold
337
+
338
+ outliers_df = pd.DataFrame({
339
+ 'entity': self.data[self.entity_col].values,
340
+ 'time': self.data[self.time_col].values,
341
+ 'residual': residuals,
342
+ 'standardized_residual': std_residuals,
343
+ 'is_outlier': is_outlier,
344
+ 'distance': np.abs(std_residuals)
345
+ })
346
+
347
+ elif method == 'studentized':
348
+ # Studentized residuals require leverage values
349
+ # For panel data, this is approximate
350
+ mse = np.sum(residuals ** 2) / self.results.df_resid
351
+
352
+ # Approximate leverage (would need full hat matrix for exact)
353
+ n = len(residuals)
354
+ k = len(self.results.params)
355
+ approx_leverage = k / n # Average leverage
356
+
357
+ studentized_residuals = residuals / np.sqrt(mse * (1 - approx_leverage))
358
+ is_outlier = np.abs(studentized_residuals) > threshold
359
+
360
+ outliers_df = pd.DataFrame({
361
+ 'entity': self.data[self.entity_col].values,
362
+ 'time': self.data[self.time_col].values,
363
+ 'residual': residuals,
364
+ 'studentized_residual': studentized_residuals,
365
+ 'is_outlier': is_outlier,
366
+ 'distance': np.abs(studentized_residuals)
367
+ })
368
+
369
+ else:
370
+ raise ValueError(f"Unknown method: {method}")
371
+
372
+ n_outliers = is_outlier.sum()
373
+
374
+ self.outlier_results_ = OutlierResults(
375
+ outliers=outliers_df,
376
+ method=f"{method.capitalize()} residuals",
377
+ threshold=threshold,
378
+ n_outliers=n_outliers
379
+ )
380
+
381
+ if self.verbose:
382
+ print(f"Detected {n_outliers} outliers using {method} residuals")
383
+
384
+ return self.outlier_results_
385
+
386
+ def detect_leverage_points(
387
+ self,
388
+ threshold: Optional[float] = None
389
+ ) -> pd.DataFrame:
390
+ """
391
+ Detect high-leverage points.
392
+
393
+ Parameters
394
+ ----------
395
+ threshold : float, optional
396
+ Threshold for leverage. If None, uses 2*k/n (common rule of thumb)
397
+ where k is number of parameters and n is number of observations
398
+
399
+ Returns
400
+ -------
401
+ leverage_df : pd.DataFrame
402
+ DataFrame with leverage values and flags
403
+
404
+ Notes
405
+ -----
406
+ For panel data with fixed effects, exact leverage calculation
407
+ requires the full hat matrix, which can be memory-intensive.
408
+ This implementation provides an approximation.
409
+ """
410
+ n = len(self.results.resid)
411
+ k = len(self.results.params)
412
+
413
+ if threshold is None:
414
+ threshold = 2 * k / n
415
+
416
+ # For panel FE models, this is an approximation
417
+ # True leverage would require hat matrix: H = X(X'X)^-1 X'
418
+ # We approximate using distance from means
419
+
420
+ from patsy import dmatrix
421
+ formula_rhs = self.results.formula.split('~')[1].strip()
422
+ X = dmatrix(formula_rhs, self.data, return_type='dataframe')
423
+
424
+ # Approximate leverage using Mahalanobis distance
425
+ mean = X.mean().values
426
+ try:
427
+ cov = np.cov(X.values.T)
428
+ cov_inv = np.linalg.inv(cov)
429
+ except np.linalg.LinAlgError:
430
+ warnings.warn("Using pseudo-inverse for leverage calculation")
431
+ cov_inv = np.linalg.pinv(np.cov(X.values.T))
432
+
433
+ diff = X.values - mean
434
+ mahal_dist_sq = np.sum(diff @ cov_inv * diff, axis=1)
435
+
436
+ # Convert to approximate leverage (0 to 1 scale)
437
+ leverage = mahal_dist_sq / (n - 1) + 1 / n
438
+
439
+ is_high_leverage = leverage > threshold
440
+
441
+ leverage_df = pd.DataFrame({
442
+ 'entity': self.data[self.entity_col].values,
443
+ 'time': self.data[self.time_col].values,
444
+ 'leverage': leverage,
445
+ 'is_high_leverage': is_high_leverage
446
+ })
447
+
448
+ n_high_leverage = is_high_leverage.sum()
449
+
450
+ if self.verbose:
451
+ print(f"Detected {n_high_leverage} high-leverage points (threshold={threshold:.4f})")
452
+
453
+ return leverage_df
454
+
455
+ def plot_diagnostics(
456
+ self,
457
+ save_path: Optional[str] = None
458
+ ):
459
+ """
460
+ Plot diagnostic plots for outlier detection.
461
+
462
+ Parameters
463
+ ----------
464
+ save_path : str, optional
465
+ Path to save the plot. If None, displays the plot.
466
+
467
+ Raises
468
+ ------
469
+ ImportError
470
+ If matplotlib is not installed
471
+ """
472
+ try:
473
+ import matplotlib.pyplot as plt
474
+ except ImportError:
475
+ raise ImportError("matplotlib is required for plotting. "
476
+ "Install with: pip install matplotlib")
477
+
478
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
479
+
480
+ residuals = self.results.resid
481
+ fitted = self.results.fittedvalues
482
+
483
+ # Plot 1: Residuals vs Fitted
484
+ ax1 = axes[0, 0]
485
+ ax1.scatter(fitted, residuals, alpha=0.5, s=20)
486
+ ax1.axhline(y=0, color='r', linestyle='--', linewidth=1)
487
+ ax1.set_xlabel('Fitted Values')
488
+ ax1.set_ylabel('Residuals')
489
+ ax1.set_title('Residuals vs Fitted')
490
+ ax1.grid(True, alpha=0.3)
491
+
492
+ # Plot 2: Q-Q plot
493
+ ax2 = axes[0, 1]
494
+ stats.probplot(residuals, dist="norm", plot=ax2)
495
+ ax2.set_title('Normal Q-Q Plot')
496
+ ax2.grid(True, alpha=0.3)
497
+
498
+ # Plot 3: Scale-Location (sqrt of standardized residuals vs fitted)
499
+ ax3 = axes[1, 0]
500
+ mse = np.sum(residuals ** 2) / self.results.df_resid
501
+ std_residuals = residuals / np.sqrt(mse)
502
+ ax3.scatter(fitted, np.sqrt(np.abs(std_residuals)), alpha=0.5, s=20)
503
+ ax3.set_xlabel('Fitted Values')
504
+ ax3.set_ylabel('√|Standardized Residuals|')
505
+ ax3.set_title('Scale-Location Plot')
506
+ ax3.grid(True, alpha=0.3)
507
+
508
+ # Plot 4: Histogram of residuals
509
+ ax4 = axes[1, 1]
510
+ ax4.hist(residuals, bins=30, density=True, alpha=0.7, edgecolor='black')
511
+
512
+ # Overlay normal distribution
513
+ mu, sigma = residuals.mean(), residuals.std()
514
+ x = np.linspace(residuals.min(), residuals.max(), 100)
515
+ ax4.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label='Normal')
516
+ ax4.set_xlabel('Residuals')
517
+ ax4.set_ylabel('Density')
518
+ ax4.set_title('Distribution of Residuals')
519
+ ax4.legend()
520
+ ax4.grid(True, alpha=0.3)
521
+
522
+ plt.tight_layout()
523
+
524
+ if save_path:
525
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
526
+ if self.verbose:
527
+ print(f"Plot saved to {save_path}")
528
+ else:
529
+ plt.show()