panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Outlier detection and leverage diagnostics for panel data models.
|
|
3
|
+
|
|
4
|
+
This module implements various methods for detecting outliers and high-leverage
|
|
5
|
+
points in panel data, including:
|
|
6
|
+
- Univariate methods (IQR, Z-score)
|
|
7
|
+
- Multivariate methods (Mahalanobis distance)
|
|
8
|
+
- Regression diagnostics (standardized residuals, studentized residuals)
|
|
9
|
+
- Leverage diagnostics (hat values)
|
|
10
|
+
|
|
11
|
+
References
|
|
12
|
+
----------
|
|
13
|
+
Cook, R. D., & Weisberg, S. (1982). Residuals and Influence in Regression.
|
|
14
|
+
Chapman and Hall.
|
|
15
|
+
Rousseeuw, P. J., & Leroy, A. M. (1987). Robust Regression and Outlier Detection.
|
|
16
|
+
John Wiley & Sons.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from typing import Optional, Dict, Any, Tuple, List, Union
|
|
20
|
+
import warnings
|
|
21
|
+
import numpy as np
|
|
22
|
+
import pandas as pd
|
|
23
|
+
from scipy import stats
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
|
|
26
|
+
from panelbox.core.results import PanelResults
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class OutlierResults:
    """
    Container for outlier detection results.

    Attributes
    ----------
    outliers : pd.DataFrame
        DataFrame with outlier flags and diagnostic statistics
    method : str
        Method used for detection
    threshold : float
        Threshold used for detection
    n_outliers : int
        Number of outliers detected
    """
    outliers: pd.DataFrame
    method: str
    threshold: float
    n_outliers: int

    def summary(self) -> str:
        """Generate a human-readable summary of outlier detection results."""
        n_total = len(self.outliers)

        lines = []
        lines.append("Outlier Detection Results")
        lines.append("=" * 70)
        lines.append(f"Method: {self.method}")
        lines.append(f"Threshold: {self.threshold}")
        lines.append(f"Outliers detected: {self.n_outliers} / {n_total}")
        # Guard against ZeroDivisionError when the results frame is empty.
        if n_total > 0:
            lines.append(f"Percentage: {100 * self.n_outliers / n_total:.2f}%")
        else:
            lines.append("Percentage: n/a (no observations)")

        if self.n_outliers > 0:
            lines.append("")
            lines.append("Top 10 outliers:")
            lines.append("-" * 70)
            top_outliers = self.outliers[self.outliers['is_outlier']].head(10)
            lines.append(top_outliers.to_string())

        return "\n".join(lines)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class OutlierDetector:
    """
    Outlier detection for panel data models.

    This class provides various methods for detecting outliers and
    high-leverage points in panel data regression models.

    Parameters
    ----------
    results : PanelResults
        Fitted model results to analyze
    verbose : bool, default=True
        Whether to print progress information

    Attributes
    ----------
    outlier_results_ : OutlierResults
        Results after calling detection methods

    Examples
    --------
    >>> import panelbox as pb
    >>> import pandas as pd
    >>>
    >>> # Fit model
    >>> data = pd.read_csv('panel_data.csv')
    >>> fe = pb.FixedEffects("y ~ x1 + x2", data, "entity_id", "time")
    >>> results = fe.fit()
    >>>
    >>> # Detect outliers
    >>> detector = pb.OutlierDetector(results)
    >>>
    >>> # Univariate methods
    >>> outliers_iqr = detector.detect_outliers_univariate(method='iqr')
    >>> outliers_zscore = detector.detect_outliers_univariate(method='zscore')
    >>>
    >>> # Multivariate method
    >>> outliers_mahal = detector.detect_outliers_multivariate()
    >>>
    >>> # Regression diagnostics
    >>> outliers_resid = detector.detect_outliers_residuals(method='standardized')
    >>>
    >>> # Leverage points
    >>> leverage = detector.detect_leverage_points()
    >>>
    >>> # Plot diagnostics
    >>> detector.plot_diagnostics()

    Notes
    -----
    - Different methods may identify different outliers
    - Combine multiple methods for robust detection
    - Outliers should be investigated, not automatically removed
    """

    def __init__(
        self,
        results: PanelResults,
        verbose: bool = True
    ):
        self.results = results
        self.verbose = verbose

        # Extract model information
        self.model = results._model
        self.data = self.model.data.data

        # Get entity and time columns
        self.entity_col = self.model.data.entity_col
        self.time_col = self.model.data.time_col

        # Results storage
        self.outlier_results_: Optional[OutlierResults] = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _design_matrix(self) -> pd.DataFrame:
        """Build the regressor design matrix from the model formula RHS."""
        from patsy import dmatrix

        formula_rhs = self.results.formula.split('~')[1].strip()
        return dmatrix(formula_rhs, self.data, return_type='dataframe')

    @staticmethod
    def _mahalanobis_sq(X: pd.DataFrame) -> np.ndarray:
        """
        Squared Mahalanobis distance of each row of X from the column means.

        Falls back to the Moore-Penrose pseudo-inverse when the covariance
        matrix is singular (e.g. constant or collinear regressor columns).
        """
        mean = X.mean().values
        cov = np.cov(X.values.T)
        try:
            cov_inv = np.linalg.inv(cov)
        except np.linalg.LinAlgError:
            warnings.warn("Covariance matrix is singular, using pseudo-inverse")
            cov_inv = np.linalg.pinv(cov)

        diff = X.values - mean
        return np.sum(diff @ cov_inv * diff, axis=1)

    def _base_frame(self, **columns) -> pd.DataFrame:
        """Assemble a results DataFrame keyed by entity/time plus diagnostics."""
        frame = {
            'entity': self.data[self.entity_col].values,
            'time': self.data[self.time_col].values,
        }
        frame.update(columns)
        return pd.DataFrame(frame)

    # ------------------------------------------------------------------
    # Detection methods
    # ------------------------------------------------------------------

    def detect_outliers_univariate(
        self,
        variable: Optional[str] = None,
        method: str = 'iqr',
        threshold: float = 1.5
    ) -> OutlierResults:
        """
        Detect outliers using univariate methods.

        Parameters
        ----------
        variable : str, optional
            Variable to check for outliers. If None, uses residuals.
        method : {'iqr', 'zscore'}, default='iqr'
            Detection method:

            - 'iqr': Interquartile range method (Q1 - k*IQR, Q3 + k*IQR)
            - 'zscore': Z-score method (|z| > threshold)
        threshold : float, default=1.5
            Threshold parameter:

            - For IQR: multiplier for IQR (typically 1.5 or 3.0)
            - For Z-score: threshold for |z| (typically 2.5 or 3.0)

        Returns
        -------
        outlier_results : OutlierResults
            Outlier detection results
        """
        if variable is None:
            # Default to model residuals
            values = np.asarray(self.results.resid)
            var_name = 'residuals'
        else:
            values = self.data[variable].values
            var_name = variable

        if method == 'iqr':
            # Tukey fences: [Q1 - k*IQR, Q3 + k*IQR]
            Q1 = np.percentile(values, 25)
            Q3 = np.percentile(values, 75)
            IQR = Q3 - Q1

            lower_bound = Q1 - threshold * IQR
            upper_bound = Q3 + threshold * IQR

            is_outlier = (values < lower_bound) | (values > upper_bound)
            # Distance to the nearest fence (severity proxy)
            distance = np.minimum(
                np.abs(values - lower_bound),
                np.abs(values - upper_bound)
            )

            method_name = f"IQR (k={threshold})"

        elif method == 'zscore':
            mean = np.mean(values)
            std = np.std(values)
            if std == 0:
                # Constant series has no dispersion; avoid division by zero.
                warnings.warn(
                    "Values have zero variance; no z-score outliers detected"
                )
                z_scores = np.zeros(len(values))
            else:
                z_scores = (values - mean) / std

            is_outlier = np.abs(z_scores) > threshold
            distance = np.abs(z_scores)

            method_name = f"Z-score (threshold={threshold})"

        else:
            raise ValueError(f"Unknown method: {method}. Use 'iqr' or 'zscore'")

        outliers_df = self._base_frame(
            value=values,
            is_outlier=is_outlier,
            distance=distance
        )

        # Cast to plain int to match the OutlierResults annotation
        n_outliers = int(is_outlier.sum())

        self.outlier_results_ = OutlierResults(
            outliers=outliers_df,
            method=f"{method_name} on {var_name}",
            threshold=threshold,
            n_outliers=n_outliers
        )

        if self.verbose:
            print(f"Detected {n_outliers} outliers using {method_name}")

        return self.outlier_results_

    def detect_outliers_multivariate(
        self,
        threshold: float = 3.0
    ) -> OutlierResults:
        """
        Detect outliers using Mahalanobis distance.

        Parameters
        ----------
        threshold : float, default=3.0
            Threshold for Mahalanobis distance (in units of chi-square quantile)

        Returns
        -------
        outlier_results : OutlierResults
            Outlier detection results

        Notes
        -----
        Mahalanobis distance accounts for correlations between variables
        and is more appropriate for multivariate outlier detection than
        univariate methods.
        """
        X = self._design_matrix()
        mahal_dist = np.sqrt(self._mahalanobis_sq(X))

        # Squared distances are asymptotically chi-square with df = #columns;
        # scale the 97.5th-percentile cutoff by the user-supplied multiplier.
        df = X.shape[1]
        chi2_threshold = stats.chi2.ppf(0.975, df)  # 97.5th percentile
        threshold_value = np.sqrt(chi2_threshold) * threshold

        is_outlier = mahal_dist > threshold_value

        outliers_df = self._base_frame(
            mahalanobis_distance=mahal_dist,
            is_outlier=is_outlier,
            distance=mahal_dist
        )

        n_outliers = int(is_outlier.sum())

        self.outlier_results_ = OutlierResults(
            outliers=outliers_df,
            method="Mahalanobis distance",
            threshold=threshold_value,
            n_outliers=n_outliers
        )

        if self.verbose:
            print(f"Detected {n_outliers} outliers using Mahalanobis distance")

        return self.outlier_results_

    def detect_outliers_residuals(
        self,
        method: str = 'standardized',
        threshold: float = 2.5
    ) -> OutlierResults:
        """
        Detect outliers using residual-based methods.

        Parameters
        ----------
        method : {'standardized', 'studentized'}, default='standardized'
            Type of residuals:

            - 'standardized': Residuals / sqrt(MSE)
            - 'studentized': Residuals / sqrt(MSE * (1 - h_ii))
        threshold : float, default=2.5
            Threshold for absolute residual value

        Returns
        -------
        outlier_results : OutlierResults
            Outlier detection results
        """
        residuals = self.results.resid
        mse = np.sum(residuals ** 2) / self.results.df_resid

        if method == 'standardized':
            # Standardized residuals: r / sqrt(MSE)
            scaled = residuals / np.sqrt(mse)
            column = 'standardized_residual'

        elif method == 'studentized':
            # Exact studentization needs per-observation leverage from the
            # hat matrix; approximate with the average leverage k / n.
            n = len(residuals)
            k = len(self.results.params)
            approx_leverage = k / n

            scaled = residuals / np.sqrt(mse * (1 - approx_leverage))
            column = 'studentized_residual'

        else:
            raise ValueError(f"Unknown method: {method}")

        is_outlier = np.abs(scaled) > threshold

        outliers_df = self._base_frame(
            residual=residuals,
            **{column: scaled},
            is_outlier=is_outlier,
            distance=np.abs(scaled)
        )

        n_outliers = int(is_outlier.sum())

        self.outlier_results_ = OutlierResults(
            outliers=outliers_df,
            method=f"{method.capitalize()} residuals",
            threshold=threshold,
            n_outliers=n_outliers
        )

        if self.verbose:
            print(f"Detected {n_outliers} outliers using {method} residuals")

        return self.outlier_results_

    def detect_leverage_points(
        self,
        threshold: Optional[float] = None
    ) -> pd.DataFrame:
        """
        Detect high-leverage points.

        Parameters
        ----------
        threshold : float, optional
            Threshold for leverage. If None, uses 2*k/n (common rule of thumb)
            where k is number of parameters and n is number of observations

        Returns
        -------
        leverage_df : pd.DataFrame
            DataFrame with leverage values and flags

        Notes
        -----
        For panel data with fixed effects, exact leverage calculation
        requires the full hat matrix, which can be memory-intensive.
        This implementation provides an approximation.
        """
        n = len(self.results.resid)
        k = len(self.results.params)

        if threshold is None:
            # Rule of thumb: flag observations above twice the average leverage
            threshold = 2 * k / n

        # Approximate leverage from regressor-space Mahalanobis distance:
        # for a centered design matrix, h_ii = 1/n + d_i^2 / (n - 1)
        X = self._design_matrix()
        mahal_dist_sq = self._mahalanobis_sq(X)
        leverage = mahal_dist_sq / (n - 1) + 1 / n

        is_high_leverage = leverage > threshold

        leverage_df = self._base_frame(
            leverage=leverage,
            is_high_leverage=is_high_leverage
        )

        n_high_leverage = int(is_high_leverage.sum())

        if self.verbose:
            print(f"Detected {n_high_leverage} high-leverage points (threshold={threshold:.4f})")

        return leverage_df

    def plot_diagnostics(
        self,
        save_path: Optional[str] = None
    ):
        """
        Plot diagnostic plots for outlier detection.

        Parameters
        ----------
        save_path : str, optional
            Path to save the plot. If None, displays the plot.

        Raises
        ------
        ImportError
            If matplotlib is not installed
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            raise ImportError("matplotlib is required for plotting. "
                              "Install with: pip install matplotlib")

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        residuals = self.results.resid
        fitted = self.results.fittedvalues

        # Plot 1: Residuals vs Fitted
        ax1 = axes[0, 0]
        ax1.scatter(fitted, residuals, alpha=0.5, s=20)
        ax1.axhline(y=0, color='r', linestyle='--', linewidth=1)
        ax1.set_xlabel('Fitted Values')
        ax1.set_ylabel('Residuals')
        ax1.set_title('Residuals vs Fitted')
        ax1.grid(True, alpha=0.3)

        # Plot 2: Q-Q plot
        ax2 = axes[0, 1]
        stats.probplot(residuals, dist="norm", plot=ax2)
        ax2.set_title('Normal Q-Q Plot')
        ax2.grid(True, alpha=0.3)

        # Plot 3: Scale-Location (sqrt of standardized residuals vs fitted)
        ax3 = axes[1, 0]
        mse = np.sum(residuals ** 2) / self.results.df_resid
        std_residuals = residuals / np.sqrt(mse)
        ax3.scatter(fitted, np.sqrt(np.abs(std_residuals)), alpha=0.5, s=20)
        ax3.set_xlabel('Fitted Values')
        ax3.set_ylabel('√|Standardized Residuals|')
        ax3.set_title('Scale-Location Plot')
        ax3.grid(True, alpha=0.3)

        # Plot 4: Histogram of residuals with a fitted normal overlay
        ax4 = axes[1, 1]
        ax4.hist(residuals, bins=30, density=True, alpha=0.7, edgecolor='black')

        mu, sigma = residuals.mean(), residuals.std()
        x = np.linspace(residuals.min(), residuals.max(), 100)
        ax4.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label='Normal')
        ax4.set_xlabel('Residuals')
        ax4.set_ylabel('Density')
        ax4.set_title('Distribution of Residuals')
        ax4.legend()
        ax4.grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            if self.verbose:
                print(f"Plot saved to {save_path}")
        else:
            plt.show()
|