panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries; it is provided for informational purposes only.
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,809 @@
"""
Sensitivity Analysis for Panel Data Models.

This module provides tools for assessing the robustness of panel data estimation
results through various sensitivity analysis methods including:
- Leave-one-out analysis (entities and periods)
- Subsample sensitivity analysis
- Visualization of sensitivity results

Author: PanelBox Development Team
Date: 2026-01-22
"""

import numpy as np
import pandas as pd
import warnings
from typing import Optional, Union, Dict, List, Tuple, Callable
from dataclasses import dataclass

# Optional matplotlib import
try:
    import matplotlib.pyplot as plt
    from matplotlib.figure import Figure
    HAS_MATPLOTLIB = True
except ImportError:
    HAS_MATPLOTLIB = False
    Figure = None

from panelbox.core.results import PanelResults


@dataclass
class SensitivityResults:
    """
    Container for sensitivity analysis results.

    Attributes
    ----------
    method : str
        Type of sensitivity analysis performed
    estimates : pd.DataFrame
        Parameter estimates for each subsample
    std_errors : pd.DataFrame
        Standard errors for each subsample
    statistics : Dict
        Summary statistics (max deviation, mean estimate, etc.)
    influential_units : List
        List of influential units (entities or periods)
    subsample_info : pd.DataFrame
        Information about each subsample used
    """
    method: str
    estimates: pd.DataFrame
    std_errors: pd.DataFrame
    statistics: Dict
    influential_units: List
    subsample_info: pd.DataFrame


class SensitivityAnalysis:
    """
    Sensitivity Analysis for Panel Data Models.

    This class provides comprehensive tools for assessing the sensitivity of
    panel data model estimates to various changes in the sample composition.

    Parameters
    ----------
    results : PanelResults
        Fitted panel model results object
    show_progress : bool, default=False
        Whether to display progress bar during estimation

    Attributes
    ----------
    results : PanelResults
        Original fitted model results
    model : PanelModel
        Original panel model object
    params : pd.Series
        Original parameter estimates
    std_errors : pd.Series
        Original standard errors

    Examples
    --------
    >>> import panelbox as pb
    >>>
    >>> # Fit model
    >>> fe = pb.FixedEffects("y ~ x1 + x2", data, "entity", "time")
    >>> results = fe.fit()
    >>>
    >>> # Sensitivity analysis
    >>> sensitivity = pb.SensitivityAnalysis(results)
    >>>
    >>> # Leave-one-out analysis
    >>> loo_entities = sensitivity.leave_one_out_entities()
    >>> loo_periods = sensitivity.leave_one_out_periods()
    >>>
    >>> # Subset sensitivity
    >>> subset_results = sensitivity.subset_sensitivity(n_subsamples=20)
    >>>
    >>> # Visualize
    >>> fig = sensitivity.plot_sensitivity(loo_entities)
    >>> plt.show()
    """

    def __init__(
        self,
        results: PanelResults,
        show_progress: bool = False
    ):
        """Initialize sensitivity analysis."""
        self.results = results

        # Get model from results
        if results._model is None:
            raise ValueError(
                "Results object must contain a reference to the original model. "
                "Ensure the model stores a reference to itself in results._model"
            )

        self.model = results._model
        self.params = results.params
        self.std_errors = results.std_errors
        self.show_progress = show_progress

        # Store original data info
        self.entity_col = self.model.data.entity_col
        self.time_col = self.model.data.time_col
        self.data = self.model.data.data.copy()

        # Get unique entities and time periods
        self.entities = sorted(self.data[self.entity_col].unique())
        self.time_periods = sorted(self.data[self.time_col].unique())

        self.n_entities = len(self.entities)
        self.n_periods = len(self.time_periods)

    def leave_one_out_entities(
        self,
        influence_threshold: float = 2.0
    ) -> SensitivityResults:
        """
        Leave-one-out analysis by entities.

        Removes one entity at a time and re-estimates the model to assess
        the influence of each entity on parameter estimates.

        Parameters
        ----------
        influence_threshold : float, default=2.0
            Threshold for identifying influential entities (in standard deviations)

        Returns
        -------
        SensitivityResults
            Results containing estimates for each entity left out

        Notes
        -----
        An entity is considered influential if removing it causes parameter
        estimates to deviate by more than `influence_threshold` standard
        deviations from the original estimates.

        Examples
        --------
        >>> sensitivity = pb.SensitivityAnalysis(results)
        >>> loo_results = sensitivity.leave_one_out_entities()
        >>> print(loo_results.statistics)
        >>> print(loo_results.influential_units)
        """
        if self.show_progress:
            try:
                from tqdm import tqdm
                iterator = tqdm(self.entities, desc="LOO Entities")
            except ImportError:
                iterator = self.entities
                warnings.warn("Install tqdm for progress bars: pip install tqdm")
        else:
            iterator = self.entities

        estimates_list = []
        std_errors_list = []
        subsample_info = []

        for entity in iterator:
            # Create subsample excluding this entity
            subsample = self.data[self.data[self.entity_col] != entity].copy()

            try:
                # Refit model on subsample
                subsample_model = self._create_model(subsample)
                subsample_results = subsample_model.fit()

                estimates_list.append(subsample_results.params.values)
                std_errors_list.append(subsample_results.std_errors.values)

                subsample_info.append({
                    'excluded': entity,
                    'n_obs': len(subsample),
                    'converged': True
                })

            except Exception as e:
                # If estimation fails, use NaN
                estimates_list.append(np.full(len(self.params), np.nan))
                std_errors_list.append(np.full(len(self.params), np.nan))

                subsample_info.append({
                    'excluded': entity,
                    'n_obs': len(subsample),
                    'converged': False
                })

                if self.show_progress:
                    warnings.warn(f"Failed to estimate without entity {entity}: {e}")

        # Convert to DataFrames
        estimates_df = pd.DataFrame(
            estimates_list,
            index=[f"excl_{e}" for e in self.entities],
            columns=self.params.index
        )

        std_errors_df = pd.DataFrame(
            std_errors_list,
            index=[f"excl_{e}" for e in self.entities],
            columns=self.params.index
        )

        subsample_info_df = pd.DataFrame(subsample_info)

        # Calculate statistics
        statistics = self._calculate_statistics(
            estimates_df,
            influence_threshold
        )

        # Identify influential entities
        influential_units = self._identify_influential_units(
            estimates_df,
            influence_threshold
        )

        return SensitivityResults(
            method='leave_one_out_entities',
            estimates=estimates_df,
            std_errors=std_errors_df,
            statistics=statistics,
            influential_units=influential_units,
            subsample_info=subsample_info_df
        )

    def leave_one_out_periods(
        self,
        influence_threshold: float = 2.0
    ) -> SensitivityResults:
        """
        Leave-one-out analysis by time periods.

        Removes one time period at a time and re-estimates the model to assess
        the influence of each time period on parameter estimates.

        Parameters
        ----------
        influence_threshold : float, default=2.0
            Threshold for identifying influential periods (in standard deviations)

        Returns
        -------
        SensitivityResults
            Results containing estimates for each period left out

        Notes
        -----
        A time period is considered influential if removing it causes parameter
        estimates to deviate by more than `influence_threshold` standard
        deviations from the original estimates.

        Examples
        --------
        >>> sensitivity = pb.SensitivityAnalysis(results)
        >>> loo_results = sensitivity.leave_one_out_periods()
        >>> print(loo_results.statistics)
        >>> print(loo_results.influential_units)
        """
        if self.show_progress:
            try:
                from tqdm import tqdm
                iterator = tqdm(self.time_periods, desc="LOO Periods")
            except ImportError:
                iterator = self.time_periods
                warnings.warn("Install tqdm for progress bars: pip install tqdm")
        else:
            iterator = self.time_periods

        estimates_list = []
        std_errors_list = []
        subsample_info = []

        for period in iterator:
            # Create subsample excluding this period
            subsample = self.data[self.data[self.time_col] != period].copy()

            try:
                # Refit model on subsample
                subsample_model = self._create_model(subsample)
                subsample_results = subsample_model.fit()

                estimates_list.append(subsample_results.params.values)
                std_errors_list.append(subsample_results.std_errors.values)

                subsample_info.append({
                    'excluded': period,
                    'n_obs': len(subsample),
                    'converged': True
                })

            except Exception as e:
                # If estimation fails, use NaN
                estimates_list.append(np.full(len(self.params), np.nan))
                std_errors_list.append(np.full(len(self.params), np.nan))

                subsample_info.append({
                    'excluded': period,
                    'n_obs': len(subsample),
                    'converged': False
                })

                if self.show_progress:
                    warnings.warn(f"Failed to estimate without period {period}: {e}")

        # Convert to DataFrames
        estimates_df = pd.DataFrame(
            estimates_list,
            index=[f"excl_{t}" for t in self.time_periods],
            columns=self.params.index
        )

        std_errors_df = pd.DataFrame(
            std_errors_list,
            index=[f"excl_{t}" for t in self.time_periods],
            columns=self.params.index
        )

        subsample_info_df = pd.DataFrame(subsample_info)

        # Calculate statistics
        statistics = self._calculate_statistics(
            estimates_df,
            influence_threshold
        )

        # Identify influential periods
        influential_units = self._identify_influential_units(
            estimates_df,
            influence_threshold
        )

        return SensitivityResults(
            method='leave_one_out_periods',
            estimates=estimates_df,
            std_errors=std_errors_df,
            statistics=statistics,
            influential_units=influential_units,
            subsample_info=subsample_info_df
        )

    def subset_sensitivity(
        self,
        n_subsamples: int = 20,
        subsample_size: float = 0.8,
        stratify: bool = True,
        random_state: Optional[int] = None
    ) -> SensitivityResults:
        """
        Subsample sensitivity analysis.

        Randomly draws multiple subsamples and re-estimates the model on each
        to assess the stability of parameter estimates across different samples.

        Parameters
        ----------
        n_subsamples : int, default=20
            Number of random subsamples to draw
        subsample_size : float, default=0.8
            Fraction of entities to include in each subsample (0 < size < 1)
        stratify : bool, default=True
            Whether to stratify sampling to maintain temporal balance
        random_state : int, optional
            Random seed for reproducibility

        Returns
        -------
        SensitivityResults
            Results containing estimates for each subsample

        Notes
        -----
        Stratified sampling ensures each subsample maintains the same temporal
        structure by randomly selecting a fraction of entities while keeping
        all time periods for selected entities.

        Examples
        --------
        >>> sensitivity = pb.SensitivityAnalysis(results)
        >>> subset_results = sensitivity.subset_sensitivity(
        ...     n_subsamples=50,
        ...     subsample_size=0.75
        ... )
        >>> print(subset_results.statistics)
        """
        if not (0 < subsample_size < 1):
            raise ValueError("subsample_size must be between 0 and 1")

        if n_subsamples < 2:
            raise ValueError("n_subsamples must be at least 2")

        rng = np.random.RandomState(random_state)

        if self.show_progress:
            try:
                from tqdm import tqdm
                iterator = tqdm(range(n_subsamples), desc="Subsamples")
            except ImportError:
                iterator = range(n_subsamples)
                warnings.warn("Install tqdm for progress bars: pip install tqdm")
        else:
            iterator = range(n_subsamples)

        estimates_list = []
        std_errors_list = []
        subsample_info = []

        n_entities_subsample = max(2, int(self.n_entities * subsample_size))

        for i in iterator:
            # Sample entities
            sampled_entities = rng.choice(
                self.entities,
                size=n_entities_subsample,
                replace=False
            )

            # Create subsample
            subsample = self.data[
                self.data[self.entity_col].isin(sampled_entities)
            ].copy()

            try:
                # Refit model on subsample
                subsample_model = self._create_model(subsample)
                subsample_results = subsample_model.fit()

                estimates_list.append(subsample_results.params.values)
                std_errors_list.append(subsample_results.std_errors.values)

                subsample_info.append({
                    'subsample_id': i,
                    'n_entities': len(sampled_entities),
                    'n_obs': len(subsample),
                    'converged': True
                })

            except Exception as e:
                # If estimation fails, use NaN
                estimates_list.append(np.full(len(self.params), np.nan))
                std_errors_list.append(np.full(len(self.params), np.nan))

                subsample_info.append({
                    'subsample_id': i,
                    'n_entities': len(sampled_entities),
                    'n_obs': len(subsample),
                    'converged': False
                })

                if self.show_progress:
                    warnings.warn(f"Failed to estimate subsample {i}: {e}")

        # Convert to DataFrames
        estimates_df = pd.DataFrame(
            estimates_list,
            index=[f"subsample_{i}" for i in range(n_subsamples)],
            columns=self.params.index
        )

        std_errors_df = pd.DataFrame(
            std_errors_list,
            index=[f"subsample_{i}" for i in range(n_subsamples)],
            columns=self.params.index
        )

        subsample_info_df = pd.DataFrame(subsample_info)

        # Calculate statistics
        statistics = self._calculate_statistics(estimates_df, threshold=2.0)

        return SensitivityResults(
            method='subset_sensitivity',
            estimates=estimates_df,
            std_errors=std_errors_df,
            statistics=statistics,
            influential_units=[],  # Not applicable for subset analysis
            subsample_info=subsample_info_df
        )

    def plot_sensitivity(
        self,
        sensitivity_results: SensitivityResults,
        params: Optional[List[str]] = None,
        figsize: Tuple[float, float] = (12, 6),
        reference_line: bool = True,
        confidence_band: bool = True,
        **kwargs
    ) -> Figure:
        """
        Plot sensitivity analysis results.

        Creates visualization showing how parameter estimates vary across
        different subsamples or leave-one-out analyses.

        Parameters
        ----------
        sensitivity_results : SensitivityResults
            Results from sensitivity analysis
        params : List[str], optional
            List of parameters to plot. If None, plots all parameters
        figsize : Tuple[float, float], default=(12, 6)
            Figure size (width, height)
        reference_line : bool, default=True
            Whether to show reference line at original estimate
        confidence_band : bool, default=True
            Whether to show confidence band (mean ± 1.96 * std)
        **kwargs
            Additional keyword arguments passed to plt.subplots

        Returns
        -------
        Figure
            Matplotlib figure object

        Examples
        --------
        >>> sensitivity = pb.SensitivityAnalysis(results)
        >>> loo_results = sensitivity.leave_one_out_entities()
        >>> fig = sensitivity.plot_sensitivity(loo_results)
        >>> plt.show()
        """
        if not HAS_MATPLOTLIB:
            raise ImportError(
                "Matplotlib is required for plotting. "
                "Install it with: pip install matplotlib"
            )

        if params is None:
            params = list(self.params.index)

        n_params = len(params)

        # Create subplots
        fig, axes = plt.subplots(
            1, n_params,
            figsize=figsize,
            squeeze=False,
            **kwargs
        )
        axes = axes.flatten()

        for idx, param in enumerate(params):
            ax = axes[idx]

            # Get estimates for this parameter
            estimates = sensitivity_results.estimates[param].dropna()

            # Plot estimates
            x = range(len(estimates))
            ax.scatter(x, estimates, alpha=0.6, s=30)

            # Reference line (original estimate)
            if reference_line:
                original_value = self.params[param]
                ax.axhline(
                    original_value,
                    color='red',
                    linestyle='--',
                    linewidth=2,
                    label='Original',
                    alpha=0.7
                )

            # Confidence band
            if confidence_band:
                mean_est = estimates.mean()
                std_est = estimates.std()
                ax.axhline(mean_est, color='blue', linestyle='-', alpha=0.5, label='Mean')
                ax.fill_between(
                    x,
                    mean_est - 1.96 * std_est,
                    mean_est + 1.96 * std_est,
                    alpha=0.2,
                    color='blue',
                    label='95% Band'
                )

            ax.set_xlabel('Subsample/Exclusion')
            ax.set_ylabel('Estimate')
            ax.set_title(f'{param}')
            ax.legend(fontsize=8)
            ax.grid(True, alpha=0.3)

        # Overall title
        method_title = sensitivity_results.method.replace('_', ' ').title()
        fig.suptitle(f'Sensitivity Analysis: {method_title}', fontsize=14, y=1.02)

        plt.tight_layout()

        return fig

    def summary(
        self,
        sensitivity_results: SensitivityResults
    ) -> pd.DataFrame:
        """
        Generate summary table of sensitivity analysis results.

        Parameters
        ----------
        sensitivity_results : SensitivityResults
            Results from sensitivity analysis

        Returns
        -------
        pd.DataFrame
            Summary statistics for each parameter

        Examples
        --------
        >>> sensitivity = pb.SensitivityAnalysis(results)
        >>> loo_results = sensitivity.leave_one_out_entities()
        >>> summary = sensitivity.summary(loo_results)
        >>> print(summary)
        """
        estimates = sensitivity_results.estimates

        summary_data = []

        for param in estimates.columns:
            param_estimates = estimates[param].dropna()

            original = self.params[param]
            mean_est = param_estimates.mean()
            std_est = param_estimates.std()
            min_est = param_estimates.min()
            max_est = param_estimates.max()

            # Max deviation from original (in standard deviations)
            max_dev = np.abs(param_estimates - original).max()
            max_dev_std = max_dev / self.std_errors[param]

            summary_data.append({
                'Parameter': param,
                'Original': original,
                'Mean': mean_est,
                'Std': std_est,
                'Min': min_est,
                'Max': max_est,
                'Range': max_est - min_est,
                'Max Deviation': max_dev,
                'Max Dev (SE)': max_dev_std,
                'N Valid': len(param_estimates)
            })

        return pd.DataFrame(summary_data)

    def _create_model(self, data: pd.DataFrame):
        """Create a new model instance with given data."""
        # Get model class and parameters
        model_class = type(self.model)

        # Reconstruct model with new data
        formula = self.model.formula_parser.formula

        # Create new model
        new_model = model_class(
            formula=formula,
            data=data,
            entity_col=self.entity_col,
            time_col=self.time_col
        )

        return new_model

    def _calculate_statistics(
        self,
        estimates_df: pd.DataFrame,
        threshold: float
    ) -> Dict:
        """Calculate summary statistics from estimates."""
        statistics = {}

        for param in estimates_df.columns:
            param_estimates = estimates_df[param].dropna()

            original = self.params[param]
            original_se = self.std_errors[param]

            # Deviations from original
            deviations = param_estimates - original
            abs_deviations = np.abs(deviations)

            # Standardized deviations
            std_deviations = deviations / original_se

            statistics[param] = {
                'mean': param_estimates.mean(),
                'std': param_estimates.std(),
                'min': param_estimates.min(),
                'max': param_estimates.max(),
                'range': param_estimates.max() - param_estimates.min(),
                'max_abs_deviation': abs_deviations.max(),
                'mean_abs_deviation': abs_deviations.mean(),
                'max_std_deviation': np.abs(std_deviations).max(),
                'n_beyond_threshold': (np.abs(std_deviations) > threshold).sum(),
                'pct_beyond_threshold': (np.abs(std_deviations) > threshold).mean() * 100
            }

        return statistics

    def _identify_influential_units(
        self,
        estimates_df: pd.DataFrame,
        threshold: float
    ) -> List:
        """Identify influential units based on threshold."""
        influential = []

        for idx in estimates_df.index:
            # Check if any parameter estimate exceeds threshold
            is_influential = False

            for param in estimates_df.columns:
                estimate = estimates_df.loc[idx, param]

                if np.isnan(estimate):
                    continue

                original = self.params[param]
                original_se = self.std_errors[param]

                # Standardized deviation
                std_dev = np.abs(estimate - original) / original_se

                if std_dev > threshold:
                    is_influential = True
                    break

            if is_influential:
                influential.append(idx)

        return influential


def dfbetas(
    results: PanelResults,
    entity_col: Optional[str] = None,
    time_col: Optional[str] = None
) -> pd.DataFrame:
    """
    Calculate DFBETAS influence statistics.

    DFBETAS measures the change in parameter estimates when each observation
    is deleted, standardized by the standard error.

    Parameters
    ----------
    results : PanelResults
        Fitted panel model results
    entity_col : str, optional
        Entity column name (inferred from model if not provided)
    time_col : str, optional
        Time column name (inferred from model if not provided)

    Returns
    -------
    pd.DataFrame
        DFBETAS statistics for each observation and parameter

    Notes
    -----
    DFBETAS_i = (beta - beta_{-i}) / SE_{-i}

    where beta is the full sample estimate and beta_{-i} is the estimate
    with observation i deleted.

    Observations with |DFBETAS| > 2/sqrt(n) are considered influential.

    Examples
    --------
    >>> dfbetas_stats = pb.dfbetas(results)
    >>> influential = dfbetas_stats[dfbetas_stats.abs() > 2/np.sqrt(len(data))]
    >>> print(influential)
    """
    # This is a placeholder for future implementation
    # Full DFBETAS requires refitting N times (computationally expensive)
    raise NotImplementedError(
        "DFBETAS calculation not yet implemented. "
        "Use SensitivityAnalysis.leave_one_out_entities() for entity-level influence."
    )