panelbox-0.2.0-py3-none-any.whl → panelbox-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
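
panelbox/validation/robustness/sensitivity.py (new file; entry 32 above)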
@@ -0,0 +1,809 @@
+ """
+ Sensitivity Analysis for Panel Data Models.
+
+ This module provides tools for assessing the robustness of panel data estimation
+ results through various sensitivity analysis methods, including:
+ - Leave-one-out analysis (entities and periods)
+ - Subsample sensitivity analysis
+ - Visualization of sensitivity results
+
+ Author: PanelBox Development Team
+ Date: 2026-01-22
+ """
+
+ import numpy as np
+ import pandas as pd
+ import warnings
+ from typing import Optional, Union, Dict, List, Tuple, Callable
+ from dataclasses import dataclass
+
+ # Optional matplotlib import
+ try:
+     import matplotlib.pyplot as plt
+     from matplotlib.figure import Figure
+     HAS_MATPLOTLIB = True
+ except ImportError:
+     HAS_MATPLOTLIB = False
+     Figure = None
+
+ from panelbox.core.results import PanelResults
+
+
+ @dataclass
+ class SensitivityResults:
+     """
+     Container for sensitivity analysis results.
+
+     Attributes
+     ----------
+     method : str
+         Type of sensitivity analysis performed
+     estimates : pd.DataFrame
+         Parameter estimates for each subsample
+     std_errors : pd.DataFrame
+         Standard errors for each subsample
+     statistics : Dict
+         Summary statistics (max deviation, mean estimate, etc.)
+     influential_units : List
+         List of influential units (entities or periods)
+     subsample_info : pd.DataFrame
+         Information about each subsample used
+     """
+     method: str
+     estimates: pd.DataFrame
+     std_errors: pd.DataFrame
+     statistics: Dict
+     influential_units: List
+     subsample_info: pd.DataFrame
+
+
+ class SensitivityAnalysis:
+     """
+     Sensitivity Analysis for Panel Data Models.
+
+     This class provides comprehensive tools for assessing the sensitivity of
+     panel data model estimates to various changes in the sample composition.
+
+     Parameters
+     ----------
+     results : PanelResults
+         Fitted panel model results object
+     show_progress : bool, default=False
+         Whether to display a progress bar during estimation
+
+     Attributes
+     ----------
+     results : PanelResults
+         Original fitted model results
+     model : PanelModel
+         Original panel model object
+     params : pd.Series
+         Original parameter estimates
+     std_errors : pd.Series
+         Original standard errors
+
+     Examples
+     --------
+     >>> import panelbox as pb
+     >>>
+     >>> # Fit model
+     >>> fe = pb.FixedEffects("y ~ x1 + x2", data, "entity", "time")
+     >>> results = fe.fit()
+     >>>
+     >>> # Sensitivity analysis
+     >>> sensitivity = pb.SensitivityAnalysis(results)
+     >>>
+     >>> # Leave-one-out analysis
+     >>> loo_entities = sensitivity.leave_one_out_entities()
+     >>> loo_periods = sensitivity.leave_one_out_periods()
+     >>>
+     >>> # Subset sensitivity
+     >>> subset_results = sensitivity.subset_sensitivity(n_subsamples=20)
+     >>>
+     >>> # Visualize
+     >>> fig = sensitivity.plot_sensitivity(loo_entities)
+     >>> plt.show()
+     """
+
+     def __init__(
+         self,
+         results: PanelResults,
+         show_progress: bool = False
+     ):
+         """Initialize sensitivity analysis."""
+         self.results = results
+
+         # Get model from results
+         if results._model is None:
+             raise ValueError(
+                 "Results object must contain a reference to the original model. "
+                 "Ensure the model stores a reference to itself in results._model"
+             )
+
+         self.model = results._model
+         self.params = results.params
+         self.std_errors = results.std_errors
+         self.show_progress = show_progress
+
+         # Store original data info
+         self.entity_col = self.model.data.entity_col
+         self.time_col = self.model.data.time_col
+         self.data = self.model.data.data.copy()
+
+         # Get unique entities and time periods
+         self.entities = sorted(self.data[self.entity_col].unique())
+         self.time_periods = sorted(self.data[self.time_col].unique())
+
+         self.n_entities = len(self.entities)
+         self.n_periods = len(self.time_periods)
+
+     def leave_one_out_entities(
+         self,
+         influence_threshold: float = 2.0
+     ) -> SensitivityResults:
+         """
+         Leave-one-out analysis by entities.
+
+         Removes one entity at a time and re-estimates the model to assess
+         the influence of each entity on parameter estimates.
+
+         Parameters
+         ----------
+         influence_threshold : float, default=2.0
+             Threshold for identifying influential entities, measured in units
+             of the original standard errors
+
+         Returns
+         -------
+         SensitivityResults
+             Results containing estimates for each entity left out
+
+         Notes
+         -----
+         An entity is considered influential if removing it causes a parameter
+         estimate to deviate by more than `influence_threshold` standard errors
+         from the original estimate.
+
+         Examples
+         --------
+         >>> sensitivity = pb.SensitivityAnalysis(results)
+         >>> loo_results = sensitivity.leave_one_out_entities()
+         >>> print(loo_results.statistics)
+         >>> print(loo_results.influential_units)
+         """
+         if self.show_progress:
+             try:
+                 from tqdm import tqdm
+                 iterator = tqdm(self.entities, desc="LOO Entities")
+             except ImportError:
+                 iterator = self.entities
+                 warnings.warn("Install tqdm for progress bars: pip install tqdm")
+         else:
+             iterator = self.entities
+
+         estimates_list = []
+         std_errors_list = []
+         subsample_info = []
+
+         for entity in iterator:
+             # Create subsample excluding this entity
+             subsample = self.data[self.data[self.entity_col] != entity].copy()
+
+             try:
+                 # Refit model on subsample
+                 subsample_model = self._create_model(subsample)
+                 subsample_results = subsample_model.fit()
+
+                 estimates_list.append(subsample_results.params.values)
+                 std_errors_list.append(subsample_results.std_errors.values)
+
+                 subsample_info.append({
+                     'excluded': entity,
+                     'n_obs': len(subsample),
+                     'converged': True
+                 })
+
+             except Exception as e:
+                 # If estimation fails, use NaN
+                 estimates_list.append(np.full(len(self.params), np.nan))
+                 std_errors_list.append(np.full(len(self.params), np.nan))
+
+                 subsample_info.append({
+                     'excluded': entity,
+                     'n_obs': len(subsample),
+                     'converged': False
+                 })
+
+                 if self.show_progress:
+                     warnings.warn(f"Failed to estimate without entity {entity}: {e}")
+
+         # Convert to DataFrames
+         estimates_df = pd.DataFrame(
+             estimates_list,
+             index=[f"excl_{e}" for e in self.entities],
+             columns=self.params.index
+         )
+
+         std_errors_df = pd.DataFrame(
+             std_errors_list,
+             index=[f"excl_{e}" for e in self.entities],
+             columns=self.params.index
+         )
+
+         subsample_info_df = pd.DataFrame(subsample_info)
+
+         # Calculate statistics
+         statistics = self._calculate_statistics(
+             estimates_df,
+             influence_threshold
+         )
+
+         # Identify influential entities
+         influential_units = self._identify_influential_units(
+             estimates_df,
+             influence_threshold
+         )
+
+         return SensitivityResults(
+             method='leave_one_out_entities',
+             estimates=estimates_df,
+             std_errors=std_errors_df,
+             statistics=statistics,
+             influential_units=influential_units,
+             subsample_info=subsample_info_df
+         )
+
+     def leave_one_out_periods(
+         self,
+         influence_threshold: float = 2.0
+     ) -> SensitivityResults:
+         """
+         Leave-one-out analysis by time periods.
+
+         Removes one time period at a time and re-estimates the model to assess
+         the influence of each time period on parameter estimates.
+
+         Parameters
+         ----------
+         influence_threshold : float, default=2.0
+             Threshold for identifying influential periods, measured in units
+             of the original standard errors
+
+         Returns
+         -------
+         SensitivityResults
+             Results containing estimates for each period left out
+
+         Notes
+         -----
+         A time period is considered influential if removing it causes a
+         parameter estimate to deviate by more than `influence_threshold`
+         standard errors from the original estimate.
+
+         Examples
+         --------
+         >>> sensitivity = pb.SensitivityAnalysis(results)
+         >>> loo_results = sensitivity.leave_one_out_periods()
+         >>> print(loo_results.statistics)
+         >>> print(loo_results.influential_units)
+         """
+         if self.show_progress:
+             try:
+                 from tqdm import tqdm
+                 iterator = tqdm(self.time_periods, desc="LOO Periods")
+             except ImportError:
+                 iterator = self.time_periods
+                 warnings.warn("Install tqdm for progress bars: pip install tqdm")
+         else:
+             iterator = self.time_periods
+
+         estimates_list = []
+         std_errors_list = []
+         subsample_info = []
+
+         for period in iterator:
+             # Create subsample excluding this period
+             subsample = self.data[self.data[self.time_col] != period].copy()
+
+             try:
+                 # Refit model on subsample
+                 subsample_model = self._create_model(subsample)
+                 subsample_results = subsample_model.fit()
+
+                 estimates_list.append(subsample_results.params.values)
+                 std_errors_list.append(subsample_results.std_errors.values)
+
+                 subsample_info.append({
+                     'excluded': period,
+                     'n_obs': len(subsample),
+                     'converged': True
+                 })
+
+             except Exception as e:
+                 # If estimation fails, use NaN
+                 estimates_list.append(np.full(len(self.params), np.nan))
+                 std_errors_list.append(np.full(len(self.params), np.nan))
+
+                 subsample_info.append({
+                     'excluded': period,
+                     'n_obs': len(subsample),
+                     'converged': False
+                 })
+
+                 if self.show_progress:
+                     warnings.warn(f"Failed to estimate without period {period}: {e}")
+
+         # Convert to DataFrames
+         estimates_df = pd.DataFrame(
+             estimates_list,
+             index=[f"excl_{t}" for t in self.time_periods],
+             columns=self.params.index
+         )
+
+         std_errors_df = pd.DataFrame(
+             std_errors_list,
+             index=[f"excl_{t}" for t in self.time_periods],
+             columns=self.params.index
+         )
+
+         subsample_info_df = pd.DataFrame(subsample_info)
+
+         # Calculate statistics
+         statistics = self._calculate_statistics(
+             estimates_df,
+             influence_threshold
+         )
+
+         # Identify influential periods
+         influential_units = self._identify_influential_units(
+             estimates_df,
+             influence_threshold
+         )
+
+         return SensitivityResults(
+             method='leave_one_out_periods',
+             estimates=estimates_df,
+             std_errors=std_errors_df,
+             statistics=statistics,
+             influential_units=influential_units,
+             subsample_info=subsample_info_df
+         )
+
+     def subset_sensitivity(
+         self,
+         n_subsamples: int = 20,
+         subsample_size: float = 0.8,
+         stratify: bool = True,
+         random_state: Optional[int] = None
+     ) -> SensitivityResults:
+         """
+         Subsample sensitivity analysis.
+
+         Randomly draws multiple subsamples and re-estimates the model on each
+         to assess the stability of parameter estimates across different samples.
+
+         Parameters
+         ----------
+         n_subsamples : int, default=20
+             Number of random subsamples to draw
+         subsample_size : float, default=0.8
+             Fraction of entities to include in each subsample (0 < size < 1)
+         stratify : bool, default=True
+             Whether to stratify sampling to maintain temporal balance
+         random_state : int, optional
+             Random seed for reproducibility
+
+         Returns
+         -------
+         SensitivityResults
+             Results containing estimates for each subsample
+
+         Notes
+         -----
+         Stratified sampling ensures each subsample maintains the same temporal
+         structure by randomly selecting a fraction of entities while keeping
+         all time periods for selected entities.
+
+         Examples
+         --------
+         >>> sensitivity = pb.SensitivityAnalysis(results)
+         >>> subset_results = sensitivity.subset_sensitivity(
+         ...     n_subsamples=50,
+         ...     subsample_size=0.75
+         ... )
+         >>> print(subset_results.statistics)
+         """
+         if not (0 < subsample_size < 1):
+             raise ValueError("subsample_size must be between 0 and 1")
+
+         if n_subsamples < 2:
+             raise ValueError("n_subsamples must be at least 2")
+
+         rng = np.random.RandomState(random_state)
+
+         if self.show_progress:
+             try:
+                 from tqdm import tqdm
+                 iterator = tqdm(range(n_subsamples), desc="Subsamples")
+             except ImportError:
+                 iterator = range(n_subsamples)
+                 warnings.warn("Install tqdm for progress bars: pip install tqdm")
+         else:
+             iterator = range(n_subsamples)
+
+         estimates_list = []
+         std_errors_list = []
+         subsample_info = []
+
+         n_entities_subsample = max(2, int(self.n_entities * subsample_size))
+
+         for i in iterator:
+             # Sample entities. Sampling is always at the entity level, keeping
+             # every time period for each selected entity, so the temporal
+             # structure described under `stratify` is preserved by construction
+             # (the flag itself is not consulted in the current implementation).
+             sampled_entities = rng.choice(
+                 self.entities,
+                 size=n_entities_subsample,
+                 replace=False
+             )
+
+             # Create subsample
+             subsample = self.data[
+                 self.data[self.entity_col].isin(sampled_entities)
+             ].copy()
+
+             try:
+                 # Refit model on subsample
+                 subsample_model = self._create_model(subsample)
+                 subsample_results = subsample_model.fit()
+
+                 estimates_list.append(subsample_results.params.values)
+                 std_errors_list.append(subsample_results.std_errors.values)
+
+                 subsample_info.append({
+                     'subsample_id': i,
+                     'n_entities': len(sampled_entities),
+                     'n_obs': len(subsample),
+                     'converged': True
+                 })
+
+             except Exception as e:
+                 # If estimation fails, use NaN
+                 estimates_list.append(np.full(len(self.params), np.nan))
+                 std_errors_list.append(np.full(len(self.params), np.nan))
+
+                 subsample_info.append({
+                     'subsample_id': i,
+                     'n_entities': len(sampled_entities),
+                     'n_obs': len(subsample),
+                     'converged': False
+                 })
+
+                 if self.show_progress:
+                     warnings.warn(f"Failed to estimate subsample {i}: {e}")
+
+         # Convert to DataFrames
+         estimates_df = pd.DataFrame(
+             estimates_list,
+             index=[f"subsample_{i}" for i in range(n_subsamples)],
+             columns=self.params.index
+         )
+
+         std_errors_df = pd.DataFrame(
+             std_errors_list,
+             index=[f"subsample_{i}" for i in range(n_subsamples)],
+             columns=self.params.index
+         )
+
+         subsample_info_df = pd.DataFrame(subsample_info)
+
+         # Calculate statistics
+         statistics = self._calculate_statistics(estimates_df, threshold=2.0)
+
+         return SensitivityResults(
+             method='subset_sensitivity',
+             estimates=estimates_df,
+             std_errors=std_errors_df,
+             statistics=statistics,
+             influential_units=[],  # Not applicable for subset analysis
+             subsample_info=subsample_info_df
+         )
+
+     def plot_sensitivity(
+         self,
+         sensitivity_results: SensitivityResults,
+         params: Optional[List[str]] = None,
+         figsize: Tuple[float, float] = (12, 6),
+         reference_line: bool = True,
+         confidence_band: bool = True,
+         **kwargs
+     ) -> Figure:
+         """
+         Plot sensitivity analysis results.
+
+         Creates a visualization showing how parameter estimates vary across
+         different subsamples or leave-one-out analyses.
+
+         Parameters
+         ----------
+         sensitivity_results : SensitivityResults
+             Results from sensitivity analysis
+         params : List[str], optional
+             List of parameters to plot. If None, plots all parameters
+         figsize : Tuple[float, float], default=(12, 6)
+             Figure size (width, height)
+         reference_line : bool, default=True
+             Whether to show a reference line at the original estimate
+         confidence_band : bool, default=True
+             Whether to show a confidence band (mean ± 1.96 * std)
+         **kwargs
+             Additional keyword arguments passed to plt.subplots
+
+         Returns
+         -------
+         Figure
+             Matplotlib figure object
+
+         Examples
+         --------
+         >>> sensitivity = pb.SensitivityAnalysis(results)
+         >>> loo_results = sensitivity.leave_one_out_entities()
+         >>> fig = sensitivity.plot_sensitivity(loo_results)
+         >>> plt.show()
+         """
+         if not HAS_MATPLOTLIB:
+             raise ImportError(
+                 "Matplotlib is required for plotting. "
+                 "Install it with: pip install matplotlib"
+             )
+
+         if params is None:
+             params = list(self.params.index)
+
+         n_params = len(params)
+
+         # Create subplots
+         fig, axes = plt.subplots(
+             1, n_params,
+             figsize=figsize,
+             squeeze=False,
+             **kwargs
+         )
+         axes = axes.flatten()
+
+         for idx, param in enumerate(params):
+             ax = axes[idx]
+
+             # Get estimates for this parameter
+             estimates = sensitivity_results.estimates[param].dropna()
+
+             # Plot estimates
+             x = range(len(estimates))
+             ax.scatter(x, estimates, alpha=0.6, s=30)
+
+             # Reference line (original estimate)
+             if reference_line:
+                 original_value = self.params[param]
+                 ax.axhline(
+                     original_value,
+                     color='red',
+                     linestyle='--',
+                     linewidth=2,
+                     label='Original',
+                     alpha=0.7
+                 )
+
+             # Confidence band
+             if confidence_band:
+                 mean_est = estimates.mean()
+                 std_est = estimates.std()
+                 ax.axhline(mean_est, color='blue', linestyle='-', alpha=0.5, label='Mean')
+                 ax.fill_between(
+                     x,
+                     mean_est - 1.96 * std_est,
+                     mean_est + 1.96 * std_est,
+                     alpha=0.2,
+                     color='blue',
+                     label='95% Band'
+                 )
+
+             ax.set_xlabel('Subsample/Exclusion')
+             ax.set_ylabel('Estimate')
+             ax.set_title(f'{param}')
+             ax.legend(fontsize=8)
+             ax.grid(True, alpha=0.3)
+
+         # Overall title
+         method_title = sensitivity_results.method.replace('_', ' ').title()
+         fig.suptitle(f'Sensitivity Analysis: {method_title}', fontsize=14, y=1.02)
+
+         plt.tight_layout()
+
+         return fig
+
+     def summary(
+         self,
+         sensitivity_results: SensitivityResults
+     ) -> pd.DataFrame:
+         """
+         Generate a summary table of sensitivity analysis results.
+
+         Parameters
+         ----------
+         sensitivity_results : SensitivityResults
+             Results from sensitivity analysis
+
+         Returns
+         -------
+         pd.DataFrame
+             Summary statistics for each parameter
+
+         Examples
+         --------
+         >>> sensitivity = pb.SensitivityAnalysis(results)
+         >>> loo_results = sensitivity.leave_one_out_entities()
+         >>> summary = sensitivity.summary(loo_results)
+         >>> print(summary)
+         """
+         estimates = sensitivity_results.estimates
+
+         summary_data = []
+
+         for param in estimates.columns:
+             param_estimates = estimates[param].dropna()
+
+             original = self.params[param]
+             mean_est = param_estimates.mean()
+             std_est = param_estimates.std()
+             min_est = param_estimates.min()
+             max_est = param_estimates.max()
+
+             # Max deviation from the original estimate, also expressed in
+             # units of the original standard error
+             max_dev = np.abs(param_estimates - original).max()
+             max_dev_std = max_dev / self.std_errors[param]
+
+             summary_data.append({
+                 'Parameter': param,
+                 'Original': original,
+                 'Mean': mean_est,
+                 'Std': std_est,
+                 'Min': min_est,
+                 'Max': max_est,
+                 'Range': max_est - min_est,
+                 'Max Deviation': max_dev,
+                 'Max Dev (SE)': max_dev_std,
+                 'N Valid': len(param_estimates)
+             })
+
+         return pd.DataFrame(summary_data)
+
+     def _create_model(self, data: pd.DataFrame):
+         """Create a new model instance with the given data."""
+         # Get model class and parameters
+         model_class = type(self.model)
+
+         # Reconstruct model with new data
+         formula = self.model.formula_parser.formula
+
+         # Create new model
+         new_model = model_class(
+             formula=formula,
+             data=data,
+             entity_col=self.entity_col,
+             time_col=self.time_col
+         )
+
+         return new_model
+
+     def _calculate_statistics(
+         self,
+         estimates_df: pd.DataFrame,
+         threshold: float
+     ) -> Dict:
+         """Calculate summary statistics from estimates."""
+         statistics = {}
+
+         for param in estimates_df.columns:
+             param_estimates = estimates_df[param].dropna()
+
+             original = self.params[param]
+             original_se = self.std_errors[param]
+
+             # Deviations from original
+             deviations = param_estimates - original
+             abs_deviations = np.abs(deviations)
+
+             # Standardized deviations
+             std_deviations = deviations / original_se
+
+             statistics[param] = {
+                 'mean': param_estimates.mean(),
+                 'std': param_estimates.std(),
+                 'min': param_estimates.min(),
+                 'max': param_estimates.max(),
+                 'range': param_estimates.max() - param_estimates.min(),
+                 'max_abs_deviation': abs_deviations.max(),
+                 'mean_abs_deviation': abs_deviations.mean(),
+                 'max_std_deviation': np.abs(std_deviations).max(),
+                 'n_beyond_threshold': (np.abs(std_deviations) > threshold).sum(),
+                 'pct_beyond_threshold': (np.abs(std_deviations) > threshold).mean() * 100
+             }
+
+         return statistics
+
+     def _identify_influential_units(
+         self,
+         estimates_df: pd.DataFrame,
+         threshold: float
+     ) -> List:
+         """Identify influential units based on threshold."""
+         influential = []
+
+         for idx in estimates_df.index:
+             # Check if any parameter estimate exceeds the threshold
+             is_influential = False
+
+             for param in estimates_df.columns:
+                 estimate = estimates_df.loc[idx, param]
+
+                 if np.isnan(estimate):
+                     continue
+
+                 original = self.params[param]
+                 original_se = self.std_errors[param]
+
+                 # Standardized deviation
+                 std_dev = np.abs(estimate - original) / original_se
+
+                 if std_dev > threshold:
+                     is_influential = True
+                     break
+
+             if is_influential:
+                 influential.append(idx)
+
+         return influential
+
+
+ def dfbetas(
+     results: PanelResults,
+     entity_col: Optional[str] = None,
+     time_col: Optional[str] = None
+ ) -> pd.DataFrame:
+     """
+     Calculate DFBETAS influence statistics.
+
+     DFBETAS measures the change in parameter estimates when each observation
+     is deleted, standardized by the standard error.
+
+     Parameters
+     ----------
+     results : PanelResults
+         Fitted panel model results
+     entity_col : str, optional
+         Entity column name (inferred from model if not provided)
+     time_col : str, optional
+         Time column name (inferred from model if not provided)
+
+     Returns
+     -------
+     pd.DataFrame
+         DFBETAS statistics for each observation and parameter
+
+     Notes
+     -----
+     DFBETAS_i = (beta - beta_{-i}) / SE_{-i}
+
+     where beta is the full-sample estimate and beta_{-i} is the estimate
+     with observation i deleted.
+
+     Observations with |DFBETAS| > 2/sqrt(n) are considered influential.
+
+     Examples
+     --------
+     >>> dfbetas_stats = pb.dfbetas(results)
+     >>> influential = dfbetas_stats[dfbetas_stats.abs() > 2/np.sqrt(len(data))]
+     >>> print(influential)
+     """
+     # This is a placeholder for a future implementation.
+     # Full DFBETAS requires refitting the model N times (computationally expensive).
+     raise NotImplementedError(
+         "DFBETAS calculation not yet implemented. "
+         "Use SensitivityAnalysis.leave_one_out_entities() for entity-level influence."
+     )
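
For orientation, here is a minimal end-to-end sketch of the sensitivity API added in this release, assembled from the module's own docstring examples. It is not part of the package diff: the DataFrame df and the columns entity, time, y, x1, and x2 are illustrative assumptions, and the DFBETAS-style statistic at the end approximates the deleted-sample SE with the full-sample standard errors, since pb.dfbetas itself is still a placeholder.

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import panelbox as pb

    # Hypothetical panel; substitute your own data.
    df = pd.read_csv("panel.csv")  # columns: entity, time, y, x1, x2

    fe = pb.FixedEffects("y ~ x1 + x2", df, "entity", "time")
    results = fe.fit()

    sensitivity = pb.SensitivityAnalysis(results, show_progress=True)

    # Re-estimate with each entity, then each period, left out.
    loo_entities = sensitivity.leave_one_out_entities(influence_threshold=2.0)
    loo_periods = sensitivity.leave_one_out_periods()
    print(loo_entities.influential_units)

    # Stability across 50 random subsamples of 75% of entities.
    subset_results = sensitivity.subset_sensitivity(
        n_subsamples=50, subsample_size=0.75, random_state=42
    )
    print(sensitivity.summary(subset_results))

    # DFBETAS-style statistics per left-out entity, using full-sample SEs
    # in place of the deleted-sample SEs from the docstring formula.
    dfbetas_like = (results.params - loo_entities.estimates).div(results.std_errors)
    print(dfbetas_like[dfbetas_like.abs() > 2 / np.sqrt(len(df))].dropna(how="all"))

    # Plot (requires matplotlib).
    fig = sensitivity.plot_sensitivity(loo_entities)
    plt.show()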