panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,933 @@
1
+ """
2
+ Bootstrap inference for panel data models.
3
+
4
+ This module implements various bootstrap methods for panel data, including:
5
+ - Pairs bootstrap (entity resampling)
6
+ - Wild bootstrap (heteroskedasticity-robust)
7
+ - Block bootstrap (time series dependence)
8
+ - Residual bootstrap (i.i.d. errors)
9
+
10
+ References
11
+ ----------
12
+ Cameron, A. C., & Trivedi, P. K. (2005). Microeconometrics: Methods and Applications.
13
+ Efron, B., & Tibshirani, R. J. (1994). An Introduction to the Bootstrap.
14
+ """
15
+
16
+ from typing import Optional, Union, Literal, Tuple
17
+ import warnings
18
+ import numpy as np
19
+ import pandas as pd
20
+ from scipy import stats
21
+ from tqdm import tqdm
22
+
23
+ from panelbox.core.results import PanelResults
24
+
25
+
26
+ class PanelBootstrap:
27
+ """
28
+ Bootstrap inference for panel data models.
29
+
30
+ This class implements various bootstrap methods adapted for panel data structure.
31
+ Bootstrap resampling provides an alternative to asymptotic inference, particularly
32
+ useful for small samples or complex dependence structures.
33
+
34
+ Parameters
35
+ ----------
36
+ results : PanelResults
37
+ Fitted model results to bootstrap
38
+ n_bootstrap : int, default=1000
39
+ Number of bootstrap replications
40
+ method : {'pairs', 'wild', 'block', 'residual'}, default='pairs'
41
+ Bootstrap method to use:
42
+
43
+ - 'pairs': Resample entire entities (recommended for most cases)
44
+ - 'wild': Keep X fixed, resample residuals with random weights
45
+ - 'block': Resample blocks of time periods
46
+ - 'residual': Resample residuals (assumes i.i.d. errors)
47
+ block_size : int, optional
48
+ Block size for block bootstrap. If None, uses rule-of-thumb: T^(1/3)
49
+ random_state : int, optional
50
+ Random seed for reproducibility
51
+ show_progress : bool, default=True
52
+ Show progress bar during bootstrap
53
+ parallel : bool, default=False
54
+ Use parallel processing (not yet implemented)
55
+
56
+ Attributes
57
+ ----------
58
+ bootstrap_estimates_ : np.ndarray
59
+ Bootstrap coefficient estimates (n_bootstrap x n_params)
60
+ bootstrap_se_ : np.ndarray
61
+ Bootstrap standard errors for each parameter
62
+ bootstrap_t_stats_ : np.ndarray
63
+ Bootstrap t-statistics (for studentized bootstrap)
64
+ n_failed_ : int
65
+ Number of failed bootstrap replications
66
+
67
+ Examples
68
+ --------
69
+ >>> import panelbox as pb
70
+ >>> import pandas as pd
71
+ >>>
72
+ >>> # Fit a fixed effects model
73
+ >>> data = pd.read_csv('panel_data.csv')
74
+ >>> fe = pb.FixedEffects("y ~ x1 + x2", data, "id", "time")
75
+ >>> results = fe.fit()
76
+ >>>
77
+ >>> # Bootstrap with pairs method (recommended)
78
+ >>> bootstrap = pb.PanelBootstrap(
79
+ ... results,
80
+ ... n_bootstrap=1000,
81
+ ... method='pairs',
82
+ ... random_state=42
83
+ ... )
84
+ >>> bootstrap.run()
85
+ >>>
86
+ >>> # Get bootstrap confidence intervals
87
+ >>> ci = bootstrap.conf_int(alpha=0.05, method='percentile')
88
+ >>> print(ci)
89
+ >>>
90
+ >>> # Compare with asymptotic CI
91
+ >>> ci_asymp = results.conf_int(alpha=0.05)
92
+ >>> print(ci_asymp)
93
+ >>>
94
+ >>> # Get bootstrap standard errors
95
+ >>> se_boot = bootstrap.bootstrap_se_
96
+ >>> se_asymp = results.std_errors
97
+ >>> comparison = pd.DataFrame({
98
+ ... 'Bootstrap SE': se_boot,
99
+ ... 'Asymptotic SE': se_asymp
100
+ ... }, index=results.params.index)
101
+ >>> print(comparison)
102
+
103
+ Notes
104
+ -----
105
+ **Choosing a Bootstrap Method:**
106
+
107
+ 1. **Pairs Bootstrap** (default, recommended):
108
+ - Resamples entire entities (with all their time periods)
109
+ - Preserves within-entity correlation structure
110
+ - Robust to heteroskedasticity and serial correlation
111
+ - Use when entities are independent
112
+
113
+ 2. **Wild Bootstrap**:
114
+ - Keeps X fixed, resamples residuals with random weights
115
+ - Specifically designed for heteroskedasticity
116
+ - Does not account for serial correlation
117
+ - Use when heteroskedasticity is primary concern
118
+
119
+ 3. **Block Bootstrap**:
120
+ - Resamples blocks of consecutive time periods
121
+ - Preserves temporal dependence within blocks
122
+ - Use when time-series dependence is important
123
+
124
+ 4. **Residual Bootstrap**:
125
+ - Resamples residuals assuming i.i.d. errors
126
+ - Most restrictive assumptions
127
+ - Use only when you're confident errors are i.i.d.
128
+
129
+ **Number of Replications:**
130
+
131
+ - For standard errors: n_bootstrap >= 500
132
+ - For confidence intervals: n_bootstrap >= 1000
133
+ - For hypothesis testing: n_bootstrap >= 2000
134
+
135
+ References
136
+ ----------
137
+ Cameron, A. C., & Trivedi, P. K. (2005). "Microeconometrics: Methods and
138
+ Applications." Cambridge University Press, Chapter 11.
139
+
140
+ Efron, B., & Tibshirani, R. J. (1994). "An Introduction to the Bootstrap."
141
+ Chapman and Hall/CRC.
142
+
143
+ Cameron, A. C., Gelbach, J. B., & Miller, D. L. (2008). "Bootstrap-based
144
+ improvements for inference with clustered errors." *The Review of Economics
145
+ and Statistics*, 90(3), 414-427.
146
+ """
147
+
148
def __init__(
    self,
    results: PanelResults,
    n_bootstrap: int = 1000,
    method: Literal['pairs', 'wild', 'block', 'residual'] = 'pairs',
    block_size: Optional[int] = None,
    random_state: Optional[int] = None,
    show_progress: bool = True,
    parallel: bool = False
):
    """
    Validate the configuration and prepare an unfitted bootstrap.

    No resampling happens here; call ``run()`` to execute it.

    Parameters
    ----------
    results : PanelResults
        Fitted results; the estimating model is read from ``results._model``.
    n_bootstrap : int, default=1000
        Number of bootstrap replications (at least 2).
    method : {'pairs', 'wild', 'block', 'residual'}, default='pairs'
        Resampling scheme that ``run()`` dispatches to.
    block_size : int, optional
        Block length for the block bootstrap; must be >= 1 when given.
    random_state : int, optional
        Seed for the private ``np.random.RandomState`` stored on ``self.rng``.
    show_progress : bool, default=True
        Whether the resampling loops wrap their iterator in a progress bar.
    parallel : bool, default=False
        Accepted but not implemented; only triggers a warning.

    Raises
    ------
    TypeError
        If ``results`` is not a ``PanelResults`` instance.
    ValueError
        If ``n_bootstrap`` < 2, ``method`` is unknown, ``block_size`` is not
        positive, or ``results`` carries no model to refit.
    """
    if not isinstance(results, PanelResults):
        raise TypeError(
            f"results must be PanelResults, got {type(results)}"
        )

    # The bootstrap SE is a sample standard deviation with ddof=1, which is
    # undefined for fewer than two replications.
    if n_bootstrap < 2:
        raise ValueError(
            f"n_bootstrap must be at least 2, got {n_bootstrap}"
        )

    if n_bootstrap < 100:
        warnings.warn(
            f"n_bootstrap={n_bootstrap} is quite small. "
            "Recommend at least 500 for standard errors, 1000 for confidence intervals.",
            UserWarning,
            stacklevel=2
        )

    valid_methods = ['pairs', 'wild', 'block', 'residual']
    if method not in valid_methods:
        raise ValueError(
            f"method must be one of {valid_methods}, got '{method}'"
        )

    if block_size is not None and block_size < 1:
        raise ValueError(
            f"block_size must be a positive integer, got {block_size}"
        )

    # getattr keeps the documented ValueError even when the results object
    # has no ``_model`` attribute at all (instead of an AttributeError).
    model = getattr(results, '_model', None)
    if model is None:
        raise ValueError(
            "Bootstrap requires access to the original model. "
            "Ensure the model stores a reference to itself in results._model"
        )

    # Store inputs
    self.results = results
    self.n_bootstrap = n_bootstrap
    self.method = method
    self.block_size = block_size
    self.random_state = random_state
    self.show_progress = show_progress
    self.parallel = parallel
    self.model = model

    # Private generator so seeding does not touch NumPy's global state
    self.rng = np.random.RandomState(random_state)

    if parallel:
        warnings.warn(
            "Parallel processing not yet implemented. Running sequentially.",
            UserWarning,
            stacklevel=2
        )

    # Results storage (populated by run())
    self.bootstrap_estimates_: Optional[np.ndarray] = None
    self.bootstrap_se_: Optional[np.ndarray] = None
    self.bootstrap_t_stats_: Optional[np.ndarray] = None
    self.n_failed_: int = 0
    self._fitted = False
211
+
212
+ def run(self) -> 'PanelBootstrap':
213
+ """
214
+ Run bootstrap procedure.
215
+
216
+ Performs bootstrap resampling according to the specified method
217
+ and stores results.
218
+
219
+ Returns
220
+ -------
221
+ self : PanelBootstrap
222
+ Returns self for method chaining
223
+
224
+ Raises
225
+ ------
226
+ ValueError
227
+ If bootstrap method is not recognized
228
+ RuntimeError
229
+ If too many bootstrap replications fail
230
+ """
231
+ # Dispatch to appropriate method
232
+ if self.method == 'pairs':
233
+ estimates = self._bootstrap_pairs()
234
+ elif self.method == 'wild':
235
+ estimates = self._bootstrap_wild()
236
+ elif self.method == 'block':
237
+ estimates = self._bootstrap_block()
238
+ elif self.method == 'residual':
239
+ estimates = self._bootstrap_residual()
240
+ else:
241
+ raise ValueError(f"Unknown bootstrap method: {self.method}")
242
+
243
+ # Store results
244
+ self.bootstrap_estimates_ = estimates
245
+
246
+ # Compute bootstrap standard errors
247
+ self.bootstrap_se_ = np.std(estimates, axis=0, ddof=1)
248
+
249
+ # Compute studentized statistics (for advanced CI methods)
250
+ # t_b = (θ_b - θ_hat) / se(θ_b)
251
+ theta_hat = self.results.params.values
252
+ self.bootstrap_t_stats_ = (estimates - theta_hat) / self.bootstrap_se_
253
+
254
+ self._fitted = True
255
+
256
+ # Warn if many failures
257
+ if self.n_failed_ > self.n_bootstrap * 0.1:
258
+ warnings.warn(
259
+ f"{self.n_failed_} out of {self.n_bootstrap} bootstrap replications failed "
260
+ f"({self.n_failed_/self.n_bootstrap*100:.1f}%). "
261
+ "Results may be unreliable. Consider using a different method or "
262
+ "checking your model specification.",
263
+ UserWarning
264
+ )
265
+
266
+ return self
267
+
268
+ def _bootstrap_pairs(self) -> np.ndarray:
269
+ """
270
+ Pairs (entity) bootstrap.
271
+
272
+ Resamples entire entities with replacement. This preserves the
273
+ within-entity correlation structure and is robust to both
274
+ heteroskedasticity and serial correlation within entities.
275
+
276
+ Returns
277
+ -------
278
+ estimates : np.ndarray
279
+ Bootstrap coefficient estimates (n_bootstrap x n_params)
280
+ """
281
+ # Get entity IDs
282
+ data_df = self.model.data.data
283
+ entity_col = self.model.data.entity_col
284
+ entities = data_df[entity_col].unique()
285
+ n_entities = len(entities)
286
+
287
+ # Storage for estimates
288
+ n_params = len(self.results.params)
289
+ estimates = np.zeros((self.n_bootstrap, n_params))
290
+
291
+ # Bootstrap loop
292
+ iterator = range(self.n_bootstrap)
293
+ if self.show_progress:
294
+ iterator = tqdm(iterator, desc=f"Bootstrap ({self.method})")
295
+
296
+ for b in iterator:
297
+ try:
298
+ # Resample entities with replacement
299
+ boot_entities = self.rng.choice(entities, size=n_entities, replace=True)
300
+
301
+ # Create bootstrap sample by stacking selected entities
302
+ boot_data_list = []
303
+ for entity in boot_entities:
304
+ entity_data = data_df[data_df[entity_col] == entity].copy()
305
+ boot_data_list.append(entity_data)
306
+
307
+ boot_data = pd.concat(boot_data_list, ignore_index=True)
308
+
309
+ # Refit model on bootstrap sample
310
+ # We need to create a new model instance with bootstrap data
311
+ boot_model = self._create_bootstrap_model(boot_data)
312
+ boot_results = boot_model.fit()
313
+
314
+ # Store estimates
315
+ estimates[b, :] = boot_results.params.values
316
+
317
+ except Exception as e:
318
+ # If estimation fails, use NaN
319
+ estimates[b, :] = np.nan
320
+ self.n_failed_ += 1
321
+
322
+ if self.show_progress and self.n_failed_ <= 5:
323
+ # Print first few failures for debugging
324
+ print(f"\nBootstrap iteration {b} failed: {str(e)}")
325
+
326
+ # Remove failed replications
327
+ valid_mask = ~np.isnan(estimates).any(axis=1)
328
+ estimates = estimates[valid_mask, :]
329
+
330
+ if estimates.shape[0] < self.n_bootstrap * 0.5:
331
+ raise RuntimeError(
332
+ f"More than 50% of bootstrap replications failed. "
333
+ f"Only {estimates.shape[0]} out of {self.n_bootstrap} succeeded. "
334
+ "Check your model specification."
335
+ )
336
+
337
+ return estimates
338
+
339
+ def _bootstrap_wild(self) -> np.ndarray:
340
+ """
341
+ Wild bootstrap.
342
+
343
+ Keeps X fixed and resamples residuals with random weights.
344
+ Designed for heteroskedasticity but does not account for
345
+ serial correlation.
346
+
347
+ Uses Rademacher distribution: w ∈ {-1, +1} with equal probability.
348
+ Alternative: Mammen distribution (set via _wild_distribution attribute).
349
+
350
+ Returns
351
+ -------
352
+ estimates : np.ndarray
353
+ Bootstrap coefficient estimates (n_bootstrap x n_params)
354
+
355
+ Notes
356
+ -----
357
+ Wild bootstrap is particularly useful for heteroskedastic errors.
358
+ It maintains the X matrix fixed and only resamples the error structure.
359
+
360
+ For panel data, this method resamples residuals for each observation
361
+ independently, which may not preserve serial correlation within entities.
362
+ Consider pairs bootstrap if serial correlation is a concern.
363
+
364
+ References
365
+ ----------
366
+ Liu, R. Y. (1988). "Bootstrap procedures under some non-i.i.d. models."
367
+ *The Annals of Statistics*, 16(4), 1696-1708.
368
+ """
369
+ # Get data
370
+ data_df = self.model.data.data
371
+ entity_col = self.model.data.entity_col
372
+ time_col = self.model.data.time_col
373
+
374
+ # Get residuals and fitted values from original model
375
+ residuals = self.results.resid
376
+ fitted_values = self.results.fittedvalues
377
+
378
+ # Get original data in same order
379
+ # We need to reconstruct y from fitted + residuals
380
+ y_original = fitted_values + residuals
381
+
382
+ # Storage for estimates
383
+ n_params = len(self.results.params)
384
+ estimates = np.zeros((self.n_bootstrap, n_params))
385
+
386
+ # Bootstrap loop
387
+ iterator = range(self.n_bootstrap)
388
+ if self.show_progress:
389
+ iterator = tqdm(iterator, desc=f"Bootstrap ({self.method})")
390
+
391
+ for b in iterator:
392
+ try:
393
+ # Generate wild bootstrap weights using Rademacher distribution
394
+ # w ∈ {-1, +1} with probability 0.5 each
395
+ weights = self.rng.choice([-1, 1], size=len(residuals))
396
+
397
+ # Create bootstrap residuals: e* = w * e
398
+ boot_residuals = weights * residuals
399
+
400
+ # Reconstruct bootstrap outcome: y* = ŷ + e*
401
+ y_boot = fitted_values + boot_residuals
402
+
403
+ # Create bootstrap dataset with new y values
404
+ boot_data = data_df.copy()
405
+
406
+ # Get the dependent variable name from the formula
407
+ dep_var = self.model.formula_parser.dependent
408
+ boot_data[dep_var] = y_boot
409
+
410
+ # Refit model on bootstrap sample
411
+ boot_model = self._create_bootstrap_model(boot_data)
412
+ boot_results = boot_model.fit()
413
+
414
+ # Store estimates
415
+ estimates[b, :] = boot_results.params.values
416
+
417
+ except Exception as e:
418
+ # If estimation fails, use NaN
419
+ estimates[b, :] = np.nan
420
+ self.n_failed_ += 1
421
+
422
+ if self.show_progress and self.n_failed_ <= 5:
423
+ print(f"\nBootstrap iteration {b} failed: {str(e)}")
424
+
425
+ # Remove failed replications
426
+ valid_mask = ~np.isnan(estimates).any(axis=1)
427
+ estimates = estimates[valid_mask, :]
428
+
429
+ if estimates.shape[0] < self.n_bootstrap * 0.5:
430
+ raise RuntimeError(
431
+ f"More than 50% of bootstrap replications failed. "
432
+ f"Only {estimates.shape[0]} out of {self.n_bootstrap} succeeded. "
433
+ "Check your model specification."
434
+ )
435
+
436
+ return estimates
437
+
438
+ def _bootstrap_block(self) -> np.ndarray:
439
+ """
440
+ Block bootstrap.
441
+
442
+ Resamples blocks of consecutive time periods. Preserves temporal
443
+ dependence within blocks.
444
+
445
+ Uses moving block bootstrap where blocks can overlap. Block size
446
+ is determined by the block_size parameter, or defaults to T^(1/3).
447
+
448
+ Returns
449
+ -------
450
+ estimates : np.ndarray
451
+ Bootstrap coefficient estimates (n_bootstrap x n_params)
452
+
453
+ Notes
454
+ -----
455
+ Block bootstrap is useful when there is temporal dependence in the data.
456
+ It preserves the correlation structure within blocks while breaking
457
+ dependence between blocks.
458
+
459
+ For panel data, blocks are time periods that apply to all entities.
460
+ This maintains the cross-sectional structure while accounting for
461
+ time-series dependence.
462
+
463
+ References
464
+ ----------
465
+ Künsch, H. R. (1989). "The jackknife and the bootstrap for general
466
+ stationary observations." *The Annals of Statistics*, 17(3), 1217-1241.
467
+ """
468
+ # Get data
469
+ data_df = self.model.data.data
470
+ entity_col = self.model.data.entity_col
471
+ time_col = self.model.data.time_col
472
+
473
+ # Get unique time periods
474
+ time_periods = sorted(data_df[time_col].unique())
475
+ n_periods = len(time_periods)
476
+
477
+ # Determine block size
478
+ if self.block_size is None:
479
+ # Rule of thumb: T^(1/3)
480
+ block_size = max(1, int(np.ceil(n_periods ** (1/3))))
481
+ if self.show_progress:
482
+ print(f"\nUsing automatic block size: {block_size} (T^(1/3) where T={n_periods})")
483
+ else:
484
+ block_size = self.block_size
485
+
486
+ if block_size > n_periods:
487
+ warnings.warn(
488
+ f"block_size={block_size} is larger than n_periods={n_periods}. "
489
+ f"Setting block_size={n_periods}",
490
+ UserWarning
491
+ )
492
+ block_size = n_periods
493
+
494
+ # Storage for estimates
495
+ n_params = len(self.results.params)
496
+ estimates = np.zeros((self.n_bootstrap, n_params))
497
+
498
+ # Bootstrap loop
499
+ iterator = range(self.n_bootstrap)
500
+ if self.show_progress:
501
+ iterator = tqdm(iterator, desc=f"Bootstrap ({self.method}, block_size={block_size})")
502
+
503
+ for b in iterator:
504
+ try:
505
+ # Resample blocks to cover approximately n_periods
506
+ boot_time_periods = []
507
+ n_blocks_needed = int(np.ceil(n_periods / block_size))
508
+
509
+ for _ in range(n_blocks_needed):
510
+ # Randomly select a starting point for the block
511
+ start_idx = self.rng.randint(0, n_periods - block_size + 1)
512
+
513
+ # Extract block of time periods
514
+ block = time_periods[start_idx:start_idx + block_size]
515
+ boot_time_periods.extend(block)
516
+
517
+ # Trim to original length
518
+ boot_time_periods = boot_time_periods[:n_periods]
519
+
520
+ # Create bootstrap sample by selecting these time periods
521
+ boot_data_list = []
522
+ for t in boot_time_periods:
523
+ time_data = data_df[data_df[time_col] == t].copy()
524
+ boot_data_list.append(time_data)
525
+
526
+ boot_data = pd.concat(boot_data_list, ignore_index=True)
527
+
528
+ # Refit model on bootstrap sample
529
+ boot_model = self._create_bootstrap_model(boot_data)
530
+ boot_results = boot_model.fit()
531
+
532
+ # Store estimates
533
+ estimates[b, :] = boot_results.params.values
534
+
535
+ except Exception as e:
536
+ # If estimation fails, use NaN
537
+ estimates[b, :] = np.nan
538
+ self.n_failed_ += 1
539
+
540
+ if self.show_progress and self.n_failed_ <= 5:
541
+ print(f"\nBootstrap iteration {b} failed: {str(e)}")
542
+
543
+ # Remove failed replications
544
+ valid_mask = ~np.isnan(estimates).any(axis=1)
545
+ estimates = estimates[valid_mask, :]
546
+
547
+ if estimates.shape[0] < self.n_bootstrap * 0.5:
548
+ raise RuntimeError(
549
+ f"More than 50% of bootstrap replications failed. "
550
+ f"Only {estimates.shape[0]} out of {self.n_bootstrap} succeeded. "
551
+ "Check your model specification."
552
+ )
553
+
554
+ return estimates
555
+
556
+ def _bootstrap_residual(self) -> np.ndarray:
557
+ """
558
+ Residual bootstrap.
559
+
560
+ Resamples residuals assuming they are i.i.d. Most restrictive
561
+ assumptions.
562
+
563
+ Returns
564
+ -------
565
+ estimates : np.ndarray
566
+ Bootstrap coefficient estimates (n_bootstrap x n_params)
567
+
568
+ Notes
569
+ -----
570
+ Residual bootstrap assumes residuals are independent and identically
571
+ distributed (i.i.d.). This is the most restrictive assumption among
572
+ bootstrap methods.
573
+
574
+ **When to use**:
575
+ - When you're confident errors are i.i.d.
576
+ - After conditioning on X (design matrix fixed)
577
+ - For computational efficiency
578
+
579
+ **When NOT to use**:
580
+ - With heteroskedasticity (use wild bootstrap instead)
581
+ - With serial correlation (use block or pairs bootstrap)
582
+ - With clustered errors (use pairs bootstrap)
583
+
584
+ The algorithm:
585
+ 1. Center residuals: e_centered = e - mean(e)
586
+ 2. Resample centered residuals with replacement
587
+ 3. Reconstruct y* = ŷ + e*
588
+ 4. Refit model
589
+
590
+ References
591
+ ----------
592
+ Efron, B., & Tibshirani, R. J. (1994). "An Introduction to the
593
+ Bootstrap." CRC press, Chapter 6.
594
+ """
595
+ # Get data
596
+ data_df = self.model.data.data
597
+
598
+ # Get residuals and fitted values from original model
599
+ residuals = self.results.resid
600
+ fitted_values = self.results.fittedvalues
601
+
602
+ # Center residuals (important for maintaining mean zero)
603
+ centered_residuals = residuals - np.mean(residuals)
604
+
605
+ # Storage for estimates
606
+ n_params = len(self.results.params)
607
+ estimates = np.zeros((self.n_bootstrap, n_params))
608
+
609
+ # Bootstrap loop
610
+ iterator = range(self.n_bootstrap)
611
+ if self.show_progress:
612
+ iterator = tqdm(iterator, desc=f"Bootstrap ({self.method})")
613
+
614
+ for b in iterator:
615
+ try:
616
+ # Resample centered residuals with replacement
617
+ boot_indices = self.rng.choice(len(residuals), size=len(residuals), replace=True)
618
+ boot_residuals = centered_residuals[boot_indices]
619
+
620
+ # Reconstruct bootstrap outcome: y* = ŷ + e*
621
+ y_boot = fitted_values + boot_residuals
622
+
623
+ # Create bootstrap dataset with new y values
624
+ boot_data = data_df.copy()
625
+
626
+ # Get the dependent variable name from the formula
627
+ dep_var = self.model.formula_parser.dependent
628
+ boot_data[dep_var] = y_boot
629
+
630
+ # Refit model on bootstrap sample
631
+ boot_model = self._create_bootstrap_model(boot_data)
632
+ boot_results = boot_model.fit()
633
+
634
+ # Store estimates
635
+ estimates[b, :] = boot_results.params.values
636
+
637
+ except Exception as e:
638
+ # If estimation fails, use NaN
639
+ estimates[b, :] = np.nan
640
+ self.n_failed_ += 1
641
+
642
+ if self.show_progress and self.n_failed_ <= 5:
643
+ print(f"\nBootstrap iteration {b} failed: {str(e)}")
644
+
645
+ # Remove failed replications
646
+ valid_mask = ~np.isnan(estimates).any(axis=1)
647
+ estimates = estimates[valid_mask, :]
648
+
649
+ if estimates.shape[0] < self.n_bootstrap * 0.5:
650
+ raise RuntimeError(
651
+ f"More than 50% of bootstrap replications failed. "
652
+ f"Only {estimates.shape[0]} out of {self.n_bootstrap} succeeded. "
653
+ "Check your model specification."
654
+ )
655
+
656
+ return estimates
657
+
658
+ def _create_bootstrap_model(self, boot_data: pd.DataFrame):
659
+ """
660
+ Create a new model instance with bootstrap data.
661
+
662
+ Parameters
663
+ ----------
664
+ boot_data : pd.DataFrame
665
+ Bootstrap sample data
666
+
667
+ Returns
668
+ -------
669
+ model
670
+ New model instance
671
+ """
672
+ # Get model class and parameters
673
+ model_class = type(self.model)
674
+
675
+ # Common parameters for all models
676
+ init_kwargs = {
677
+ 'formula': self.model.formula,
678
+ 'data': boot_data,
679
+ 'entity_col': self.model.data.entity_col,
680
+ 'time_col': self.model.data.time_col,
681
+ }
682
+
683
+ # Add model-specific parameters
684
+ if hasattr(self.model, 'entity_effects'):
685
+ init_kwargs['entity_effects'] = self.model.entity_effects
686
+ if hasattr(self.model, 'time_effects'):
687
+ init_kwargs['time_effects'] = self.model.time_effects
688
+ if hasattr(self.model, 'weights'):
689
+ init_kwargs['weights'] = self.model.weights
690
+
691
+ # Create model instance
692
+ boot_model = model_class(**init_kwargs)
693
+
694
+ return boot_model
695
+
696
+ def conf_int(
697
+ self,
698
+ alpha: float = 0.05,
699
+ method: Literal['percentile', 'basic', 'bca', 'studentized'] = 'percentile'
700
+ ) -> pd.DataFrame:
701
+ """
702
+ Compute bootstrap confidence intervals.
703
+
704
+ Parameters
705
+ ----------
706
+ alpha : float, default=0.05
707
+ Significance level (e.g., 0.05 for 95% CI)
708
+ method : {'percentile', 'basic', 'bca', 'studentized'}, default='percentile'
709
+ Method for computing confidence intervals:
710
+
711
+ - 'percentile': Percentile method (simplest, recommended)
712
+ - 'basic': Basic bootstrap (reflection method)
713
+ - 'bca': Bias-corrected and accelerated (most accurate but complex)
714
+ - 'studentized': Studentized bootstrap (requires nested bootstrap)
715
+
716
+ Returns
717
+ -------
718
+ conf_int : pd.DataFrame
719
+ Confidence intervals with columns 'lower' and 'upper'
720
+
721
+ Examples
722
+ --------
723
+ >>> # After running bootstrap
724
+ >>> ci_perc = bootstrap.conf_int(alpha=0.05, method='percentile')
725
+ >>> ci_basic = bootstrap.conf_int(alpha=0.05, method='basic')
726
+ >>>
727
+ >>> # Compare with asymptotic
728
+ >>> ci_asymp = results.conf_int(alpha=0.05)
729
+ """
730
+ if not self._fitted:
731
+ raise RuntimeError("Must call run() before conf_int()")
732
+
733
+ if method == 'percentile':
734
+ ci = self._conf_int_percentile(alpha)
735
+ elif method == 'basic':
736
+ ci = self._conf_int_basic(alpha)
737
+ elif method == 'bca':
738
+ ci = self._conf_int_bca(alpha)
739
+ elif method == 'studentized':
740
+ ci = self._conf_int_studentized(alpha)
741
+ else:
742
+ raise ValueError(
743
+ f"method must be 'percentile', 'basic', 'bca', or 'studentized', "
744
+ f"got '{method}'"
745
+ )
746
+
747
+ return ci
748
+
749
+ def _conf_int_percentile(self, alpha: float) -> pd.DataFrame:
750
+ """
751
+ Percentile confidence interval.
752
+
753
+ CI = [θ_α/2, θ_1-α/2] where θ_p is the p-th percentile of bootstrap estimates.
754
+ """
755
+ lower_pct = alpha / 2 * 100
756
+ upper_pct = (1 - alpha / 2) * 100
757
+
758
+ lower = np.percentile(self.bootstrap_estimates_, lower_pct, axis=0)
759
+ upper = np.percentile(self.bootstrap_estimates_, upper_pct, axis=0)
760
+
761
+ ci = pd.DataFrame({
762
+ 'lower': lower,
763
+ 'upper': upper
764
+ }, index=self.results.params.index)
765
+
766
+ return ci
767
+
768
+ def _conf_int_basic(self, alpha: float) -> pd.DataFrame:
769
+ """
770
+ Basic (reflection) confidence interval.
771
+
772
+ CI = [2θ_hat - θ_1-α/2, 2θ_hat - θ_α/2]
773
+ """
774
+ theta_hat = self.results.params.values
775
+
776
+ lower_pct = alpha / 2 * 100
777
+ upper_pct = (1 - alpha / 2) * 100
778
+
779
+ # Note the reversal for basic method
780
+ lower = 2 * theta_hat - np.percentile(self.bootstrap_estimates_, upper_pct, axis=0)
781
+ upper = 2 * theta_hat - np.percentile(self.bootstrap_estimates_, lower_pct, axis=0)
782
+
783
+ ci = pd.DataFrame({
784
+ 'lower': lower,
785
+ 'upper': upper
786
+ }, index=self.results.params.index)
787
+
788
+ return ci
789
+
790
+ def _conf_int_bca(self, alpha: float) -> pd.DataFrame:
791
+ """
792
+ Bias-corrected and accelerated (BCa) confidence interval.
793
+
794
+ More accurate than percentile but requires estimating bias and acceleration.
795
+ """
796
+ warnings.warn(
797
+ "BCa confidence intervals not yet fully implemented. "
798
+ "Falling back to percentile method.",
799
+ UserWarning
800
+ )
801
+ return self._conf_int_percentile(alpha)
802
+
803
+ def _conf_int_studentized(self, alpha: float) -> pd.DataFrame:
804
+ """
805
+ Studentized (bootstrap-t) confidence interval.
806
+
807
+ Uses bootstrap distribution of t-statistics. Most accurate but computationally
808
+ intensive (requires nested bootstrap).
809
+ """
810
+ warnings.warn(
811
+ "Studentized confidence intervals not yet fully implemented. "
812
+ "Falling back to percentile method.",
813
+ UserWarning
814
+ )
815
+ return self._conf_int_percentile(alpha)
816
+
817
+ def summary(self) -> pd.DataFrame:
818
+ """
819
+ Generate bootstrap summary table.
820
+
821
+ Returns
822
+ -------
823
+ summary : pd.DataFrame
824
+ Summary table with original estimates, bootstrap SEs, and comparison
825
+
826
+ Examples
827
+ --------
828
+ >>> summary = bootstrap.summary()
829
+ >>> print(summary)
830
+ """
831
+ if not self._fitted:
832
+ raise RuntimeError("Must call run() before summary()")
833
+
834
+ summary = pd.DataFrame({
835
+ 'Original': self.results.params,
836
+ 'Bootstrap Mean': self.bootstrap_estimates_.mean(axis=0),
837
+ 'Bootstrap Bias': self.bootstrap_estimates_.mean(axis=0) - self.results.params.values,
838
+ 'Original SE': self.results.std_errors,
839
+ 'Bootstrap SE': self.bootstrap_se_,
840
+ 'SE Ratio': self.bootstrap_se_ / self.results.std_errors.values
841
+ }, index=self.results.params.index)
842
+
843
+ return summary
844
+
845
def plot_distribution(self, param: Optional[str] = None):
    """
    Plot histograms of the bootstrap coefficient estimates.

    Each histogram is overlaid with a dashed red line at the original
    estimate. The figure is displayed with ``plt.show()``; nothing is
    returned.

    Parameters
    ----------
    param : str, optional
        Parameter name to plot. If None, plots all parameters in a grid
        with at most three columns.

    Raises
    ------
    ImportError
        If matplotlib is not installed
    RuntimeError
        If ``run()`` has not been called yet
    ValueError
        If ``param`` is not one of the model's parameters
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError(
            "matplotlib is required for plotting. "
            "Install with: pip install matplotlib"
        )

    if not self._fitted:
        raise RuntimeError("Must call run() before plot_distribution()")

    coefficients = self.results.params
    draws = self.bootstrap_estimates_

    if param is None:
        # Grid with one panel per parameter
        total = len(coefficients)
        grid_cols = min(3, total)
        grid_rows = (total + grid_cols - 1) // grid_cols

        fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(5*grid_cols, 4*grid_rows))
        if total == 1:
            # plt.subplots hands back a bare Axes for a 1x1 grid
            axes = np.array([axes])
        axes = axes.flatten()

        for position, label in enumerate(coefficients.index):
            panel = axes[position]
            point = coefficients.iloc[position]

            panel.hist(draws[:, position], bins=30, alpha=0.7, edgecolor='black')
            panel.axvline(point, color='red', linestyle='--',
                          linewidth=2, label=f'Original: {point:.4f}')
            panel.set_xlabel('Value')
            panel.set_ylabel('Frequency')
            panel.set_title(label)
            panel.legend(fontsize=8)

        # Hide the panels left over in the last grid row
        for spare in axes[total:]:
            spare.set_visible(False)

        plt.suptitle(f'Bootstrap Distributions ({self.method} method, n={self.n_bootstrap})',
                     fontsize=14, y=1.00)
        plt.tight_layout()
        plt.show()
        return

    # Single-parameter figure
    if param not in coefficients.index:
        raise ValueError(f"Parameter '{param}' not found in model")

    column = coefficients.index.get_loc(param)
    point = coefficients.iloc[column]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(draws[:, column], bins=50, alpha=0.7, edgecolor='black')
    ax.axvline(point, color='red', linestyle='--', linewidth=2,
               label=f'Original: {point:.4f}')
    ax.set_xlabel('Coefficient Value')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Bootstrap Distribution: {param}')
    ax.legend()
    plt.tight_layout()
    plt.show()
920
+
921
+ def __repr__(self) -> str:
922
+ """String representation."""
923
+ if self._fitted:
924
+ status = f"fitted with {self.n_bootstrap - self.n_failed_} successful replications"
925
+ else:
926
+ status = "not fitted"
927
+
928
+ return (
929
+ f"PanelBootstrap("
930
+ f"method='{self.method}', "
931
+ f"n_bootstrap={self.n_bootstrap}, "
932
+ f"{status})"
933
+ )