panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,457 @@
1
+ """
2
+ Jackknife inference for panel data models.
3
+
4
+ This module implements jackknife resampling for panel data, providing
5
+ alternative estimates of bias and variance. The jackknife is particularly
6
+ useful for small samples and provides influence diagnostics.
7
+
8
+ References
9
+ ----------
10
+ Efron, B., & Tibshirani, R. J. (1994). An Introduction to the Bootstrap.
11
+ Chapman and Hall/CRC.
12
+ Shao, J., & Tu, D. (1995). The Jackknife and Bootstrap.
13
+ Springer Science & Business Media.
14
+ """
15
+
16
+ from typing import Optional, Dict, Any, Tuple
17
+ import warnings
18
+ import numpy as np
19
+ import pandas as pd
20
+ from dataclasses import dataclass
21
+
22
+ from panelbox.core.results import PanelResults
23
+
24
+
25
@dataclass
class JackknifeResults:
    """
    Container for jackknife results.

    Attributes
    ----------
    jackknife_estimates : pd.DataFrame
        Parameter estimates for each jackknife sample (N x n_params)
    original_estimates : pd.Series
        Original parameter estimates
    jackknife_mean : pd.Series
        Mean of jackknife estimates
    jackknife_bias : pd.Series
        Jackknife bias estimates
    jackknife_se : pd.Series
        Jackknife standard errors
    influence : pd.DataFrame
        Influence values for each entity (one row per entity, one column
        per parameter)
    n_jackknife : int
        Number of jackknife samples (entities)
    """

    jackknife_estimates: pd.DataFrame
    original_estimates: pd.Series
    jackknife_mean: pd.Series
    jackknife_bias: pd.Series
    jackknife_se: pd.Series
    influence: pd.DataFrame
    n_jackknife: int

    def summary(self) -> str:
        """
        Generate a plain-text summary of the jackknife results.

        The report has two sections: a table with the original estimate,
        jackknife mean, bias and jackknife standard error for every
        parameter in ``original_estimates``, followed by the (up to) five
        entities with the largest absolute influence on any parameter.

        Returns
        -------
        summary_str : str
            Newline-joined report.
        """
        rule = "-" * 70
        lines = [
            "Jackknife Results",
            "=" * 70,
            f"Number of jackknife samples: {self.n_jackknife}",
            "",
            "Parameter Estimates and Bias:",
            rule,
            f"{'Parameter':<15} {'Original':>12} {'Jackknife':>12} "
            f"{'Bias':>12} {'SE (JK)':>12}",
            rule,
        ]

        for param in self.original_estimates.index:
            # ``!s`` converts the label to text before padding. Alignment
            # specs applied directly to non-string labels are unreliable:
            # dates treat the spec as a strftime pattern, tuples raise.
            lines.append(
                f"{param!s:<15} {self.original_estimates[param]:>12.6f} "
                f"{self.jackknife_mean[param]:>12.6f} "
                f"{self.jackknife_bias[param]:>12.6f} "
                f"{self.jackknife_se[param]:>12.6f}"
            )

        lines.extend(["", "Influential Entities:", rule])

        # Rank entities by their largest absolute influence on any parameter.
        max_influence = self.influence.abs().max(axis=1)
        top_influential = max_influence.nlargest(5)

        if len(top_influential) > 0:
            lines.append(f"{'Entity':>10} {'Max Influence':>15}")
            lines.append(rule)
            for entity, infl in top_influential.items():
                # Same reasoning as above: stringify the entity id first.
                lines.append(f"{entity!s:>10} {infl:>15.6f}")
        else:
            lines.append("No influential entities detected")

        return "\n".join(lines)
94
+
95
+
96
class PanelJackknife:
    """
    Jackknife inference for panel data models.

    The jackknife resampling method systematically leaves out one entity
    at a time and re-estimates the model. This provides estimates of:

    - Bias in parameter estimates
    - Standard errors
    - Influence of individual entities

    Parameters
    ----------
    results : PanelResults
        Fitted model results to jackknife
    verbose : bool, default=True
        Whether to print progress information

    Attributes
    ----------
    jackknife_results_ : JackknifeResults
        Jackknife results after calling run()

    Examples
    --------
    >>> import panelbox as pb
    >>> import pandas as pd
    >>>
    >>> # Fit model
    >>> data = pd.read_csv('panel_data.csv')
    >>> fe = pb.FixedEffects("y ~ x1 + x2", data, "entity_id", "time")
    >>> results = fe.fit()
    >>>
    >>> # Jackknife inference
    >>> jackknife = pb.PanelJackknife(results)
    >>> jk_results = jackknife.run()
    >>>
    >>> # View results
    >>> print(jk_results.summary())
    >>>
    >>> # Get bias-corrected estimates
    >>> bias_corrected = jackknife.bias_corrected_estimates()
    >>> print(bias_corrected)

    Notes
    -----
    - Each jackknife sample excludes one entity (all its time periods)
    - For N entities, requires N model re-estimations
    """

    def __init__(
        self,
        results: "PanelResults",
        verbose: bool = True
    ):
        self.results = results
        self.verbose = verbose

        # Extract model information
        self.model = results._model
        self.formula = results.formula
        self.entity_col = self.model.data.entity_col
        self.time_col = self.model.data.time_col

        # Get original data
        self.data = self.model.data.data

        # Get entities (sorted so the leave-one-out order is deterministic)
        self.entities = sorted(self.data[self.entity_col].unique())
        self.n_entities = len(self.entities)

        # Results storage; populated by run()
        self.jackknife_results_: Optional[JackknifeResults] = None

    def run(self) -> "JackknifeResults":
        """
        Run jackknife procedure.

        Returns
        -------
        jackknife_results : JackknifeResults
            Jackknife results containing estimates, bias, and standard errors

        Raises
        ------
        RuntimeError
            If every leave-one-out re-estimation fails

        Warns
        -----
        UserWarning
            For each leave-one-out sample whose re-estimation raises, and
            when fewer than two samples succeed (bias and standard errors
            are then identically zero).

        Notes
        -----
        The jackknife procedure:

        1. For each entity i:
           - Remove entity i from dataset
           - Re-estimate model on remaining N-1 entities
           - Store parameter estimates

        2. Compute jackknife statistics:
           - Mean of jackknife estimates
           - Bias: (N-1) * (jackknife_mean - original)
           - SE: sqrt((N-1)/N * sum((theta_i - mean)^2))
           - Influence: (N-1) * (original - theta_i)

        N is the number of *successful* re-estimations, which may be
        smaller than the number of entities if some samples failed.
        """
        if self.verbose:
            print("Starting jackknife procedure...")
            print(f"Total entities: {self.n_entities}")
            print("")

        # Parameter rows and the entity each row excludes are kept in
        # separate lists so an entity label can never collide with a
        # parameter name.
        replicate_rows = []
        completed_entities = []
        failed_samples = []

        # Original estimates
        original_estimates = self.results.params

        # The model class is the same for every replicate.
        model_class = type(self.model)

        # Perform leave-one-out
        for i, entity in enumerate(self.entities, 1):
            if self.verbose:
                print(f"Jackknife sample {i}/{self.n_entities}: "
                      f"Excluding entity {entity}")

            try:
                # Remove entity i
                jackknife_data = self.data[
                    self.data[self.entity_col] != entity
                ].copy()

                # Re-estimate model
                jackknife_model = model_class(
                    self.formula,
                    jackknife_data,
                    self.entity_col,
                    self.time_col
                )
                jackknife_result = jackknife_model.fit(
                    cov_type=self.results.cov_type
                )
                row = jackknife_result.params.to_dict()

            except Exception as e:
                # Best effort: a failed refit is reported and skipped so the
                # remaining samples can still be used.
                warnings.warn(f"Jackknife sample {i} (entity {entity}) failed: {str(e)}")
                failed_samples.append(entity)

            else:
                replicate_rows.append(row)
                completed_entities.append(entity)

        # Check if we have any successful samples
        if not replicate_rows:
            raise RuntimeError("All jackknife samples failed")

        if self.verbose and failed_samples:
            print(f"\nWarning: {len(failed_samples)} samples failed")
            print(f"Successfully completed: {len(replicate_rows)}/{self.n_entities}")

        # Build the replicate table with columns in the same order as the
        # original parameter vector. The influence computation below is
        # positional, so this alignment is what keeps each value under the
        # right parameter label. A parameter missing from a refit is NaN.
        jackknife_df = pd.DataFrame(
            replicate_rows,
            columns=original_estimates.index
        )

        # Compute jackknife statistics
        N = len(replicate_rows)

        if N < 2:
            warnings.warn(
                "Only one jackknife sample succeeded; jackknife bias and "
                "standard errors are degenerate (zero)"
            )

        # Mean of jackknife estimates
        jackknife_mean = jackknife_df.mean()

        # Jackknife bias: (N-1) * (mean_jackknife - theta_original)
        jackknife_bias = (N - 1) * (jackknife_mean - original_estimates)

        # Jackknife standard error: sqrt((N-1)/N * sum((theta_i - mean)^2))
        deviations = jackknife_df - jackknife_mean
        jackknife_variance = ((N - 1) / N) * (deviations ** 2).sum()
        jackknife_se = np.sqrt(jackknife_variance)

        # Influence: (N-1) * (theta_original - theta_(-i)), one row per
        # excluded entity.
        influence_df = pd.DataFrame(
            (N - 1) * (original_estimates.values - jackknife_df.values),
            columns=original_estimates.index,
            index=pd.Index(completed_entities, name='entity_excluded')
        )

        # Create results object
        self.jackknife_results_ = JackknifeResults(
            jackknife_estimates=jackknife_df,
            original_estimates=original_estimates,
            jackknife_mean=jackknife_mean,
            jackknife_bias=jackknife_bias,
            jackknife_se=jackknife_se,
            influence=influence_df,
            n_jackknife=N
        )

        if self.verbose:
            print("\nJackknife Complete!")
            print(f"Successful samples: {N}/{self.n_entities}")

        return self.jackknife_results_

    def bias_corrected_estimates(self) -> pd.Series:
        """
        Compute bias-corrected parameter estimates.

        Returns
        -------
        bias_corrected : pd.Series
            Bias-corrected estimates: original - bias

        Raises
        ------
        RuntimeError
            If run() has not been called yet

        Notes
        -----
        Bias correction formula:
            theta_corrected = theta_original - bias
        where bias = (N-1) * (mean_jackknife - theta_original)

        This is equivalent to:
            theta_corrected = N * theta_original - (N-1) * mean_jackknife
        """
        if self.jackknife_results_ is None:
            raise RuntimeError("Must call run() before bias_corrected_estimates()")

        bias_corrected = (
            self.jackknife_results_.original_estimates -
            self.jackknife_results_.jackknife_bias
        )

        return bias_corrected

    def confidence_intervals(
        self,
        alpha: float = 0.05,
        method: str = 'normal'
    ) -> pd.DataFrame:
        """
        Compute confidence intervals using jackknife standard errors.

        Parameters
        ----------
        alpha : float, default=0.05
            Significance level (e.g., 0.05 for 95% CI). Must lie strictly
            between 0 and 1.
        method : {'normal', 'percentile'}, default='normal'
            Method for computing confidence intervals:

            - 'normal': Normal approximation using jackknife SE
            - 'percentile': Percentile method using jackknife distribution

        Returns
        -------
        ci : pd.DataFrame
            Confidence intervals with columns 'lower' and 'upper'

        Raises
        ------
        RuntimeError
            If run() has not been called yet
        ValueError
            If ``alpha`` is not strictly between 0 and 1, or ``method`` is
            not one of the supported names

        Notes
        -----
        NOTE(review): the 'percentile' method takes quantiles of the
        leave-one-out estimates directly. Those replicates are usually much
        less dispersed than the sampling distribution, so these intervals
        may be too narrow -- confirm this is the intended behaviour.
        """
        if self.jackknife_results_ is None:
            raise RuntimeError("Must call run() before confidence_intervals()")

        # Reject out-of-range (and NaN) levels up front instead of letting
        # them turn into NaN or infinite bounds.
        if not 0 < alpha < 1:
            raise ValueError(
                f"alpha must be strictly between 0 and 1, got {alpha}"
            )

        if method == 'normal':
            # Normal approximation
            from scipy import stats
            z = stats.norm.ppf(1 - alpha / 2)

            lower = (self.jackknife_results_.original_estimates -
                     z * self.jackknife_results_.jackknife_se)
            upper = (self.jackknife_results_.original_estimates +
                     z * self.jackknife_results_.jackknife_se)

        elif method == 'percentile':
            # Percentile method
            lower = self.jackknife_results_.jackknife_estimates.quantile(alpha / 2)
            upper = self.jackknife_results_.jackknife_estimates.quantile(1 - alpha / 2)

        else:
            raise ValueError(f"Unknown method: {method}. Use 'normal' or 'percentile'")

        ci = pd.DataFrame({
            'lower': lower,
            'upper': upper
        })

        return ci

    def influential_entities(
        self,
        threshold: float = 2.0,
        metric: str = 'max'
    ) -> pd.DataFrame:
        """
        Identify influential entities based on jackknife influence.

        Parameters
        ----------
        threshold : float, default=2.0
            Threshold for influence (in units of mean absolute influence)
        metric : {'max', 'mean', 'sum'}, default='max'
            How to aggregate influence across parameters:

            - 'max': Maximum absolute influence across parameters
            - 'mean': Mean absolute influence across parameters
            - 'sum': Sum of absolute influences across parameters

        Returns
        -------
        influential : pd.DataFrame
            One row per entity whose aggregate influence exceeds the cutoff,
            with columns 'entity', 'influence' and 'threshold' (the cutoff
            that was applied, repeated on every row)

        Raises
        ------
        RuntimeError
            If run() has not been called yet
        ValueError
            If ``metric`` is not one of the supported names
        """
        if self.jackknife_results_ is None:
            raise RuntimeError("Must call run() before influential_entities()")

        influence = self.jackknife_results_.influence

        # Compute aggregate influence
        if metric == 'max':
            aggregate_influence = influence.abs().max(axis=1)
        elif metric == 'mean':
            aggregate_influence = influence.abs().mean(axis=1)
        elif metric == 'sum':
            aggregate_influence = influence.abs().sum(axis=1)
        else:
            raise ValueError(f"Unknown metric: {metric}. Use 'max', 'mean', or 'sum'")

        # The cutoff is relative: `threshold` times the average aggregate
        # influence across all entities.
        mean_influence = aggregate_influence.mean()
        influence_threshold = threshold * mean_influence

        # Filter influential entities
        flagged = aggregate_influence[aggregate_influence > influence_threshold]
        influential = pd.DataFrame({
            'entity': flagged.index,
            'influence': flagged.values,
            'threshold': influence_threshold
        })

        return influential

    def summary(self) -> str:
        """
        Generate summary of jackknife results.

        Returns
        -------
        summary_str : str
            Formatted summary string

        Raises
        ------
        RuntimeError
            If run() has not been called yet
        """
        if self.jackknife_results_ is None:
            raise RuntimeError("Must call run() before summary()")

        return self.jackknife_results_.summary()