panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,386 @@
1
+ """
2
+ Cluster-robust standard errors for panel data.
3
+
4
+ This module implements one-way and two-way cluster-robust covariance
5
+ estimators commonly used in panel data applications.
6
+ """
7
+
8
+ from typing import Union, List, Optional
9
+ import numpy as np
10
+ import pandas as pd
11
+ from dataclasses import dataclass
12
+
13
+ from .utils import (
14
+ compute_bread,
15
+ compute_clustered_meat,
16
+ compute_twoway_clustered_meat,
17
+ sandwich_covariance,
18
+ clustered_covariance,
19
+ twoway_clustered_covariance
20
+ )
21
+
22
+
23
@dataclass
class ClusteredCovarianceResult:
    """
    Container for the output of a cluster-robust covariance estimation.

    Instances are plain records: the class defines no behavior beyond what
    ``@dataclass`` generates (``__init__``, ``__repr__``, ``__eq__``).
    Fields are positional in the order listed below.

    Attributes
    ----------
    cov_matrix : np.ndarray
        Cluster-robust covariance matrix (k x k)
    std_errors : np.ndarray
        Cluster-robust standard errors (k,)
    n_clusters : int or tuple
        Number of clusters for one-way clustering, or a tuple with one
        count per dimension for two-way clustering
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    cluster_dims : int
        Number of clustering dimensions (1 or 2)
    df_correction : bool
        Whether finite-sample correction was applied
    """
    # Covariance estimate and the standard errors derived from it.
    cov_matrix: np.ndarray
    std_errors: np.ndarray
    # Cluster count: a single int (one-way) or a tuple of counts (two-way).
    n_clusters: Union[int, tuple]
    # Dimensions of the estimation problem.
    n_obs: int
    n_params: int
    # Configuration the estimate was produced with.
    cluster_dims: int
    df_correction: bool
52
+
53
+
54
class ClusteredStandardErrors:
    """
    Cluster-robust standard errors for panel data.

    Implements one-way and two-way clustering with finite-sample corrections.
    The numerical work (bread, meat, sandwich product) is delegated to the
    helpers imported from ``.utils``; this class validates inputs, selects
    the one-way or two-way path and packages the result.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    clusters : np.ndarray or tuple of np.ndarray
        Cluster identifiers. Can be:
        - 1D array for one-way clustering
        - Tuple of two 1D arrays for two-way clustering
    df_correction : bool, default=True
        Apply finite-sample correction: G/(G-1) × (N-1)/(N-K)

    Attributes
    ----------
    X : np.ndarray
        Design matrix
    resid : np.ndarray
        Residuals
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    cluster_dims : int
        1 for one-way clustering, 2 for two-way clustering
    clusters : np.ndarray
        Cluster identifiers (only set for one-way clustering)
    clusters1, clusters2 : np.ndarray
        Cluster identifiers per dimension (only set for two-way clustering)

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional, if ``resid`` or any cluster array
        does not have one entry per observation, or if a tuple of clusters
        does not contain exactly two arrays.

    Examples
    --------
    >>> # One-way clustering by entity
    >>> clustered = ClusteredStandardErrors(X, resid, entity_ids)
    >>> result = clustered.compute()
    >>> print(result.std_errors)

    >>> # Two-way clustering by entity and time
    >>> clustered = ClusteredStandardErrors(X, resid, (entity_ids, time_ids))
    >>> result = clustered.compute()
    >>> print(result.std_errors)

    References
    ----------
    Cameron, A. C., Gelbach, J. B., & Miller, D. L. (2011).
    Robust inference with multiway clustering.
    Journal of Business & Economic Statistics, 29(2), 238-249.

    Petersen, M. A. (2009). Estimating standard errors in finance panel
    data sets: Comparing approaches. Review of Financial Studies,
    22(1), 435-480.
    """

    def __init__(
        self,
        X: np.ndarray,
        resid: np.ndarray,
        clusters: Union[np.ndarray, tuple],
        df_correction: bool = True
    ):
        # Coerce to arrays up front so shape checks work for list input too.
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be a 2D design matrix (n x k), got {X.ndim}D input"
            )
        resid = np.asarray(resid)

        self.X = X
        self.resid = resid
        self.n_obs, self.n_params = X.shape
        self.df_correction = df_correction

        # Fail early on a residual/design mismatch instead of letting it
        # surface (or silently broadcast) inside the covariance helpers.
        if resid.ndim == 0 or resid.shape[0] != self.n_obs:
            got = "a scalar" if resid.ndim == 0 else str(resid.shape[0])
            raise ValueError(
                f"Residual dimension mismatch: expected {self.n_obs}, got {got}"
            )

        # Handle one-way vs two-way clustering
        if isinstance(clusters, tuple):
            if len(clusters) != 2:
                raise ValueError("Two-way clustering requires exactly 2 cluster dimensions")
            self.clusters1 = np.asarray(clusters[0])
            self.clusters2 = np.asarray(clusters[1])
            self.cluster_dims = 2
        else:
            self.clusters = np.asarray(clusters)
            self.cluster_dims = 1

        # Validate dimensions
        if self.cluster_dims == 1:
            if len(self.clusters) != self.n_obs:
                raise ValueError(
                    f"Cluster dimension mismatch: expected {self.n_obs}, "
                    f"got {len(self.clusters)}"
                )
        else:
            if len(self.clusters1) != self.n_obs or len(self.clusters2) != self.n_obs:
                raise ValueError(
                    f"Cluster dimension mismatch: expected {self.n_obs} for each dimension"
                )

        # Cache for the bread matrix; filled lazily by the ``bread`` property.
        self._bread = None

    @property
    def bread(self) -> np.ndarray:
        """Compute and cache bread matrix."""
        if self._bread is None:
            self._bread = compute_bread(self.X)
        return self._bread

    @property
    def n_clusters(self) -> Union[int, tuple]:
        """Number of clusters (int for one-way, tuple of two ints for two-way)."""
        if self.cluster_dims == 1:
            return len(np.unique(self.clusters))
        return (
            len(np.unique(self.clusters1)),
            len(np.unique(self.clusters2)),
        )

    def compute(self) -> "ClusteredCovarianceResult":
        """
        Compute cluster-robust covariance matrix.

        Returns
        -------
        result : ClusteredCovarianceResult
            Cluster-robust covariance and standard errors

        Notes
        -----
        For one-way clustering:
            V = (X'X)^{-1} [Σ_g (X_g'ε_g)(X_g'ε_g)'] (X'X)^{-1}

        For two-way clustering (Cameron, Gelbach, Miller 2011):
            V = V_1 + V_2 - V_12

        where V_1 and V_2 are one-way clustered, and V_12 is clustered
        by the intersection.
        """
        if self.cluster_dims == 1:
            # One-way clustering
            meat = compute_clustered_meat(
                self.X,
                self.resid,
                self.clusters,
                self.df_correction
            )
        else:
            # Two-way clustering
            meat = compute_twoway_clustered_meat(
                self.X,
                self.resid,
                self.clusters1,
                self.clusters2,
                self.df_correction
            )

        cov_matrix = sandwich_covariance(self.bread, meat)

        # NOTE(review): V_1 + V_2 - V_12 is not guaranteed to be positive
        # semi-definite, so a negative diagonal entry would produce NaN
        # here — confirm whether the utils helpers guard against this.
        std_errors = np.sqrt(np.diag(cov_matrix))

        return ClusteredCovarianceResult(
            cov_matrix=cov_matrix,
            std_errors=std_errors,
            # Single source of truth for the cluster count.
            n_clusters=self.n_clusters,
            n_obs=self.n_obs,
            n_params=self.n_params,
            cluster_dims=self.cluster_dims,
            df_correction=self.df_correction
        )

    def diagnostic_summary(self) -> str:
        """
        Generate diagnostic summary for clustering.

        Returns
        -------
        summary : str
            Diagnostic information about clustering

        Notes
        -----
        Provides information about:
        - Number of clusters
        - Cluster sizes (min, max, mean) for one-way clustering
        - Warnings if few clusters (<20, and <10 for one-way)
        """
        lines = []
        lines.append("Cluster-Robust Standard Errors Diagnostics")
        lines.append("=" * 50)

        if self.cluster_dims == 1:
            # One pass over the ids yields both the labels and their sizes.
            unique_clusters, cluster_sizes = np.unique(
                self.clusters, return_counts=True
            )
            n_clust = len(unique_clusters)

            lines.append("Clustering dimension: 1")
            lines.append(f"Number of clusters: {n_clust}")
            lines.append(f"Observations: {self.n_obs}")
            lines.append(f"Avg obs per cluster: {self.n_obs / n_clust:.1f}")
            lines.append(f"Cluster size - min: {cluster_sizes.min()}")
            lines.append(f"Cluster size - max: {cluster_sizes.max()}")
            lines.append(f"Cluster size - mean: {cluster_sizes.mean():.1f}")

            # Warnings
            if n_clust < 20:
                lines.append("")
                lines.append("⚠ WARNING: Few clusters detected (<20)")
                lines.append(" Cluster-robust SEs may be unreliable with few clusters")
                if n_clust < 10:
                    lines.append("⚠ CRITICAL: Very few clusters (<10)")
                    lines.append(" Consider using alternative inference methods")

        else:
            n_clust1, n_clust2 = self.n_clusters

            lines.append("Clustering dimensions: 2")
            lines.append(f"Number of clusters (dim 1): {n_clust1}")
            lines.append(f"Number of clusters (dim 2): {n_clust2}")
            lines.append(f"Observations: {self.n_obs}")

            # Warnings
            if min(n_clust1, n_clust2) < 20:
                lines.append("")
                lines.append("⚠ WARNING: Few clusters in at least one dimension (<20)")

        lines.append("")
        lines.append(f"Finite-sample correction: {self.df_correction}")

        return "\n".join(lines)
283
+
284
+
285
def cluster_by_entity(
    X: np.ndarray,
    resid: np.ndarray,
    entity_ids: np.ndarray,
    df_correction: bool = True
) -> ClusteredCovarianceResult:
    """
    One-way cluster-robust covariance, clustering on the entity identifier.

    Thin wrapper that builds a ``ClusteredStandardErrors`` estimator with
    ``entity_ids`` as the single clustering dimension and returns the
    outcome of its ``compute()`` call.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    entity_ids : np.ndarray
        Entity identifiers (n,)
    df_correction : bool, default=True
        Apply finite-sample correction

    Returns
    -------
    result : ClusteredCovarianceResult
        Cluster-robust covariance and standard errors

    Examples
    --------
    >>> from panelbox.standard_errors import cluster_by_entity
    >>> result = cluster_by_entity(X, resid, entity_ids)
    >>> print(result.std_errors)
    """
    return ClusteredStandardErrors(
        X=X,
        resid=resid,
        clusters=entity_ids,
        df_correction=df_correction,
    ).compute()
318
+
319
+
320
def cluster_by_time(
    X: np.ndarray,
    resid: np.ndarray,
    time_ids: np.ndarray,
    df_correction: bool = True
) -> ClusteredCovarianceResult:
    """
    One-way cluster-robust covariance, clustering on the time identifier.

    Thin wrapper that builds a ``ClusteredStandardErrors`` estimator with
    ``time_ids`` as the single clustering dimension and returns the
    outcome of its ``compute()`` call.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    time_ids : np.ndarray
        Time identifiers (n,)
    df_correction : bool, default=True
        Apply finite-sample correction

    Returns
    -------
    result : ClusteredCovarianceResult
        Cluster-robust covariance and standard errors
    """
    return ClusteredStandardErrors(
        X=X,
        resid=resid,
        clusters=time_ids,
        df_correction=df_correction,
    ).compute()
347
+
348
+
349
def twoway_cluster(
    X: np.ndarray,
    resid: np.ndarray,
    cluster1: np.ndarray,
    cluster2: np.ndarray,
    df_correction: bool = True
) -> ClusteredCovarianceResult:
    """
    Two-way cluster-robust covariance over a pair of clustering dimensions.

    Thin wrapper that packs ``cluster1`` and ``cluster2`` into a tuple,
    which ``ClusteredStandardErrors`` treats as a two-way request, and
    returns the outcome of its ``compute()`` call.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    cluster1 : np.ndarray
        First clustering dimension (e.g., entity_ids)
    cluster2 : np.ndarray
        Second clustering dimension (e.g., time_ids)
    df_correction : bool, default=True
        Apply finite-sample correction

    Returns
    -------
    result : ClusteredCovarianceResult
        Two-way cluster-robust covariance and standard errors

    Examples
    --------
    >>> from panelbox.standard_errors import twoway_cluster
    >>> result = twoway_cluster(X, resid, entity_ids, time_ids)
    >>> print(result.std_errors)
    """
    cluster_pair = (cluster1, cluster2)
    estimator = ClusteredStandardErrors(
        X=X,
        resid=resid,
        clusters=cluster_pair,
        df_correction=df_correction,
    )
    return estimator.compute()