panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cluster-robust standard errors for panel data.
|
|
3
|
+
|
|
4
|
+
This module implements one-way and two-way cluster-robust covariance
|
|
5
|
+
estimators commonly used in panel data applications.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Union, List, Optional
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
from .utils import (
|
|
14
|
+
compute_bread,
|
|
15
|
+
compute_clustered_meat,
|
|
16
|
+
compute_twoway_clustered_meat,
|
|
17
|
+
sandwich_covariance,
|
|
18
|
+
clustered_covariance,
|
|
19
|
+
twoway_clustered_covariance
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ClusteredCovarianceResult:
    """
    Container for the output of a cluster-robust covariance estimation.

    Instances are produced by ``ClusteredStandardErrors.compute`` and bundle
    the estimated covariance together with the metadata describing how it
    was obtained.

    Attributes
    ----------
    cov_matrix : np.ndarray
        Cluster-robust covariance matrix of the parameters (k x k)
    std_errors : np.ndarray
        Cluster-robust standard errors, one per parameter (k,)
    n_clusters : int or tuple
        Cluster count; a single int for one-way clustering, or a tuple
        holding one count per dimension for two-way clustering
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    cluster_dims : int
        How many clustering dimensions were used (1 or 2)
    df_correction : bool
        True when the finite-sample correction was requested
    """

    # Field order is part of the positional constructor signature.
    cov_matrix: np.ndarray
    std_errors: np.ndarray
    n_clusters: Union[int, tuple]
    n_obs: int
    n_params: int
    cluster_dims: int
    df_correction: bool
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ClusteredStandardErrors:
    """
    Cluster-robust standard errors for panel data.

    Implements one-way and two-way clustering with finite-sample corrections.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    clusters : np.ndarray or tuple of np.ndarray
        Cluster identifiers. Can be:
        - 1D array for one-way clustering
        - Tuple of two 1D arrays for two-way clustering
    df_correction : bool, default=True
        Apply finite-sample correction: G/(G-1) × (N-1)/(N-K)

    Attributes
    ----------
    X : np.ndarray
        Design matrix
    resid : np.ndarray
        Residuals
    n_obs : int
        Number of observations
    n_params : int
        Number of parameters
    cluster_dims : int
        Number of clustering dimensions (1 or 2)
    clusters : np.ndarray
        Cluster identifiers (only set for one-way clustering)
    clusters1, clusters2 : np.ndarray
        Cluster identifiers per dimension (only set for two-way clustering)

    Raises
    ------
    ValueError
        If a tuple of clusters does not contain exactly two arrays, or if
        the residuals or any cluster array do not have one entry per
        observation.

    Examples
    --------
    >>> # One-way clustering by entity
    >>> clustered = ClusteredStandardErrors(X, resid, entity_ids)
    >>> result = clustered.compute()
    >>> print(result.std_errors)

    >>> # Two-way clustering by entity and time
    >>> clustered = ClusteredStandardErrors(X, resid, (entity_ids, time_ids))
    >>> result = clustered.compute()
    >>> print(result.std_errors)

    References
    ----------
    Cameron, A. C., Gelbach, J. B., & Miller, D. L. (2011).
    Robust inference with multiway clustering.
    Journal of Business & Economic Statistics, 29(2), 238-249.

    Petersen, M. A. (2009). Estimating standard errors in finance panel
    data sets: Comparing approaches. Review of Financial Studies,
    22(1), 435-480.
    """

    def __init__(
        self,
        X: np.ndarray,
        resid: np.ndarray,
        clusters: Union[np.ndarray, tuple],
        df_correction: bool = True
    ) -> None:
        self.X = X
        self.resid = resid
        self.n_obs, self.n_params = X.shape
        self.df_correction = df_correction

        # Residuals must line up with the rows of X; reject a mismatch here
        # rather than letting it surface (or broadcast) during compute().
        n_resid = np.size(resid)
        if n_resid != self.n_obs:
            raise ValueError(
                f"Residual dimension mismatch: expected {self.n_obs}, "
                f"got {n_resid}"
            )

        # Handle one-way vs two-way clustering; each branch validates that
        # its identifier arrays have one entry per observation.
        if isinstance(clusters, tuple):
            if len(clusters) != 2:
                raise ValueError("Two-way clustering requires exactly 2 cluster dimensions")
            self.clusters1 = np.asarray(clusters[0])
            self.clusters2 = np.asarray(clusters[1])
            self.cluster_dims = 2
            if len(self.clusters1) != self.n_obs or len(self.clusters2) != self.n_obs:
                raise ValueError(
                    f"Cluster dimension mismatch: expected {self.n_obs} for each dimension"
                )
        else:
            self.clusters = np.asarray(clusters)
            self.cluster_dims = 1
            if len(self.clusters) != self.n_obs:
                raise ValueError(
                    f"Cluster dimension mismatch: expected {self.n_obs}, "
                    f"got {len(self.clusters)}"
                )

        # Cache for the bread matrix, filled lazily by the `bread` property
        self._bread = None

    @property
    def bread(self) -> np.ndarray:
        """Compute and cache bread matrix."""
        if self._bread is None:
            self._bread = compute_bread(self.X)
        return self._bread

    @property
    def n_clusters(self) -> Union[int, tuple]:
        """
        Number of clusters.

        Returns an int for one-way clustering, or a tuple
        ``(n_clusters_dim1, n_clusters_dim2)`` for two-way clustering.
        """
        if self.cluster_dims == 1:
            return len(np.unique(self.clusters))
        return (
            len(np.unique(self.clusters1)),
            len(np.unique(self.clusters2)),
        )

    def compute(self) -> "ClusteredCovarianceResult":
        """
        Compute cluster-robust covariance matrix.

        Returns
        -------
        result : ClusteredCovarianceResult
            Cluster-robust covariance and standard errors

        Notes
        -----
        For one-way clustering:
            V = (X'X)^{-1} [Σ_g (X_g'ε_g)(X_g'ε_g)'] (X'X)^{-1}

        For two-way clustering (Cameron, Gelbach, Miller 2011):
            V = V_1 + V_2 - V_12

        where V_1 and V_2 are one-way clustered, and V_12 is clustered
        by the intersection.
        """
        if self.cluster_dims == 1:
            # One-way clustering
            meat = compute_clustered_meat(
                self.X,
                self.resid,
                self.clusters,
                self.df_correction
            )
        else:
            # Two-way clustering
            meat = compute_twoway_clustered_meat(
                self.X,
                self.resid,
                self.clusters1,
                self.clusters2,
                self.df_correction
            )

        cov_matrix = sandwich_covariance(self.bread, meat)
        std_errors = np.sqrt(np.diag(cov_matrix))

        return ClusteredCovarianceResult(
            cov_matrix=cov_matrix,
            std_errors=std_errors,
            # Reuse the property so the cluster count is defined in one place
            n_clusters=self.n_clusters,
            n_obs=self.n_obs,
            n_params=self.n_params,
            cluster_dims=self.cluster_dims,
            df_correction=self.df_correction
        )

    def diagnostic_summary(self) -> str:
        """
        Generate diagnostic summary for clustering.

        Returns
        -------
        summary : str
            Diagnostic information about clustering

        Notes
        -----
        Provides information about:
        - Number of clusters
        - Cluster sizes (min, max, mean; one-way clustering only)
        - Warnings if few clusters (<20, and <10 for one-way clustering)
        """
        lines = [
            "Cluster-Robust Standard Errors Diagnostics",
            "=" * 50,
        ]

        if self.cluster_dims == 1:
            # A single pass yields both the distinct ids and their sizes,
            # instead of one full-array comparison per cluster.
            unique_clusters, cluster_sizes = np.unique(
                self.clusters, return_counts=True
            )
            n_clust = len(unique_clusters)

            lines.append("Clustering dimension: 1")
            lines.append(f"Number of clusters: {n_clust}")
            lines.append(f"Observations: {self.n_obs}")
            lines.append(f"Avg obs per cluster: {self.n_obs / n_clust:.1f}")
            lines.append(f"Cluster size - min: {cluster_sizes.min()}")
            lines.append(f"Cluster size - max: {cluster_sizes.max()}")
            lines.append(f"Cluster size - mean: {cluster_sizes.mean():.1f}")

            # Warnings
            if n_clust < 20:
                lines.append("")
                lines.append("⚠ WARNING: Few clusters detected (<20)")
                lines.append(" Cluster-robust SEs may be unreliable with few clusters")
            if n_clust < 10:
                lines.append("⚠ CRITICAL: Very few clusters (<10)")
                lines.append(" Consider using alternative inference methods")

        else:
            n_clust1, n_clust2 = self.n_clusters

            lines.append("Clustering dimensions: 2")
            lines.append(f"Number of clusters (dim 1): {n_clust1}")
            lines.append(f"Number of clusters (dim 2): {n_clust2}")
            lines.append(f"Observations: {self.n_obs}")

            # Warnings
            if min(n_clust1, n_clust2) < 20:
                lines.append("")
                lines.append("⚠ WARNING: Few clusters in at least one dimension (<20)")

        lines.append("")
        lines.append(f"Finite-sample correction: {self.df_correction}")

        return "\n".join(lines)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def cluster_by_entity(
    X: np.ndarray,
    resid: np.ndarray,
    entity_ids: np.ndarray,
    df_correction: bool = True
) -> ClusteredCovarianceResult:
    """
    One-way cluster-robust covariance, clustering on the entity identifier.

    Shorthand for building a ``ClusteredStandardErrors`` object with
    ``entity_ids`` as the single clustering dimension and calling
    ``compute()`` on it.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    entity_ids : np.ndarray
        Entity identifiers (n,)
    df_correction : bool, default=True
        Apply finite-sample correction

    Returns
    -------
    result : ClusteredCovarianceResult
        Cluster-robust covariance and standard errors

    Examples
    --------
    >>> from panelbox.standard_errors import cluster_by_entity
    >>> result = cluster_by_entity(X, resid, entity_ids)
    >>> print(result.std_errors)
    """
    return ClusteredStandardErrors(
        X,
        resid,
        entity_ids,
        df_correction=df_correction,
    ).compute()
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def cluster_by_time(
    X: np.ndarray,
    resid: np.ndarray,
    time_ids: np.ndarray,
    df_correction: bool = True
) -> ClusteredCovarianceResult:
    """
    One-way cluster-robust covariance, clustering on the time identifier.

    Shorthand for building a ``ClusteredStandardErrors`` object with
    ``time_ids`` as the single clustering dimension and calling
    ``compute()`` on it.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    time_ids : np.ndarray
        Time identifiers (n,)
    df_correction : bool, default=True
        Apply finite-sample correction

    Returns
    -------
    result : ClusteredCovarianceResult
        Cluster-robust covariance and standard errors
    """
    return ClusteredStandardErrors(
        X,
        resid,
        time_ids,
        df_correction=df_correction,
    ).compute()
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def twoway_cluster(
    X: np.ndarray,
    resid: np.ndarray,
    cluster1: np.ndarray,
    cluster2: np.ndarray,
    df_correction: bool = True
) -> ClusteredCovarianceResult:
    """
    Two-way cluster-robust covariance over a pair of clustering dimensions.

    Shorthand for building a ``ClusteredStandardErrors`` object with the
    tuple ``(cluster1, cluster2)`` and calling ``compute()`` on it.

    Parameters
    ----------
    X : np.ndarray
        Design matrix (n x k)
    resid : np.ndarray
        Residuals (n,)
    cluster1 : np.ndarray
        First clustering dimension (e.g., entity_ids)
    cluster2 : np.ndarray
        Second clustering dimension (e.g., time_ids)
    df_correction : bool, default=True
        Apply finite-sample correction

    Returns
    -------
    result : ClusteredCovarianceResult
        Two-way cluster-robust covariance and standard errors

    Examples
    --------
    >>> from panelbox.standard_errors import twoway_cluster
    >>> result = twoway_cluster(X, resid, entity_ids, time_ids)
    >>> print(result.std_errors)
    """
    # The estimator treats a 2-tuple of id arrays as a two-way request.
    cluster_pair = (cluster1, cluster2)
    estimator = ClusteredStandardErrors(
        X, resid, cluster_pair, df_correction=df_correction
    )
    return estimator.compute()
|