panelbox-0.2.0-py3-none-any.whl → panelbox-0.4.0-py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
panelbox/models/static/first_difference.py (new file)
@@ -0,0 +1,494 @@
+"""
+First Difference estimator for panel data.
+
+This module provides the First Difference estimator which eliminates
+entity fixed effects through first-differencing.
+"""
+
+from typing import Optional
+import numpy as np
+import pandas as pd
+
+from panelbox.core.base_model import PanelModel
+from panelbox.core.results import PanelResults
+from panelbox.utils.matrix_ops import (
+    compute_ols,
+    compute_vcov_nonrobust,
+    compute_panel_rsquared
+)
+from panelbox.standard_errors import (
+    robust_covariance,
+    cluster_by_entity,
+    twoway_cluster,
+    driscoll_kraay,
+    newey_west,
+    pcse
+)
+
+
+class FirstDifferenceEstimator(PanelModel):
+    """
+    First Difference estimator for panel data.
+
+    This estimator eliminates unobserved entity-specific fixed effects
+    through first-differencing. Instead of demeaning (like Fixed Effects),
+    it takes differences:
+        Δy_it = y_it - y_{i,t-1} = β Δx_it + Δε_it
+
+    where Δ denotes the first difference operator.
+
+    The entity fixed effect (α_i) cancels out because it's time-invariant:
+        Δα_i = α_i - α_i = 0
+
+    Advantages over Fixed Effects (within estimator):
+    - More robust when T is small (few time periods)
+    - Better suited for models with serially correlated errors
+    - Handles unbalanced panels naturally
+    - No dummy variable trap issues
+
+    Disadvantages:
+    - Loses one time period per entity (first period dropped)
+    - Amplifies measurement error (differences magnify noise)
+    - Less efficient than FE under homoskedastic errors
+    - Loses time-invariant variables (like FE)
+
+    Parameters
+    ----------
+    formula : str
+        Model formula in R-style syntax (e.g., "y ~ x1 + x2")
+    data : pd.DataFrame
+        Panel data in long format (must be sorted by entity and time)
+    entity_col : str
+        Name of the column identifying entities
+    time_col : str
+        Name of the column identifying time periods
+    weights : np.ndarray, optional
+        Observation weights (applied to differenced data)
+
+    Attributes
+    ----------
+    n_obs_original : int
+        Number of observations before differencing
+    n_obs_differenced : int
+        Number of observations after differencing (loses first period per entity)
+
+    Examples
+    --------
+    >>> import panelbox as pb
+    >>> import pandas as pd
+    >>>
+    >>> # Load data
+    >>> data = pb.load_grunfeld()
+    >>>
+    >>> # First Difference estimator
+    >>> fd = pb.FirstDifferenceEstimator("invest ~ value + capital", data, "firm", "year")
+    >>> results = fd.fit(cov_type='robust')
+    >>> print(results.summary())
+    >>>
+    >>> # Compare with Fixed Effects
+    >>> fe = pb.FixedEffects("invest ~ value + capital", data, "firm", "year")
+    >>> results_fe = fe.fit(cov_type='robust')
+    >>>
+    >>> print(f"First Diff coefs: {results.params.values}")
+    >>> print(f"Fixed Effects coefs: {results_fe.params.values}")
+    >>>
+    >>> # Under homoskedasticity, should be similar
+    >>> # Under serial correlation, FD may be more consistent
+
+    Notes
+    -----
+    **Data Requirements:**
+    - Data must be sorted by entity and time before estimation
+    - Missing periods will be handled by taking differences only within consecutive observations
+    - At least 2 time periods per entity required
+
+    **First Differencing:**
+    - For each entity i, compute: Δy_it = y_it - y_{i,t-1}
+    - Drops the first observation for each entity
+    - If N entities and T periods (balanced): N*T → N*(T-1) observations
+
+    **Inference:**
+    - Standard errors account for structure of differenced data
+    - Cluster-robust SE recommended (clustering by entity)
+    - Driscoll-Kraay useful for serial correlation and heteroskedasticity
+
+    **Comparison with Fixed Effects:**
+    - FE uses within transformation (demeaning): y_it - ȳ_i
+    - FD uses first difference: y_it - y_{i,t-1}
+    - Under random walk: y_it = y_{i,t-1} + ε_it → FD removes unit root
+    - Under classical RE/FE assumptions: FE is more efficient
+
+    References
+    ----------
+    .. [1] Wooldridge, J. M. (2010). Econometric Analysis of Cross Section
+           and Panel Data. MIT Press. Section 10.5.
+    .. [2] Baltagi, B. H. (2013). Econometric Analysis of Panel Data.
+           Wiley. Chapter 3.
+    .. [3] Hsiao, C. (2014). Analysis of Panel Data. Cambridge University Press.
+    """
+
+    def __init__(
+        self,
+        formula: str,
+        data: pd.DataFrame,
+        entity_col: str,
+        time_col: str,
+        weights: Optional[np.ndarray] = None
+    ):
+        super().__init__(formula, data, entity_col, time_col, weights)
+
+        # Store original observation count
+        self.n_obs_original = len(data)
+        self.n_obs_differenced: Optional[int] = None
+
+    def fit(
+        self,
+        cov_type: str = 'nonrobust',
+        **cov_kwds
+    ) -> PanelResults:
+        """
+        Fit the First Difference estimator.
+
+        Parameters
+        ----------
+        cov_type : str, default='nonrobust'
+            Type of covariance estimator:
+            - 'nonrobust': Classical standard errors
+            - 'robust' or 'hc1': Heteroskedasticity-robust (HC1)
+            - 'hc0', 'hc2', 'hc3': Other HC variants
+            - 'clustered': Cluster-robust (by entity, recommended for FD)
+            - 'twoway': Two-way clustered (entity and time)
+            - 'driscoll_kraay': Driscoll-Kraay (for serial correlation)
+            - 'newey_west': Newey-West HAC
+            - 'pcse': Panel-Corrected Standard Errors
+        **cov_kwds
+            Additional arguments for covariance estimation:
+            - cluster_col: For custom clustering (default: entity)
+            - max_lags: For Driscoll-Kraay and Newey-West
+            - kernel: For HAC estimators ('bartlett', 'parzen', 'quadratic_spectral')
+
+        Returns
+        -------
+        PanelResults
+            Fitted model results
+
+        Examples
+        --------
+        >>> # Classical standard errors
+        >>> results = model.fit(cov_type='nonrobust')
+
+        >>> # Heteroskedasticity-robust (recommended)
+        >>> results = model.fit(cov_type='robust')
+
+        >>> # Cluster-robust by entity (recommended for FD)
+        >>> results = model.fit(cov_type='clustered')
+
+        >>> # Driscoll-Kraay (for serial correlation + heteroskedasticity)
+        >>> results = model.fit(cov_type='driscoll_kraay', max_lags=2)
+
+        Notes
+        -----
+        For First Difference models, clustered or Driscoll-Kraay standard errors
+        are typically recommended because:
+        - Differencing can induce serial correlation (MA(1) structure)
+        - Cluster-robust SE account for within-entity correlation
+        - Driscoll-Kraay handles both serial correlation and heteroskedasticity
+        """
+        # Build design matrices from original data
+        y_orig, X_orig = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get variable names
+        var_names = self.formula_parser.get_variable_names(self.data.data)
+
+        # Remove intercept from variable names (differencing eliminates it)
+        # First differences remove constant terms
+        if 'Intercept' in var_names:
+            var_names = [v for v in var_names if v != 'Intercept']
+            # Remove intercept column from X
+            X_orig = X_orig[:, 1:]
+
+        # Get entity and time identifiers
+        entities = self.data.data[self.data.entity_col].values
+        times = self.data.data[self.data.time_col].values
+
+        # Apply first difference transformation
+        y_diff, X_diff, entities_diff, times_diff, valid_idx = self._first_difference(
+            y_orig, X_orig, entities, times
+        )
+
+        # Store differenced observation count
+        self.n_obs_differenced = len(y_diff)
+
+        # Check that we have enough observations
+        if self.n_obs_differenced < X_diff.shape[1]:
+            raise ValueError(
+                f"Insufficient observations after differencing: {self.n_obs_differenced} obs, "
+                f"{X_diff.shape[1]} parameters. Need at least 2 time periods per entity."
+            )
+
+        # Estimate coefficients on differenced data (no intercept)
+        beta, resid_diff, fitted_diff = compute_ols(y_diff, X_diff, self.weights)
+
+        # Degrees of freedom
+        n = len(y_diff)
+        k = X_diff.shape[1]
+        df_model = k  # No intercept in first difference
+        df_resid = n - k
+
+        # Ensure df_resid is positive
+        if df_resid <= 0:
+            raise ValueError(
+                f"Insufficient degrees of freedom: df_resid = {df_resid}. "
+                f"n={n}, k={k}"
+            )
+
+        # Compute fitted values and residuals in original scale (levels)
+        # This requires integrating back from differences (not unique, use cumsum)
+        # For reporting purposes, we'll use the differenced residuals
+        fitted_orig = np.full(len(y_orig), np.nan)
+        resid_orig = np.full(len(y_orig), np.nan)
+        fitted_orig[valid_idx] = fitted_diff
+        resid_orig[valid_idx] = resid_diff
+
+        # Compute covariance matrix (on differenced data)
+        cov_type_lower = cov_type.lower()
+
+        if cov_type_lower == 'nonrobust':
+            vcov = compute_vcov_nonrobust(X_diff, resid_diff, df_resid)
+
+        elif cov_type_lower in ['robust', 'hc0', 'hc1', 'hc2', 'hc3']:
+            # Map 'robust' to 'hc1'
+            method = 'HC1' if cov_type_lower == 'robust' else cov_type_lower.upper()
+            result = robust_covariance(X_diff, resid_diff, method=method)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'clustered':
+            # Cluster by entity (recommended for FD)
+            result = cluster_by_entity(X_diff, resid_diff, entities_diff, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'twoway':
+            # Two-way clustering: entity and time
+            result = twoway_cluster(X_diff, resid_diff, entities_diff, times_diff, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'driscoll_kraay':
+            # Driscoll-Kraay for serial correlation (recommended for FD)
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = driscoll_kraay(X_diff, resid_diff, times_diff, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'newey_west':
+            # Newey-West HAC
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = newey_west(X_diff, resid_diff, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'pcse':
+            # Panel-Corrected Standard Errors
+            result = pcse(X_diff, resid_diff, entities_diff, times_diff)
+            vcov = result.cov_matrix
+
+        else:
+            raise ValueError(
+                f"cov_type must be one of: 'nonrobust', 'robust', 'hc0', 'hc1', 'hc2', 'hc3', "
+                f"'clustered', 'twoway', 'driscoll_kraay', 'newey_west', 'pcse', got '{cov_type}'"
+            )
+
+        # Standard errors
+        std_errors = np.sqrt(np.diag(vcov))
+
+        # Compute R-squared measures on differenced data
+        # For FD, R² measures fit of differenced model
+        tss_diff = np.sum((y_diff - y_diff.mean()) ** 2)
+        ess_diff = np.sum(resid_diff ** 2)
+        rsquared = 1 - ess_diff / tss_diff if tss_diff > 0 else 0.0
+
+        # Adjusted R-squared
+        rsquared_adj = 1 - (1 - rsquared) * (n - 1) / df_resid
+
+        # For FD, within/between/overall R² are less meaningful
+        # We report the R² of the differenced model as the primary measure
+        rsquared_within = rsquared  # Differencing is similar to within transformation
+        rsquared_between = np.nan  # Not applicable for FD
+        rsquared_overall = np.nan  # Not applicable for FD
+
+        # Create Series/DataFrame with variable names
+        params = pd.Series(beta.ravel(), index=var_names)
+        std_errors_series = pd.Series(std_errors, index=var_names)
+        cov_params = pd.DataFrame(vcov, index=var_names, columns=var_names)
+
+        # Model information
+        model_info = {
+            'model_type': 'First Difference',
+            'formula': self.formula,
+            'cov_type': cov_type,
+            'cov_kwds': cov_kwds,
+            'entity_effects': True,  # FD eliminates entity FE
+            'time_effects': False,
+        }
+
+        # Data information
+        data_info = {
+            'nobs': n,  # Number of differenced observations
+            'n_entities': self.data.n_entities,
+            'n_periods': self.data.n_periods,
+            'n_obs_original': self.n_obs_original,
+            'n_obs_dropped': self.n_obs_original - n,
+            'df_model': df_model,
+            'df_resid': df_resid,
+            'entity_index': entities_diff,
+            'time_index': times_diff,
+        }
+
+        # R-squared dictionary
+        rsquared_dict = {
+            'rsquared': rsquared,  # R² of differenced model
+            'rsquared_adj': rsquared_adj,
+            'rsquared_within': rsquared_within,
+            'rsquared_between': rsquared_between,
+            'rsquared_overall': rsquared_overall
+        }
+
+        # Create results object
+        results = PanelResults(
+            params=params,
+            std_errors=std_errors_series,
+            cov_params=cov_params,
+            resid=resid_orig,  # Residuals in original indexing
+            fittedvalues=fitted_orig,  # Fitted values in original indexing
+            model_info=model_info,
+            data_info=data_info,
+            rsquared_dict=rsquared_dict,
+            model=self
+        )
+
+        # Store results and update state
+        self._results = results
+        self._fitted = True
+
+        return results
+
+    def _first_difference(
+        self,
+        y: np.ndarray,
+        X: np.ndarray,
+        entities: np.ndarray,
+        times: np.ndarray
+    ) -> tuple:
+        """
+        Apply first difference transformation.
+
+        Computes Δy_it = y_it - y_{i,t-1} and Δx_it = x_it - x_{i,t-1}
+        for each entity i.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            Dependent variable
+        X : np.ndarray
+            Independent variables
+        entities : np.ndarray
+            Entity identifiers
+        times : np.ndarray
+            Time identifiers
+
+        Returns
+        -------
+        y_diff : np.ndarray
+            Differenced dependent variable
+        X_diff : np.ndarray
+            Differenced independent variables
+        entities_diff : np.ndarray
+            Entity identifiers for differenced observations
+        times_diff : np.ndarray
+            Time identifiers for differenced observations
+        valid_idx : np.ndarray
+            Indices of valid differenced observations in original data
+        """
+        # Get unique entities
+        unique_entities = np.unique(entities)
+
+        # Initialize lists for differenced data
+        y_diff_list = []
+        X_diff_list = []
+        entities_diff_list = []
+        times_diff_list = []
+        valid_idx_list = []
+
+        # For each entity, compute first differences
+        for entity in unique_entities:
+            # Get observations for this entity
+            mask = entities == entity
+            indices = np.where(mask)[0]
+
+            # Get entity-specific data
+            y_entity = y[mask]
+            X_entity = X[mask]
+            times_entity = times[mask]
+
+            # Sort by time (should already be sorted, but ensure)
+            sort_idx = np.argsort(times_entity)
+            y_entity = y_entity[sort_idx]
+            X_entity = X_entity[sort_idx]
+            times_entity = times_entity[sort_idx]
+            indices_sorted = indices[sort_idx]
+
+            # Compute first differences (drop first observation)
+            if len(y_entity) >= 2:
+                y_diff_entity = y_entity[1:] - y_entity[:-1]
+                X_diff_entity = X_entity[1:] - X_entity[:-1]
+                times_diff_entity = times_entity[1:]  # Use time of current period
+                entities_diff_entity = np.full(len(y_diff_entity), entity)
+                valid_idx_entity = indices_sorted[1:]  # Indices in original data
+
+                # Append to lists
+                y_diff_list.append(y_diff_entity)
+                X_diff_list.append(X_diff_entity)
+                entities_diff_list.append(entities_diff_entity)
+                times_diff_list.append(times_diff_entity)
+                valid_idx_list.append(valid_idx_entity)
+
+        # Concatenate all entities
+        y_diff = np.concatenate(y_diff_list)
+        X_diff = np.vstack(X_diff_list)
+        entities_diff = np.concatenate(entities_diff_list)
+        times_diff = np.concatenate(times_diff_list)
+        valid_idx = np.concatenate(valid_idx_list)
+
+        return y_diff, X_diff, entities_diff, times_diff, valid_idx

+    def _estimate_coefficients(self) -> np.ndarray:
+        """
+        Estimate coefficients (implementation of abstract method).
+
+        Returns
+        -------
+        np.ndarray
+            Estimated coefficients
+        """
+        # Build design matrices
+        y, X = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Remove intercept
+        if self.formula_parser.has_intercept:
+            X = X[:, 1:]
+
+        # Get identifiers
+        entities = self.data.data[self.data.entity_col].values
+        times = self.data.data[self.data.time_col].values
+
+        # Apply first difference
+        y_diff, X_diff, _, _, _ = self._first_difference(y, X, entities, times)
+
+        # OLS on differenced data
+        beta, _, _ = compute_ols(y_diff, X_diff, self.weights)
+        return beta
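The new estimator is self-contained, but the core Δ transformation is easy to try in isolation. The following standalone sketch is not part of panelbox: the toy data frame and the pandas groupby().diff() shortcut are illustrative assumptions that mirror what _first_difference does, namely drop each entity's first period, difference y and X within entities, and then run OLS without an intercept.

# Illustrative sketch only (not package code): first-differencing a toy balanced panel.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "firm": [1, 1, 1, 2, 2, 2],
    "year": [2000, 2001, 2002, 2000, 2001, 2002],
    "y":    [1.0, 2.0, 4.0, 3.0, 3.5, 5.0],
    "x":    [0.5, 1.0, 2.0, 1.5, 2.0, 3.0],
}).sort_values(["firm", "year"])

# Δy_it = y_it - y_{i,t-1}, computed within each entity; each firm's first year is dropped.
diffed = toy.groupby("firm")[["y", "x"]].diff().dropna()

# OLS on the differenced data with no intercept gives the first-difference slope.
beta, *_ = np.linalg.lstsq(diffed[["x"]].to_numpy(), diffed["y"].to_numpy(), rcond=None)
print(diffed)
print(beta)

With 6 original rows and 2 firms, the differenced sample has 4 rows, matching the N*T → N*(T-1) count quoted in the class docstring.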
panelbox/models/static/fixed_effects.py
@@ -17,6 +17,14 @@ from panelbox.utils.matrix_ops import (
     compute_panel_rsquared,
     demean_matrix
 )
+from panelbox.standard_errors import (
+    robust_covariance,
+    cluster_by_entity,
+    twoway_cluster,
+    driscoll_kraay,
+    newey_west,
+    pcse
+)
 
 
 class FixedEffects(PanelModel):
@@ -121,10 +129,18 @@ class FixedEffects(PanelModel):
         cov_type : str, default='nonrobust'
             Type of covariance estimator:
             - 'nonrobust': Classical standard errors
-            - 'robust': Heteroskedasticity-robust (HC1)
-            - '
+            - 'robust' or 'hc1': Heteroskedasticity-robust (HC1)
+            - 'hc0', 'hc2', 'hc3': Other HC variants
+            - 'clustered': Cluster-robust (by entity by default)
+            - 'twoway': Two-way clustered (entity and time)
+            - 'driscoll_kraay': Driscoll-Kraay (spatial/temporal dependence)
+            - 'newey_west': Newey-West HAC
+            - 'pcse': Panel-Corrected Standard Errors (requires T > N)
         **cov_kwds
-            Additional arguments for covariance estimation
+            Additional arguments for covariance estimation:
+            - cluster_col: For custom clustering (default: entity)
+            - max_lags: For Driscoll-Kraay and Newey-West
+            - kernel: For HAC estimators ('bartlett', 'parzen', 'quadratic_spectral')
 
         Returns
         -------
@@ -133,8 +149,27 @@ class FixedEffects(PanelModel):
 
         Examples
         --------
+        >>> # Classical standard errors
+        >>> results = model.fit(cov_type='nonrobust')
+
+        >>> # Heteroskedasticity-robust
         >>> results = model.fit(cov_type='robust')
-        >>>
+        >>> results = model.fit(cov_type='hc3')
+
+        >>> # Cluster-robust by entity
+        >>> results = model.fit(cov_type='clustered')
+
+        >>> # Two-way clustering
+        >>> results = model.fit(cov_type='twoway')
+
+        >>> # Driscoll-Kraay (for spatial/temporal dependence)
+        >>> results = model.fit(cov_type='driscoll_kraay', max_lags=3)
+
+        >>> # Newey-West HAC
+        >>> results = model.fit(cov_type='newey_west', max_lags=4)
+
+        >>> # Panel-Corrected SE (requires T > N)
+        >>> results = model.fit(cov_type='pcse')
         """
         # Build design matrices
         y_orig, X_orig = self.formula_parser.build_design_matrices(
@@ -211,16 +246,50 @@ class FixedEffects(PanelModel):
         )
 
         # Compute covariance matrix (on demeaned data)
-
+        cov_type_lower = cov_type.lower()
+
+        if cov_type_lower == 'nonrobust':
             vcov = compute_vcov_nonrobust(X, resid_demeaned, df_resid)
-
-
-
-
+
+        elif cov_type_lower in ['robust', 'hc0', 'hc1', 'hc2', 'hc3']:
+            # Map 'robust' to 'hc1' (default robust method)
+            method = 'HC1' if cov_type_lower == 'robust' else cov_type_lower.upper()
+            result = robust_covariance(X, resid_demeaned, method=method)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'clustered':
+            # Default: cluster by entity
+            result = cluster_by_entity(X, resid_demeaned, entities, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'twoway':
+            # Two-way clustering: entity and time
+            result = twoway_cluster(X, resid_demeaned, entities, times, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'driscoll_kraay':
+            # Driscoll-Kraay for spatial/temporal dependence
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = driscoll_kraay(X, resid_demeaned, times, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'newey_west':
+            # Newey-West HAC
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = newey_west(X, resid_demeaned, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'pcse':
+            # Panel-Corrected Standard Errors
+            result = pcse(X, resid_demeaned, entities, times)
+            vcov = result.cov_matrix
+
         else:
             raise ValueError(
-                f"cov_type must be 'nonrobust', 'robust',
-                f"got '{cov_type}'"
+                f"cov_type must be one of: 'nonrobust', 'robust', 'hc0', 'hc1', 'hc2', 'hc3', "
+                f"'clustered', 'twoway', 'driscoll_kraay', 'newey_west', 'pcse', got '{cov_type}'"
             )
 
         # Standard errors
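Both estimators now route cov_type='clustered' through cluster_by_entity in panelbox.standard_errors. As a reference for what that branch computes, below is a generic entity-clustered (CR1-style) sandwich estimator written from the textbook formula V = (X'X)^-1 (Σ_g X_g' u_g u_g' X_g) (X'X)^-1 with a small-sample correction. It is a sketch only; the packaged implementation may differ in details such as the exact correction applied when df_correction=True.

# Illustrative sketch only, not the panelbox implementation.
import numpy as np

def clustered_vcov(X: np.ndarray, resid: np.ndarray, groups: np.ndarray) -> np.ndarray:
    """Textbook CR1 cluster-robust covariance for OLS residuals."""
    n, k = X.shape
    bread = np.linalg.inv(X.T @ X)
    meat = np.zeros((k, k))
    for g in np.unique(groups):
        Xg = X[groups == g]
        ug = resid[groups == g]
        score = Xg.T @ ug              # within-cluster score (length k)
        meat += np.outer(score, score)
    n_groups = np.unique(groups).size
    # CR1 finite-sample correction: G/(G-1) * (n-1)/(n-k)
    c = (n_groups / (n_groups - 1)) * ((n - 1) / (n - k))
    return c * bread @ meat @ bread

For a first-difference model, groups would be the entities_diff array, so the correction shrinks as the number of entities grows; with very few clusters the standard errors remain unreliable regardless of the correction.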