panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mundlak test for Random Effects specification.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Mundlak, Y. (1978). On the pooling of time series and cross section data.
|
|
7
|
+
Econometrica, 46(1), 69-85.
|
|
8
|
+
|
|
9
|
+
Wooldridge, J. M. (2010). Econometric Analysis of Cross Section and Panel Data
|
|
10
|
+
(2nd ed.). MIT Press.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from scipy import stats
|
|
16
|
+
|
|
17
|
+
from panelbox.validation.base import ValidationTest, ValidationTestResult
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MundlakTest(ValidationTest):
    """
    Mundlak test for Random Effects specification.

    Tests whether the random effects assumption that entity effects are
    uncorrelated with the regressors is valid.

    H0: Cov(u_i, X_it) = 0 (RE is appropriate)
    H1: Cov(u_i, X_it) ≠ 0 (use FE instead)

    The test augments the model with the entity (time) averages of the
    time-varying regressors and tests if their coefficients are jointly zero.

    Notes
    -----
    This is essentially testing the same thing as the Hausman test, but
    implemented differently. If the Mundlak test rejects, it suggests
    that Fixed Effects should be used instead of Random Effects.

    The reported statistic is a Wald chi-squared statistic on the
    coefficients of the entity-averaged variables; the equivalent
    F-form (Wald / df) is stored in the result metadata.

    Examples
    --------
    >>> from panelbox.models.static.random_effects import RandomEffects
    >>> re = RandomEffects("y ~ x1 + x2", data, "entity", "time")
    >>> results = re.fit()
    >>>
    >>> from panelbox.validation.specification.mundlak import MundlakTest
    >>> test = MundlakTest(results)
    >>> result = test.run()
    >>> print(result)
    """

    def __init__(self, results: 'PanelResults'):
        """
        Initialize Mundlak test.

        Parameters
        ----------
        results : PanelResults
            Results from panel model estimation (preferably Random Effects)

        Warns
        -----
        UserWarning
            If ``self.model_type`` does not contain ``'Random Effects'``.
        """
        super().__init__(results)

        # NOTE(review): self.model_type is presumably set by
        # ValidationTest.__init__ -- confirm against the base class.
        if 'Random Effects' not in self.model_type:
            import warnings
            warnings.warn(
                "Mundlak test is designed for Random Effects models. "
                f"Current model: {self.model_type}",
                stacklevel=2,  # attribute the warning to the caller's line
            )

    def run(self, alpha: float = 0.05) -> ValidationTestResult:
        """
        Run Mundlak test for RE specification.

        Parameters
        ----------
        alpha : float, default=0.05
            Significance level

        Returns
        -------
        ValidationTestResult
            Test results. ``statistic`` is the Wald chi-squared statistic
            and ``df`` the number of entity-mean terms that were tested.

        Raises
        ------
        ValueError
            If the data, formula or variable names cannot be recovered from
            the fitted model, if no time-varying regressor is found, if the
            augmented model cannot be estimated, or if the entity-mean
            coefficients are missing from the augmented estimates.

        Notes
        -----
        The test procedure:
        1. Estimate augmented model: y_it = X_it*beta + X_i_bar*delta + e_it
           where X_i_bar are entity means of time-varying variables
        2. Test H0: delta = 0 using Wald test
        3. If reject, RE assumption is violated → use FE

        Implementation:
        The augmented model is estimated by Pooled OLS with standard errors
        clustered by entity (not by a Random Effects estimator), and the
        Wald statistic is built from that cluster-robust covariance matrix.
        Regressors that do not vary within any entity are left out of the
        set of entity means because their mean equals the regressor itself
        and would make the augmented design perfectly collinear.
        """
        # Get original data, formula, and variable names
        data, formula, entity_col, time_col, var_names = self._get_data_full()

        if data is None or formula is None or var_names is None:
            raise ValueError(
                "Data, formula, and variable names required for Mundlak test. "
                "Ensure the model was estimated with a formula and panel structure."
            )

        # Work on a copy so the entity-mean columns never leak into the
        # caller's DataFrame.
        data_aug = data.copy()
        by_entity = data.groupby(entity_col)

        # Entity means of every time-varying regressor present as a column.
        mean_vars = []
        for var in var_names:
            if var not in data_aug.columns:
                continue
            entity_mean = by_entity[var].transform('mean')
            if self._is_time_invariant(data_aug[var], entity_mean):
                # x_i_bar == x_it for all rows: adding it would duplicate
                # the regressor and make the design matrix singular.
                continue
            mean_col_name = f'{var}_mean'
            data_aug[mean_col_name] = entity_mean
            mean_vars.append(mean_col_name)

        if not mean_vars:
            raise ValueError(
                "No time-varying regressors found. "
                "Mundlak test requires at least one time-varying regressor."
            )

        # Build augmented formula: y ~ x1 + x2 + ... + x1_mean + x2_mean + ...
        dep_var = formula.split('~')[0].strip()
        rhs = ' + '.join([*var_names, *mean_vars])
        augmented_formula = f"{dep_var} ~ {rhs}"

        # Estimate augmented model with cluster-robust SE.
        # NOTE: We use Pooled OLS with clustered SE instead of RE because
        # the PanelBox RE implementation has numerical issues with variables
        # that are constant within-group (like group means).
        try:
            from panelbox.models.static.pooled_ols import PooledOLS

            model_augmented = PooledOLS(
                augmented_formula,
                data_aug,
                entity_col,
                time_col
            )
            # Use cluster-robust SE (clustered by entity)
            aug_results = model_augmented.fit(
                cov_type='clustered',
                cov_kwds={'groups': entity_col}
            )
        except Exception as e:
            # Keep the original exception chained for debugging.
            raise ValueError(
                f"Failed to estimate augmented model: {e}"
            ) from e

        k_vars = len(mean_vars)

        # Locate each entity-mean coefficient by name, in mean_vars order, so
        # that the labels used below always match the extracted values.
        param_names = list(aug_results.params.index)
        position = {name: idx for idx, name in enumerate(param_names)}
        mean_indices = [position[name] for name in mean_vars if name in position]

        if len(mean_indices) != k_vars:
            raise ValueError(
                f"Expected {k_vars} mean coefficients, found {len(mean_indices)}"
            )

        # delta and its cluster-robust covariance block
        delta = np.asarray(
            aug_results.params.iloc[mean_indices].values, dtype=float
        ).ravel()
        vcov_full = aug_results.cov_params
        vcov_delta = vcov_full.iloc[mean_indices, mean_indices].values

        # Wald test: delta' Var(delta)^{-1} delta ~ Chi2(k_vars)
        try:
            vcov_delta_inv = np.linalg.inv(vcov_delta)
        except np.linalg.LinAlgError:
            # Singular covariance block: fall back to the pseudo-inverse.
            vcov_delta_inv = np.linalg.pinv(vcov_delta)

        wald_stat = float(delta @ vcov_delta_inv @ delta)

        # Degrees of freedom
        df = k_vars

        # Upper-tail probability; sf() keeps precision for tiny p-values
        # where 1 - cdf() would round to zero.
        pvalue = float(stats.chi2.sf(wald_stat, df))

        # Metadata
        delta_dict = {
            name: float(value) for name, value in zip(mean_vars, delta)
        }

        se_delta = np.sqrt(np.diag(vcov_delta))
        se_dict = {
            name: float(value) for name, value in zip(mean_vars, se_delta)
        }

        metadata = {
            'n_time_varying_vars': k_vars,
            'delta_coefficients': delta_dict,
            'standard_errors': se_dict,
            'F_statistic': wald_stat / df if df > 0 else 0.0,
            'augmented_formula': augmented_formula,
            'implementation': 'Pooled OLS with cluster-robust SE (entity-clustered)'
        }

        result = ValidationTestResult(
            test_name="Mundlak Test for RE Specification",
            statistic=wald_stat,
            pvalue=pvalue,
            null_hypothesis="RE is consistent (entity effects uncorrelated with regressors)",
            alternative_hypothesis="RE is inconsistent (use Fixed Effects)",
            alpha=alpha,
            df=df,
            metadata=metadata
        )

        return result

    @staticmethod
    def _is_time_invariant(values, entity_means) -> bool:
        """
        Tell whether a regressor equals its own entity mean on every row.

        Parameters
        ----------
        values : array-like
            Observed values of the regressor.
        entity_means : array-like
            Entity mean of the regressor, aligned row by row with `values`.

        Returns
        -------
        bool
            True when all non-missing observations match their entity mean
            up to floating-point rounding. False when the values cannot be
            converted to float or when every observation is missing, so
            that such columns are handled exactly as before.
        """
        try:
            observed = np.asarray(values, dtype=float)
            means = np.asarray(entity_means, dtype=float)
        except (TypeError, ValueError):
            return False

        present = ~np.isnan(observed)
        if not present.any():
            return False

        # Tight tolerances: only rounding noise from the mean computation
        # should count as "no within-entity variation".
        return bool(
            np.allclose(observed[present], means[present], rtol=1e-10, atol=1e-12)
        )

    def _get_data_full(self):
        """
        Get full data including DataFrame, formula, and variable names.

        Returns
        -------
        tuple
            (data, formula, entity_col, time_col, var_names) or
            (None, None, None, None, None) if not available

        Notes
        -----
        This method extracts:
        - data: copy of ``results._model.data.data``
        - formula: ``results._model.formula`` (e.g., "y ~ x1 + x2")
        - entity_col: Name of entity column
        - time_col: Name of time column
        - var_names: List of regressor names (excluding constant)
        """
        unavailable = (None, None, None, None, None)

        if not hasattr(self.results, '_model'):
            return unavailable

        model = self.results._model

        if not (hasattr(model, 'formula_parser') and hasattr(model, 'data')):
            return unavailable

        # Deliberately best-effort: any failure while reading the model is
        # reported to the caller as "not available" rather than raised.
        try:
            # Get original data
            data = model.data.data.copy()

            # Get entity and time columns
            entity_col = model.data.entity_col
            time_col = model.data.time_col

            # Get formula
            if not hasattr(model, 'formula'):
                return unavailable
            formula = model.formula

            if hasattr(model.formula_parser, 'rhs_terms'):
                # Get RHS terms (excluding Intercept)
                var_names = [
                    term for term in model.formula_parser.rhs_terms
                    if term.lower() not in ['intercept', '1']
                ]
            else:
                # Fallback: parse formula manually
                # Format: "y ~ x1 + x2 + ..."
                rhs = formula.split('~')[1].strip()
                terms = [t.strip() for t in rhs.split('+')]
                var_names = [
                    t for t in terms
                    if t.lower() not in ['1', 'intercept', '']
                ]

            return data, formula, entity_col, time_col, var_names

        except Exception:
            return unavailable

    def _get_data(self):
        """
        Get design matrix, dependent variable, and entity indices.

        Returns
        -------
        tuple
            (X, y, entities) or (None, None, None) if not available

        Notes
        -----
        This is a legacy method kept for compatibility.
        New code should use _get_data_full() instead.
        """
        if not hasattr(self.results, '_model'):
            return None, None, None

        model = self.results._model

        if not (hasattr(model, 'formula_parser') and hasattr(model, 'data')):
            return None, None, None

        # Best-effort, like _get_data_full(): failures map to "not available".
        try:
            y, X = model.formula_parser.build_design_matrices(
                model.data.data,
                return_type='array'
            )

            entities = model.data.data[model.data.entity_col].values.ravel()

            return X, y.ravel(), entities

        except Exception:
            return None, None, None
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RESET test for specification errors in panel data models.
|
|
3
|
+
|
|
4
|
+
RESET = Regression Equation Specification Error Test
|
|
5
|
+
|
|
6
|
+
References
|
|
7
|
+
----------
|
|
8
|
+
Ramsey, J. B. (1969). Tests for Specification Errors in Classical Linear
|
|
9
|
+
Least Squares Regression Analysis. Journal of the Royal Statistical Society,
|
|
10
|
+
Series B, 31(2), 350-371.
|
|
11
|
+
|
|
12
|
+
Wooldridge, J. M. (2010). Econometric Analysis of Cross Section and Panel Data
|
|
13
|
+
(2nd ed.). MIT Press.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
|
+
from scipy import stats
|
|
19
|
+
|
|
20
|
+
from panelbox.validation.base import ValidationTest, ValidationTestResult
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RESETTest(ValidationTest):
    """
    RESET test for functional form specification.

    Tests the null hypothesis that the model is correctly specified
    (linear functional form is appropriate) against the alternative
    that nonlinear terms are needed.

    H0: E[y | X] = X*beta (linear specification is correct)
    H1: E[y | X] includes higher-order terms of fitted values

    The test augments the model with powers of fitted values (ŷ², ŷ³, ...)
    and tests if these terms are jointly significant.

    Notes
    -----
    The test is implemented by:
    1. Taking the fitted values ŷ of the already estimated model
    2. Augmenting model: y = X*beta + gamma2*ŷ² + gamma3*ŷ³ + ... + u
    3. Testing H0: gamma2 = gamma3 = ... = 0 using an F-test

    For panel data, we use pooled OLS with cluster-robust standard errors
    to account for within-group correlation.

    Common practice is to include powers 2 and 3 (default).

    Examples
    --------
    >>> from panelbox.models.static.pooled_ols import PooledOLS
    >>> model = PooledOLS("y ~ x1 + x2", data, "entity", "time")
    >>> results = model.fit()
    >>>
    >>> from panelbox.validation.specification.reset import RESETTest
    >>> test = RESETTest(results)
    >>> result = test.run(powers=[2, 3])  # Test with ŷ² and ŷ³
    >>> print(result)
    """

    def __init__(self, results: 'PanelResults'):
        """
        Initialize RESET test.

        Parameters
        ----------
        results : PanelResults
            Results from panel model estimation
        """
        super().__init__(results)

    def run(self, powers=None, alpha: float = 0.05) -> ValidationTestResult:
        """
        Run RESET test for specification errors.

        Parameters
        ----------
        powers : list of int, optional
            Powers of fitted values to include in augmented regression.
            Default is [2, 3] (quadratic and cubic terms). Repeated powers
            are used only once.
        alpha : float, default=0.05
            Significance level

        Returns
        -------
        ValidationTestResult
            Test results. ``statistic`` is the F-statistic and ``df`` the
            pair (numerator df, denominator df).

        Raises
        ------
        ValueError
            If `powers` is empty or contains anything other than integers
            >= 2, if the data, formula or fitted values are not available,
            or if the augmented model cannot be estimated.

        Notes
        -----
        The test uses an F-statistic for testing joint significance of
        the polynomial terms, obtained as the cluster-robust Wald statistic
        divided by the number of polynomial terms. The chi-squared p-value
        of the Wald statistic is stored in the metadata as ``pvalue_chi2``.
        """
        powers = [2, 3] if powers is None else list(powers)

        # Validate powers. An empty list would otherwise pass vacuously and
        # end in a malformed formula and a division by zero.
        if not powers:
            raise ValueError("Powers must contain at least one integer >= 2")
        if not all(isinstance(p, int) and p >= 2 for p in powers):
            raise ValueError("Powers must be integers >= 2")

        # A repeated power would create duplicate columns/coefficients and
        # inflate the numerator degrees of freedom; keep first occurrences.
        powers = list(dict.fromkeys(powers))

        # Get data
        data, formula, entity_col, time_col, var_names = self._get_data_full()

        if data is None or formula is None:
            raise ValueError(
                "Data and formula required for RESET test. "
                "Ensure the model was estimated with a formula."
            )

        # Get fitted values from original model
        fitted = self.results.fitted_values
        if fitted is None:
            raise ValueError("Fitted values not available from model results")

        # Create augmented dataset with powers of fitted values
        # NOTE(review): assumes fitted_values is aligned row by row with the
        # model's DataFrame -- confirm against PanelResults.
        data_aug = data.copy()
        power_vars = []
        for power in powers:
            var_name = f'fitted_pow{power}'
            data_aug[var_name] = fitted ** power
            power_vars.append(var_name)

        # Build augmented formula. Joining one combined list avoids a
        # dangling "+" when the original model has no named regressors.
        dep_var = formula.split('~')[0].strip()
        rhs = ' + '.join([*var_names, *power_vars])
        augmented_formula = f"{dep_var} ~ {rhs}"

        # Estimate augmented model with cluster-robust SE
        try:
            from panelbox.models.static.pooled_ols import PooledOLS

            model_aug = PooledOLS(
                augmented_formula,
                data_aug,
                entity_col,
                time_col
            )
            results_aug = model_aug.fit(
                cov_type='clustered',
                cov_kwds={'groups': entity_col}
            )
        except Exception as e:
            # Keep the original exception chained for debugging.
            raise ValueError(f"Failed to estimate augmented model: {e}") from e

        # Coefficients on the power terms and their covariance block
        gamma = np.asarray(
            results_aug.params[power_vars].values, dtype=float
        ).ravel()
        vcov_gamma = results_aug.cov_params.loc[power_vars, power_vars].values

        # Wald test: gamma' * Vcov(gamma)^-1 * gamma ~ Chi2(k)
        # where k = number of power terms
        try:
            vcov_inv = np.linalg.inv(vcov_gamma)
        except np.linalg.LinAlgError:
            # Singular covariance block: fall back to the pseudo-inverse.
            vcov_inv = np.linalg.pinv(vcov_gamma)

        wald_stat = float(gamma @ vcov_inv @ gamma)

        # Degrees of freedom
        df_num = len(powers)
        df_denom = results_aug.nobs - results_aug.params.shape[0]

        # Convert to F-statistic
        f_stat = wald_stat / df_num

        # Upper-tail probabilities; sf() keeps precision for tiny p-values
        # where 1 - cdf() would round to zero.
        pvalue = float(stats.f.sf(f_stat, df_num, df_denom))

        # Alternative: chi-squared approximation of the Wald statistic
        pvalue_chi2 = float(stats.chi2.sf(wald_stat, df_num))

        # Metadata
        gamma_dict = {
            name: float(value) for name, value in zip(power_vars, gamma)
        }

        se_gamma = np.sqrt(np.diag(vcov_gamma))
        se_dict = {
            name: float(value) for name, value in zip(power_vars, se_gamma)
        }

        metadata = {
            'powers': powers,
            'gamma_coefficients': gamma_dict,
            'standard_errors': se_dict,
            'wald_statistic': wald_stat,
            'F_statistic': f_stat,
            'df_numerator': df_num,
            'df_denominator': df_denom,
            'pvalue_chi2': pvalue_chi2,
            'augmented_formula': augmented_formula
        }

        result = ValidationTestResult(
            test_name="RESET Test for Specification",
            statistic=f_stat,
            pvalue=pvalue,
            null_hypothesis="Model is correctly specified (linear functional form)",
            alternative_hypothesis="Nonlinear terms needed (specification error)",
            alpha=alpha,
            df=(df_num, df_denom),
            metadata=metadata
        )

        return result

    def _get_data_full(self):
        """
        Get full data including DataFrame, formula, and variable names.

        Returns
        -------
        tuple
            (data, formula, entity_col, time_col, var_names) or
            (None, None, None, None, None) if not available

        Notes
        -----
        ``data`` is a copy of ``results._model.data.data`` and ``var_names``
        lists the right-hand-side terms without the intercept.
        """
        unavailable = (None, None, None, None, None)

        if not hasattr(self.results, '_model'):
            return unavailable

        model = self.results._model

        if not (hasattr(model, 'formula_parser') and hasattr(model, 'data')):
            return unavailable

        # Deliberately best-effort: any failure while reading the model is
        # reported to the caller as "not available" rather than raised.
        try:
            # Get original data
            data = model.data.data.copy()

            # Get entity and time columns
            entity_col = model.data.entity_col
            time_col = model.data.time_col

            # Get formula
            if not hasattr(model, 'formula'):
                return unavailable
            formula = model.formula

            # Extract variable names from formula
            if hasattr(model.formula_parser, 'rhs_terms'):
                var_names = [
                    term for term in model.formula_parser.rhs_terms
                    if term.lower() not in ['intercept', '1']
                ]
            else:
                # Fallback: split "y ~ x1 + x2 + ..." by hand
                rhs = formula.split('~')[1].strip()
                terms = [t.strip() for t in rhs.split('+')]
                var_names = [
                    t for t in terms
                    if t.lower() not in ['1', 'intercept', '']
                ]

            return data, formula, entity_col, time_col, var_names

        except Exception:
            return unavailable
|
|
File without changes
|