panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Baltagi-Wu LBI test for serial correlation in panel data.
|
|
3
|
+
|
|
4
|
+
LBI = Locally Best Invariant
|
|
5
|
+
|
|
6
|
+
References
|
|
7
|
+
----------
|
|
8
|
+
Baltagi, B. H., & Wu, P. X. (1999). Unequally Spaced Panel Data Regressions
|
|
9
|
+
with AR(1) Disturbances. Econometric Theory, 15(6), 814-823.
|
|
10
|
+
|
|
11
|
+
Baltagi, B. H., & Li, Q. (1995). Testing AR(1) Against MA(1) Disturbances
|
|
12
|
+
in an Error Component Model. Journal of Econometrics, 68(1), 133-151.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
from scipy import stats
|
|
18
|
+
|
|
19
|
+
from panelbox.validation.base import ValidationTest, ValidationTestResult
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BaltagiWuTest(ValidationTest):
    """
    Baltagi-Wu LBI test for first-order serial correlation in panel data.

    This test is designed for unbalanced panels and tests for AR(1)
    serial correlation in the idiosyncratic errors.

    H0: No first-order serial correlation (rho = 0)
    H1: AR(1) serial correlation present (rho ≠ 0)

    The test is based on a modified Durbin-Watson statistic that
    accounts for unbalanced panel structure.

    Notes
    -----
    The test statistic is:

        LBI = sum_i sum_t (e_it - e_{i,t-1})² / sum_i sum_t e_it²

    Under H0, LBI ≈ 2 (similar to Durbin-Watson).
    LBI < 2 suggests positive autocorrelation.
    LBI > 2 suggests negative autocorrelation.

    Unlike the standard Durbin-Watson test, the Baltagi-Wu test:
    - Works with unbalanced panels
    - Accounts for heterogeneous time series lengths
    - Provides asymptotic normal distribution under H0

    Examples
    --------
    >>> from panelbox.models.static.fixed_effects import FixedEffects
    >>> fe = FixedEffects("y ~ x1 + x2", data, "entity", "time")
    >>> results = fe.fit()
    >>>
    >>> from panelbox.validation.serial_correlation.baltagi_wu import BaltagiWuTest
    >>> test = BaltagiWuTest(results)
    >>> result = test.run()
    >>> print(result)
    """

    def __init__(self, results: 'PanelResults'):
        """
        Initialize Baltagi-Wu test.

        Parameters
        ----------
        results : PanelResults
            Results from panel model estimation
        """
        super().__init__(results)

    def run(self, alpha: float = 0.05) -> ValidationTestResult:
        """
        Run Baltagi-Wu test for serial correlation.

        Parameters
        ----------
        alpha : float, default=0.05
            Significance level

        Returns
        -------
        ValidationTestResult
            Test results

        Raises
        ------
        ValueError
            If panel has fewer than 2 time periods per entity

        Notes
        -----
        The test uses asymptotic normality of the LBI statistic.
        For large N and T, (LBI - 2) is approximately normally distributed.

        The transformation to a standard normal test statistic uses:
            z = (LBI - 2) / sqrt(var(LBI))

        where var(LBI) is estimated from the panel structure.
        """
        # Get residuals with entity and time structure
        resid_df = self._prepare_residual_data()

        # Sort by entity and time so shift(1) produces the within-entity lag
        resid_df = resid_df.sort_values(['entity', 'time'])

        # Compute lagged residuals within each entity
        resid_df['resid_lag'] = resid_df.groupby('entity')['resid'].shift(1)

        # Compute first differences e_it - e_{i,t-1}
        resid_df['resid_diff'] = resid_df['resid'] - resid_df['resid_lag']

        # Drop missing values (first observation of each entity)
        resid_df_clean = resid_df.dropna(subset=['resid_diff', 'resid_lag'])

        if len(resid_df_clean) == 0:
            raise ValueError(
                "No valid observations after computing differences. "
                "Ensure each entity has at least 2 time periods."
            )

        # Compute LBI statistic: LBI = sum(diff²) / sum(resid²)
        numerator = np.sum(resid_df_clean['resid_diff'] ** 2)
        denominator = np.sum(resid_df['resid'] ** 2)  # Use all residuals

        if denominator == 0:
            raise ValueError("Sum of squared residuals is zero (perfect fit)")

        lbi_stat = numerator / denominator

        # Compute approximate variance of LBI
        # Under H0: E[LBI] ≈ 2
        # Var(LBI) ≈ 4 / (N*T_bar) where T_bar is average time periods
        n_entities = resid_df['entity'].nunique()
        n_obs_total = len(resid_df)
        t_bar = n_obs_total / n_entities

        # More refined variance estimate accounting for unbalanced structure:
        # Var(LBI) ≈ 4 * sum(1/T_i) / N
        entity_counts = resid_df.groupby('entity').size()
        t_i = entity_counts.values

        var_lbi = 4 * np.sum(1 / t_i) / n_entities

        # Standard error
        se_lbi = np.sqrt(var_lbi)

        # Test statistic: (LBI - 2) / SE(LBI); approximately N(0,1) under H0
        if se_lbi == 0:
            raise ValueError("Standard error is zero")

        z_stat = (lbi_stat - 2) / se_lbi

        # Two-sided p-value. Use the survival function instead of
        # 1 - cdf: sf() is numerically accurate for large |z|, where
        # 1 - cdf underflows to exactly 0 due to cancellation.
        pvalue = 2 * stats.norm.sf(abs(z_stat))

        # Metadata: estimate rho from LBI via rho ≈ 1 - LBI/2
        rho_estimate = 1 - lbi_stat / 2

        metadata = {
            'lbi_statistic': float(lbi_stat),
            'z_statistic': float(z_stat),
            'rho_estimate': float(rho_estimate),
            'n_entities': int(n_entities),
            'n_obs_total': int(n_obs_total),
            'n_obs_used': len(resid_df_clean),
            'avg_time_periods': float(t_bar),
            'min_time_periods': int(t_i.min()),
            'max_time_periods': int(t_i.max()),
            'variance_lbi': float(var_lbi),
            'se_lbi': float(se_lbi),
            'interpretation': (
                'LBI < 2: positive autocorrelation, '
                'LBI ≈ 2: no autocorrelation, '
                'LBI > 2: negative autocorrelation'
            )
        }

        result = ValidationTestResult(
            test_name="Baltagi-Wu LBI Test for Serial Correlation",
            statistic=z_stat,
            pvalue=pvalue,
            null_hypothesis="No first-order serial correlation",
            alternative_hypothesis="First-order serial correlation present",
            alpha=alpha,
            df=None,  # Asymptotic test, no df
            metadata=metadata
        )

        return result

    def _prepare_residual_data(self) -> pd.DataFrame:
        """
        Prepare residual data with entity and time identifiers.

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: entity, time, resid

        Raises
        ------
        AttributeError
            If the results object does not expose entity/time indices.
        """
        if hasattr(self.results, 'entity_index') and hasattr(self.results, 'time_index'):
            # Flatten residuals to 1D in case the model stored a column vector
            resid_flat = self.resid.ravel() if hasattr(self.resid, 'ravel') else self.resid

            resid_df = pd.DataFrame({
                'entity': self.results.entity_index,
                'time': self.results.time_index,
                'resid': resid_flat
            })

            return resid_df
        else:
            raise AttributeError(
                "Results object must have 'entity_index' and 'time_index' attributes"
            )
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Breusch-Godfrey LM test for serial correlation in panel data.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Breusch, T. S. (1978). Testing for autocorrelation in dynamic linear models.
|
|
7
|
+
Australian Economic Papers, 17(31), 334-355.
|
|
8
|
+
|
|
9
|
+
Godfrey, L. G. (1978). Testing against general autoregressive and moving average
|
|
10
|
+
error models when the regressors include lagged dependent variables.
|
|
11
|
+
Econometrica, 46(6), 1293-1301.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
from scipy import stats
|
|
17
|
+
|
|
18
|
+
from panelbox.validation.base import ValidationTest, ValidationTestResult
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BreuschGodfreyTest(ValidationTest):
    """
    Breusch-Godfrey LM test for serial correlation.

    Tests the null hypothesis of no serial correlation against the
    alternative of AR(p) serial correlation in the errors.

    H0: No serial correlation
    H1: AR(p) serial correlation present

    The test regresses the residuals on lagged residuals and the original
    regressors, then tests if the lagged residuals are jointly significant.

    Notes
    -----
    Unlike the Durbin-Watson test, the BG test:
    - Can test for higher-order serial correlation
    - Is valid when regressors include lagged dependent variables
    - Provides an LM test statistic ~ Chi2(p)

    For panel data, the test is applied accounting for the panel structure.

    Examples
    --------
    >>> from panelbox.models.static.fixed_effects import FixedEffects
    >>> fe = FixedEffects("y ~ x1 + x2", data, "entity", "time")
    >>> results = fe.fit()
    >>>
    >>> from panelbox.validation.serial_correlation.breusch_godfrey import BreuschGodfreyTest
    >>> test = BreuschGodfreyTest(results)
    >>> result = test.run(lags=1)  # Test for AR(1)
    >>> print(result)
    """

    def __init__(self, results: 'PanelResults'):
        """
        Initialize Breusch-Godfrey test.

        Parameters
        ----------
        results : PanelResults
            Results from panel model estimation
        """
        super().__init__(results)

    def run(self, lags: int = 1, alpha: float = 0.05) -> ValidationTestResult:
        """
        Run Breusch-Godfrey LM test for serial correlation.

        Parameters
        ----------
        lags : int, default=1
            Number of lags to test (order of AR process)
        alpha : float, default=0.05
            Significance level

        Returns
        -------
        ValidationTestResult
            Test results

        Raises
        ------
        ValueError
            If required data is not available or lags < 1

        Notes
        -----
        The test procedure:
        1. Obtain residuals from original model
        2. Regress residuals on lagged residuals (up to lag p) and original X
        3. Compute LM = n*R² from this auxiliary regression
        4. Compare to Chi2(p) distribution
        """
        if lags < 1:
            raise ValueError(f"lags must be >= 1, got {lags}")

        # Get residuals with entity and time structure
        resid_df = self._prepare_residual_data()

        # Get design matrix
        X = self._get_design_matrix()

        if X is None:
            raise ValueError(
                "Design matrix not available for Breusch-Godfrey test"
            )

        # Create lagged residuals within each entity
        resid_df = resid_df.sort_values(['entity', 'time'])

        for lag in range(1, lags + 1):
            resid_df[f'resid_lag{lag}'] = resid_df.groupby('entity')['resid'].shift(lag)

        # Drop missing values (first `lags` obs per entity)
        lag_cols = [f'resid_lag{i}' for i in range(1, lags + 1)]
        resid_df = resid_df.dropna(subset=lag_cols)

        if len(resid_df) == 0:
            raise ValueError("No valid observations after creating lags")

        # Get residuals and lagged residuals as arrays
        resid = resid_df['resid'].values
        X_lags = resid_df[lag_cols].values

        # Align X with the reduced (post-dropna) residual sample.
        # NOTE(review): this alignment is heuristic — it assumes resid_df's
        # index corresponds to row positions in X. Verify against how the
        # model builds its design matrix before relying on edge cases.
        if len(X) == len(resid) + lags * resid_df['entity'].nunique():
            # X covers the full sample: select the rows that survived dropna
            valid_indices = resid_df.index.values

            if max(valid_indices) < len(X):
                X_matched = X[valid_indices, :]
            else:
                # Fallback: use last len(resid) rows
                X_matched = X[-len(resid):, :]
        else:
            # Assume X and resid are aligned
            X_matched = X[:len(resid), :]

        # Auxiliary regression: resid on [X, resid_lag1, ..., resid_lagp]
        X_aug = np.column_stack([X_matched, X_lags])

        # OLS via normal equations; fall back to least squares if singular
        try:
            XtX = X_aug.T @ X_aug
            Xty = X_aug.T @ resid
            beta_aux = np.linalg.solve(XtX, Xty)
        except np.linalg.LinAlgError:
            beta_aux = np.linalg.lstsq(X_aug, resid, rcond=None)[0]

        # Fitted values
        fitted_aux = X_aug @ beta_aux

        # R² from auxiliary regression using explained sum of squares
        # (more numerically stable than 1 - SSR/SST here)
        mean_resid = np.mean(resid)
        SST = np.sum((resid - mean_resid) ** 2)
        SSE = np.sum((fitted_aux - mean_resid) ** 2)

        if SST > 0:
            R2_aux = SSE / SST
        else:
            R2_aux = 0.0

        # Ensure R² is in [0, 1]
        R2_aux = np.clip(R2_aux, 0.0, 1.0)

        # LM statistic for panel data
        # IMPORTANT: For panel data, we use the number of CROSS-SECTIONAL UNITS (N)
        # not the total number of observations (N*T) or the reduced sample size.
        #
        # The Breusch-Godfrey test for panels (pbgtest in plm) uses:
        #     LM = N * R²
        # where N is the number of cross-sectional units.
        #
        # This is different from the time-series version which uses n = T observations.
        #
        # Reference: Baltagi & Li (1995), "Testing AR(1) against MA(1) disturbances
        # in an error component model"
        n_entities = resid_df['entity'].nunique()
        lm_stat = n_entities * R2_aux

        # Sanity check
        if lm_stat < 0:
            lm_stat = 0.0

        # Degrees of freedom = number of lags
        df = lags

        # P-value. Use the chi-squared survival function rather than
        # 1 - cdf: sf() remains accurate in the far right tail where
        # 1 - cdf cancels to 0.
        pvalue = stats.chi2.sf(lm_stat, df)

        # Metadata
        n_obs = len(resid)
        metadata = {
            'lags': lags,
            'R2_auxiliary': R2_aux,
            'n_obs_auxiliary': n_obs,
            'n_entities': n_entities,
            'note': 'Panel BG test uses LM = N * R² where N = number of entities'
        }

        result = ValidationTestResult(
            test_name=f"Breusch-Godfrey LM Test for Serial Correlation (AR({lags}))",
            statistic=lm_stat,
            pvalue=pvalue,
            null_hypothesis="No serial correlation",
            alternative_hypothesis=f"AR({lags}) serial correlation present",
            alpha=alpha,
            df=df,
            metadata=metadata
        )

        return result

    def _prepare_residual_data(self) -> pd.DataFrame:
        """
        Prepare residual data with entity and time identifiers.

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: entity, time, resid

        Raises
        ------
        AttributeError
            If the results object does not expose entity/time indices.
        """
        if hasattr(self.results, 'entity_index') and hasattr(self.results, 'time_index'):
            # Flatten residuals to 1D in case the model stored a column vector
            resid_flat = self.resid.ravel() if hasattr(self.resid, 'ravel') else self.resid

            resid_df = pd.DataFrame({
                'entity': self.results.entity_index,
                'time': self.results.time_index,
                'resid': resid_flat
            })

            return resid_df
        else:
            raise AttributeError(
                "Results object must have 'entity_index' and 'time_index' attributes"
            )

    def _get_design_matrix(self) -> np.ndarray:
        """
        Get the design matrix X from the underlying model, or None.

        Best-effort: returns None (rather than raising) when the results
        object has no model reference or the rebuild fails; run() converts
        None into a ValueError with a clear message.
        """
        if not hasattr(self.results, '_model'):
            return None

        model = self.results._model

        if hasattr(model, 'formula_parser') and hasattr(model, 'data'):
            try:
                _, X = model.formula_parser.build_design_matrices(
                    model.data.data,
                    return_type='array'
                )
                return X
            except Exception:
                # Deliberate best-effort: any rebuild failure falls through
                # to the None return below and is reported by run().
                pass

        return None
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wooldridge test for autocorrelation in panel data.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Wooldridge, J. M. (2002). Econometric Analysis of Cross Section and Panel Data.
|
|
7
|
+
MIT Press, Section 10.4.1.
|
|
8
|
+
|
|
9
|
+
Stata command: xtserial
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from scipy import stats
|
|
15
|
+
|
|
16
|
+
from panelbox.validation.base import ValidationTest, ValidationTestResult
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class WooldridgeARTest(ValidationTest):
    """
    Wooldridge test for first-order autocorrelation in panel data.

    This test is specifically designed for fixed effects models and tests
    for AR(1) autocorrelation in the idiosyncratic errors.

    The test is based on regressing the first-differenced residuals on their
    own lag and testing if the coefficient equals -0.5 (which is the value
    under H0 of no serial correlation).

    Notes
    -----
    The test statistic is approximately distributed as F(1, N-1) under the null
    of no first-order serial correlation.

    This test requires at least T >= 3 time periods.

    Examples
    --------
    >>> from panelbox.models.static.fixed_effects import FixedEffects
    >>> fe = FixedEffects("y ~ x1 + x2", data, "entity", "time")
    >>> results = fe.fit()
    >>>
    >>> from panelbox.validation.serial_correlation.wooldridge_ar import WooldridgeARTest
    >>> test = WooldridgeARTest(results)
    >>> result = test.run()
    >>> print(result)
    """

    def __init__(self, results: 'PanelResults'):
        """
        Initialize Wooldridge AR test.

        Parameters
        ----------
        results : PanelResults
            Results from panel model estimation (preferably Fixed Effects)
        """
        super().__init__(results)

        # Warn (don't fail) when applied outside its intended model class
        if 'Fixed Effects' not in self.model_type:
            import warnings
            warnings.warn(
                "Wooldridge test is designed for Fixed Effects models. "
                f"Current model: {self.model_type}"
            )

    def run(self, alpha: float = 0.05) -> ValidationTestResult:
        """
        Run Wooldridge test for AR(1) autocorrelation.

        Parameters
        ----------
        alpha : float, default=0.05
            Significance level

        Returns
        -------
        ValidationTestResult
            Test results

        Raises
        ------
        ValueError
            If panel has fewer than 3 time periods, or the lagged
            differenced residuals have no variation
        """
        # Get residuals as DataFrame with entity and time info
        resid_df = self._prepare_residual_data()

        # Check minimum time periods (differencing + lagging loses 2 obs)
        min_T = resid_df.groupby('entity').size().min()
        if min_T < 3:
            raise ValueError(
                f"Wooldridge test requires at least 3 time periods. "
                f"Minimum found: {min_T}"
            )

        # Compute first differences of residuals within each entity
        resid_df = resid_df.sort_values(['entity', 'time'])
        resid_df['resid_diff'] = resid_df.groupby('entity')['resid'].diff()
        resid_df['resid_diff_lag'] = resid_df.groupby('entity')['resid_diff'].shift(1)

        # Drop missing values (first two obs per entity are lost)
        resid_df = resid_df.dropna(subset=['resid_diff', 'resid_diff_lag'])

        if len(resid_df) == 0:
            raise ValueError("No valid observations after differencing")

        # Regression: Δe_it on Δe_{i,t-1} (no intercept)
        y = resid_df['resid_diff'].values
        X = resid_df['resid_diff_lag'].values

        # OLS slope through the origin; guard against a degenerate
        # regressor (previously this produced a silent NaN via 0/0)
        n = len(y)
        sxx = np.sum(X * X)
        if sxx == 0:
            raise ValueError(
                "Lagged differenced residuals have zero variation; "
                "cannot run Wooldridge test"
            )
        beta = np.sum(X * y) / sxx

        # Residuals of the auxiliary regression
        fitted = beta * X
        resid_reg = y - fitted

        # Standard error of beta (plain OLS).
        # NOTE(review): Stata's xtserial uses a cluster-robust SE by
        # entity here — confirm whether that refinement is wanted.
        s2 = np.sum(resid_reg ** 2) / (n - 1)
        se_beta = np.sqrt(s2 / sxx)

        # Test H0: beta = -0.5 (no serial correlation)
        # Under H0, E[Δe_it * Δe_{i,t-1}] = -sigma²/2, so the slope is -0.5
        t_stat = (beta - (-0.5)) / se_beta

        # F statistic (F = t²)
        f_stat = t_stat ** 2

        # P-value from F(1, N-1) where N = number of entities
        n_entities = resid_df['entity'].nunique()
        df_num = 1
        df_denom = n_entities - 1

        # Use the survival function instead of 1 - cdf: sf() stays
        # accurate for large F, where 1 - cdf cancels to exactly 0.
        pvalue = stats.f.sf(f_stat, df_num, df_denom)

        # Metadata
        metadata = {
            'coefficient': beta,
            'std_error': se_beta,
            't_statistic': t_stat,
            'n_entities': n_entities,
            'n_obs_used': n
        }

        result = ValidationTestResult(
            test_name="Wooldridge Test for Autocorrelation",
            statistic=f_stat,
            pvalue=pvalue,
            null_hypothesis="No first-order autocorrelation",
            alternative_hypothesis="First-order autocorrelation present",
            alpha=alpha,
            df=(df_num, df_denom),
            metadata=metadata
        )

        return result

    def _prepare_residual_data(self) -> pd.DataFrame:
        """
        Prepare residual data with entity and time identifiers.

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: entity, time, resid

        Raises
        ------
        AttributeError
            If the results object does not expose entity/time indices.
        """
        # Requires that the model stored entity/time indices on the results
        if hasattr(self.results, 'entity_index') and hasattr(self.results, 'time_index'):
            # Ensure resid is 1D
            resid_flat = self.resid.ravel() if hasattr(self.resid, 'ravel') else self.resid

            resid_df = pd.DataFrame({
                'entity': self.results.entity_index,
                'time': self.results.time_index,
                'resid': resid_flat
            })
        else:
            raise AttributeError(
                "Results object must have 'entity_index' and 'time_index' attributes. "
                "Please ensure your model stores these during estimation."
            )

        return resid_df
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Specification tests for panel models.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from panelbox.validation.specification.hausman import HausmanTest, HausmanTestResult
|
|
6
|
+
from panelbox.validation.specification.mundlak import MundlakTest
|
|
7
|
+
from panelbox.validation.specification.reset import RESETTest
|
|
8
|
+
from panelbox.validation.specification.chow import ChowTest
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
'HausmanTest',
|
|
12
|
+
'HausmanTestResult',
|
|
13
|
+
'MundlakTest',
|
|
14
|
+
'RESETTest',
|
|
15
|
+
'ChowTest',
|
|
16
|
+
]
|