panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,621 @@
|
|
|
1
|
+
"""
|
|
2
|
+
System GMM Estimator
|
|
3
|
+
====================
|
|
4
|
+
|
|
5
|
+
Blundell-Bond (1998) System GMM estimator for dynamic panel data models.
|
|
6
|
+
|
|
7
|
+
Classes
|
|
8
|
+
-------
|
|
9
|
+
SystemGMM : Blundell-Bond System GMM estimator
|
|
10
|
+
|
|
11
|
+
References
|
|
12
|
+
----------
|
|
13
|
+
.. [1] Blundell, R., & Bond, S. (1998). "Initial Conditions and Moment
|
|
14
|
+
Restrictions in Dynamic Panel Data Models." Journal of Econometrics,
|
|
15
|
+
87(1), 115-143.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from typing import Union, List, Optional, Dict
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from panelbox.gmm.difference_gmm import DifferenceGMM
|
|
22
|
+
from panelbox.gmm.results import GMMResults
|
|
23
|
+
from panelbox.gmm.instruments import InstrumentSet, InstrumentBuilder
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SystemGMM(DifferenceGMM):
|
|
27
|
+
"""
|
|
28
|
+
Blundell-Bond (1998) System GMM estimator.
|
|
29
|
+
|
|
30
|
+
Combines difference and level equations in a stacked system:
|
|
31
|
+
- Difference equations (instruments: lags of levels)
|
|
32
|
+
- Level equations (instruments: lags of differences)
|
|
33
|
+
|
|
34
|
+
Advantages over Difference GMM:
|
|
35
|
+
- More efficient when series are persistent
|
|
36
|
+
- Better precision for coefficient estimates
|
|
37
|
+
- Additional moment conditions
|
|
38
|
+
|
|
39
|
+
Requires assumption:
|
|
40
|
+
E[Δy_{i,t-1} · η_i] = 0 (initial conditions)
|
|
41
|
+
|
|
42
|
+
Parameters
|
|
43
|
+
----------
|
|
44
|
+
data : pd.DataFrame
|
|
45
|
+
Panel data in long format
|
|
46
|
+
dep_var : str
|
|
47
|
+
Name of dependent variable
|
|
48
|
+
lags : Union[int, List[int]]
|
|
49
|
+
Lags of dependent variable to include
|
|
50
|
+
id_var : str
|
|
51
|
+
Name of cross-sectional identifier (default: 'id')
|
|
52
|
+
time_var : str
|
|
53
|
+
Name of time variable (default: 'year')
|
|
54
|
+
exog_vars : List[str], optional
|
|
55
|
+
List of strictly exogenous variables
|
|
56
|
+
endogenous_vars : List[str], optional
|
|
57
|
+
List of endogenous variables
|
|
58
|
+
predetermined_vars : List[str], optional
|
|
59
|
+
List of predetermined variables
|
|
60
|
+
time_dummies : bool
|
|
61
|
+
Include time dummies (default: True)
|
|
62
|
+
collapse : bool
|
|
63
|
+
Collapse instruments (default: False)
|
|
64
|
+
two_step : bool
|
|
65
|
+
Use two-step GMM (default: True)
|
|
66
|
+
robust : bool
|
|
67
|
+
Use robust variance with Windmeijer correction (default: True)
|
|
68
|
+
gmm_type : str
|
|
69
|
+
GMM type: 'one_step', 'two_step', 'iterative' (default: 'two_step')
|
|
70
|
+
level_instruments : Dict, optional
|
|
71
|
+
Configuration for level equation instruments
|
|
72
|
+
Example: {'max_lags': 1} uses L.D.y as instrument
|
|
73
|
+
|
|
74
|
+
Attributes
|
|
75
|
+
----------
|
|
76
|
+
level_instruments : Dict
|
|
77
|
+
Configuration for level equation instruments
|
|
78
|
+
|
|
79
|
+
Examples
|
|
80
|
+
--------
|
|
81
|
+
**When to use System GMM:**
|
|
82
|
+
|
|
83
|
+
System GMM is preferred over Difference GMM when:
|
|
84
|
+
- Variables are highly persistent (AR coefficient near 1)
|
|
85
|
+
- Lagged levels are weak instruments for differences
|
|
86
|
+
- You want more efficient estimates (smaller standard errors)
|
|
87
|
+
|
|
88
|
+
**Basic System GMM with production function:**
|
|
89
|
+
|
|
90
|
+
>>> import pandas as pd
|
|
91
|
+
>>> from panelbox.gmm import SystemGMM
|
|
92
|
+
>>>
|
|
93
|
+
>>> # Load production data
|
|
94
|
+
>>> data = pd.read_csv('production.csv')
|
|
95
|
+
>>>
|
|
96
|
+
>>> # Estimate System GMM
|
|
97
|
+
>>> model = SystemGMM(
|
|
98
|
+
... data=data,
|
|
99
|
+
... dep_var='output',
|
|
100
|
+
... lags=1, # Include output_{t-1}
|
|
101
|
+
... id_var='firm_id',
|
|
102
|
+
... time_var='year',
|
|
103
|
+
... exog_vars=['capital', 'labor'],
|
|
104
|
+
... collapse=True, # Always recommended
|
|
105
|
+
... two_step=True,
|
|
106
|
+
... robust=True,
|
|
107
|
+
... level_instruments={'max_lags': 1} # Use Δy_{t-1} for level equation
|
|
108
|
+
... )
|
|
109
|
+
>>>
|
|
110
|
+
>>> results = model.fit()
|
|
111
|
+
>>> print(results.summary())
|
|
112
|
+
>>>
|
|
113
|
+
>>> # Check if more efficient than Difference GMM
|
|
114
|
+
>>> print(f"Standard error: {results.std_errors['L1.output']:.4f}")
|
|
115
|
+
|
|
116
|
+
**Comparing Difference vs System GMM:**
|
|
117
|
+
|
|
118
|
+
>>> from panelbox.gmm import DifferenceGMM, SystemGMM
|
|
119
|
+
>>>
|
|
120
|
+
>>> # Estimate both
|
|
121
|
+
>>> diff_gmm = DifferenceGMM(
|
|
122
|
+
... data=data,
|
|
123
|
+
... dep_var='y',
|
|
124
|
+
... lags=1,
|
|
125
|
+
... exog_vars=['x1', 'x2'],
|
|
126
|
+
... collapse=True,
|
|
127
|
+
... two_step=True
|
|
128
|
+
... )
|
|
129
|
+
>>> diff_results = diff_gmm.fit()
|
|
130
|
+
>>>
|
|
131
|
+
>>> sys_gmm = SystemGMM(
|
|
132
|
+
... data=data,
|
|
133
|
+
... dep_var='y',
|
|
134
|
+
... lags=1,
|
|
135
|
+
... exog_vars=['x1', 'x2'],
|
|
136
|
+
... collapse=True,
|
|
137
|
+
... two_step=True,
|
|
138
|
+
... level_instruments={'max_lags': 1}
|
|
139
|
+
... )
|
|
140
|
+
>>> sys_results = sys_gmm.fit()
|
|
141
|
+
>>>
|
|
142
|
+
>>> # Compare efficiency
|
|
143
|
+
>>> coef_name = 'L1.y'
|
|
144
|
+
>>> diff_se = diff_results.std_errors[coef_name]
|
|
145
|
+
>>> sys_se = sys_results.std_errors[coef_name]
|
|
146
|
+
>>> efficiency_gain = (diff_se - sys_se) / diff_se * 100
|
|
147
|
+
>>> print(f"System GMM SE is {efficiency_gain:.1f}% smaller")
|
|
148
|
+
>>>
|
|
149
|
+
>>> # Check if both are valid
|
|
150
|
+
>>> if sys_results.ar2_test.pvalue > 0.10 and sys_results.hansen_j.pvalue > 0.10:
|
|
151
|
+
... print("System GMM preferred (more efficient and valid)")
|
|
152
|
+
|
|
153
|
+
**With custom level instruments:**
|
|
154
|
+
|
|
155
|
+
>>> # Control instrument depth for level equation
|
|
156
|
+
>>> model = SystemGMM(
|
|
157
|
+
... data=data,
|
|
158
|
+
... dep_var='n',
|
|
159
|
+
... lags=1,
|
|
160
|
+
... exog_vars=['w', 'k'],
|
|
161
|
+
... collapse=True,
|
|
162
|
+
... level_instruments={'max_lags': 1}
|
|
163
|
+
... )
|
|
164
|
+
>>> results = model.fit()
|
|
165
|
+
|
|
166
|
+
Notes
|
|
167
|
+
-----
|
|
168
|
+
System combines:
|
|
169
|
+
|
|
170
|
+
Difference equation:
|
|
171
|
+
Δy_{it} = γ Δy_{i,t-1} + β' Δx_{it} + Δε_{it}
|
|
172
|
+
Instruments: lags of levels (y_{i,t-2}, y_{i,t-3}, ...)
|
|
173
|
+
|
|
174
|
+
Level equation:
|
|
175
|
+
y_{it} = γ y_{i,t-1} + β' x_{it} + η_i + ε_{it}
|
|
176
|
+
Instruments: lags of differences (Δy_{i,t-1}, Δy_{i,t-2}, ...)
|
|
177
|
+
|
|
178
|
+
Critical assumption:
|
|
179
|
+
E[Δy_{i,1} · η_i] = 0
|
|
180
|
+
Violated if initial conditions are correlated with fixed effects
|
|
181
|
+
|
|
182
|
+
References
|
|
183
|
+
----------
|
|
184
|
+
Blundell, R., & Bond, S. (1998). Journal of Econometrics, 87(1), 115-143.
|
|
185
|
+
"""
|
|
186
|
+
|
|
187
|
+
def __init__(self,
             data: pd.DataFrame,
             dep_var: str,
             lags: Union[int, List[int]],
             id_var: str = 'id',
             time_var: str = 'year',
             exog_vars: Optional[List[str]] = None,
             endogenous_vars: Optional[List[str]] = None,
             predetermined_vars: Optional[List[str]] = None,
             time_dummies: bool = True,
             collapse: bool = False,
             two_step: bool = True,
             robust: bool = True,
             gmm_type: str = 'two_step',
             level_instruments: Optional[Dict] = None):
    """
    Set up a System GMM model.

    Every argument except ``level_instruments`` is handed, by keyword and
    unchanged, to the parent constructor. ``level_instruments`` is kept on
    the instance; a missing or empty value is replaced by
    ``{'max_lags': 1}``.
    """
    # Options understood by the parent (Difference GMM) constructor.
    parent_options = dict(
        data=data,
        dep_var=dep_var,
        lags=lags,
        id_var=id_var,
        time_var=time_var,
        exog_vars=exog_vars,
        endogenous_vars=endogenous_vars,
        predetermined_vars=predetermined_vars,
        time_dummies=time_dummies,
        collapse=collapse,
        two_step=two_step,
        robust=robust,
        gmm_type=gmm_type,
    )
    super().__init__(**parent_options)

    # Level-equation instrument configuration. Any falsy value (None or an
    # empty dict) falls back to a single lag.
    if level_instruments:
        self.level_instruments = level_instruments
    else:
        self.level_instruments = {'max_lags': 1}
|
|
222
|
+
|
|
223
|
+
def fit(self) -> GMMResults:
    """
    Estimate the System GMM model.

    Returns
    -------
    GMMResults
        Estimation results. The same object is stored on ``self.results``
        and its coefficient Series on ``self.params``.

    Warns
    -----
    UserWarning
        When the number of non-NaN stacked residuals divided by
        ``len(self.data)`` is below 0.30.

    Notes
    -----
    Estimation procedure:
    1. Create difference equations (as in Difference GMM)
    2. Create level equations
    3. Stack equations and instruments
    4. Estimate using stacked system
    5. Compute specification tests including Diff-in-Hansen

    p-values are two-sided and taken from the standard normal
    distribution. They are computed with the survival function instead of
    ``1 - cdf`` so that large |t| statistics do not collapse to a p-value
    of exactly zero through floating-point cancellation.
    """
    # Step 1 & 2: Transform data (both differences and levels)
    y_diff, X_diff, y_level, X_level, ids, times = self._transform_data_system()

    # Step 3: Generate instruments (difference + level)
    # Note: _generate_instruments_system recreates self.instrument_builder
    Z_diff, Z_level = self._generate_instruments_system()

    # Step 4: Stack equations (difference rows first, level rows second)
    y_stacked = np.vstack([y_diff, y_level])
    X_stacked = np.vstack([X_diff, X_level])
    Z_stacked = self._stack_instruments(Z_diff, Z_level)

    # Repeat ids so they line up with the stacked rows
    ids_stacked = np.concatenate([ids, ids])

    # Step 5: Estimate GMM on stacked system
    if self.gmm_type == 'one_step':
        beta, W, residuals = self.estimator.one_step(y_stacked, X_stacked, Z_stacked)
        vcov = self._compute_one_step_vcov(X_stacked, Z_stacked, residuals, W)
        converged = True
    elif self.gmm_type == 'two_step':
        beta, vcov, W, residuals = self.estimator.two_step(
            y_stacked, X_stacked, Z_stacked, robust=self.robust
        )
        converged = True
    else:  # iterative
        beta, vcov, W, converged = self.estimator.iterative(
            y_stacked, X_stacked, Z_stacked
        )
        # The iterative estimator does not return residuals; rebuild them.
        residuals = y_stacked - X_stacked @ beta

    # Ensure beta is 1D for pandas Series
    beta = beta.flatten()

    # Step 6: Compute standard errors and statistics
    std_errors = np.sqrt(np.diag(vcov))
    tvalues = beta / std_errors
    from scipy import stats as scipy_stats
    # sf(x) == 1 - cdf(x) mathematically, but stays accurate in the far tail.
    pvalues = 2 * scipy_stats.norm.sf(np.abs(tvalues))

    # Step 7: Get variable names
    var_names = self._get_variable_names()

    # Step 8: Compute specification tests
    n_params = len(beta)

    # Hansen J-test on full system
    hansen = self.tester.hansen_j_test(
        residuals, Z_stacked, W, n_params
    )

    # Sargan test
    sargan = self.tester.sargan_test(
        residuals, Z_stacked, n_params
    )

    # AR tests (on difference residuals only, i.e. the first block of rows)
    n_diff = len(y_diff)
    residuals_diff_only = residuals[:n_diff]
    ids_diff_only = ids_stacked[:n_diff]  # Use stacked ids, first half

    # Drop rows whose residual is NaN before running the AR tests
    valid_mask_diff = ~np.isnan(residuals_diff_only.flatten())
    resid_diff_clean = residuals_diff_only.flatten()[valid_mask_diff]
    ids_diff_clean = ids_diff_only[valid_mask_diff]

    ar1 = self.tester.arellano_bond_ar_test(
        resid_diff_clean, ids_diff_clean, order=1
    )
    ar2 = self.tester.arellano_bond_ar_test(
        resid_diff_clean, ids_diff_clean, order=2
    )

    # Difference-in-Hansen test for level instruments
    diff_hansen = self._compute_diff_hansen(
        residuals, Z_diff, Z_level, W, n_params
    )

    # Step 9: Create results object
    valid_mask = ~np.isnan(residuals.flatten())
    self.results = GMMResults(
        params=pd.Series(beta, index=var_names),
        std_errors=pd.Series(std_errors, index=var_names),
        tvalues=pd.Series(tvalues, index=var_names),
        pvalues=pd.Series(pvalues, index=var_names),
        nobs=int(np.sum(valid_mask)),
        n_groups=self.instrument_builder.n_groups,
        n_instruments=Z_stacked.shape[1],
        n_params=n_params,
        hansen_j=hansen,
        sargan=sargan,
        ar1_test=ar1,
        ar2_test=ar2,
        diff_hansen=diff_hansen,
        vcov=vcov,
        weight_matrix=W,
        converged=converged,
        two_step=self.two_step,
        windmeijer_corrected=self.robust and self.two_step,
        model_type='system',
        transformation='fd',
        residuals=residuals
    )

    self.params = self.results.params

    # Post-estimation warning for low observation retention
    # NOTE(review): nobs counts rows of the stacked (difference + level)
    # residual vector, while the denominator is len(self.data) - confirm
    # that this ratio is the intended retention measure.
    retention_rate = self.results.nobs / len(self.data)
    if retention_rate < 0.30:
        import warnings
        warnings.warn(
            f"\nLow observation retention: {self.results.nobs}/{len(self.data)} "
            f"({retention_rate*100:.1f}%).\n"
            f"Many observations were dropped due to insufficient valid instruments.\n\n"
            f"Recommendations:\n"
            f" 1. Simplify specification (fewer variables/lags)\n"
            f" 2. Set time_dummies=False (or use linear trend)\n"
            f" 3. Ensure collapse=True (currently: {self.collapse})\n"
            f" 4. Check data for excessive missing values\n"
            f" 5. Consider using DifferenceGMM (more robust for weak instruments)\n\n"
            f"See examples/gmm/unbalanced_panel_guide.py for detailed guidance.",
            UserWarning
        )

    return self.results
|
|
366
|
+
|
|
367
|
+
def _transform_data_system(self) -> tuple:
    """
    Build the difference- and level-equation data for System GMM.

    The differenced pieces come straight from the parent's
    ``_transform_data``. The level pieces are read from a sorted copy of
    ``self.data`` to which lagged dependent-variable columns (and, when
    ``self.time_dummies`` is set, time dummy columns) are added.

    Returns
    -------
    y_diff : np.ndarray
        Differenced dependent variable
    X_diff : np.ndarray
        Differenced regressors
    y_level : np.ndarray
        Level dependent variable, as a single column
    X_level : np.ndarray
        Level regressors, one column per regressor
    ids : np.ndarray
        ID variable
    times : np.ndarray
        Time variable
    """
    # Differenced system is produced by the parent class.
    y_diff, X_diff, ids, times = super()._transform_data()

    # Work on a sorted frame so that per-unit shifts follow time order.
    panel = self.data.sort_values([self.id_var, self.time_var])

    # Lagged dependent variable, shifted within each cross-sectional unit.
    lag_columns = [f'{self.dep_var}_L{lag}' for lag in self.lags]
    for lag, column in zip(self.lags, lag_columns):
        panel[column] = panel.groupby(self.id_var)[self.dep_var].shift(lag)

    # Regressor order: lags, exogenous, endogenous, predetermined.
    regressors = [
        *lag_columns,
        *self.exog_vars,
        *self.endogenous_vars,
        *self.predetermined_vars,
    ]

    # Optional time dummies (first period dropped as the reference).
    if self.time_dummies:
        dummies = pd.get_dummies(panel[self.time_var], prefix='year', drop_first=True)
        for name in dummies.columns:
            panel[name] = dummies[name]
            if name not in regressors:
                regressors.append(name)

    # Pull the level-equation arrays out of the frame.
    level_columns = [panel[name].values for name in regressors]
    X_level = np.column_stack(level_columns)
    y_level = panel[self.dep_var].values.reshape(-1, 1)

    return y_diff, X_diff, y_level, X_level, ids, times
|
|
418
|
+
|
|
419
|
+
def _generate_instruments_system(self) -> tuple:
    """
    Build the instrument sets for both halves of the system.

    Side effects: ``*_diff`` columns are written into ``self.data`` and
    ``self.instrument_builder`` is replaced by a new ``InstrumentBuilder``
    built on the updated data.

    Returns
    -------
    Z_diff : InstrumentSet
        Instruments for difference equations, as returned by
        ``self._generate_instruments()``
    Z_level : InstrumentSet
        Instruments for level equations
    """
    # Difference-equation instruments come first, before self.data and the
    # instrument builder are modified below.
    Z_diff = self._generate_instruments()

    panel = self.data.sort_values([self.id_var, self.time_var]).copy()

    lagged_dep_names = [f'{self.dep_var}_L{lag}' for lag in self.lags]
    other_vars = self.predetermined_vars + self.endogenous_vars

    # FIRST: within-unit first differences, for the lagged dependent
    # variable(s) and then the predetermined/endogenous variables. Names
    # missing from the frame are skipped.
    for source in [*lagged_dep_names, *other_vars]:
        if source in panel.columns:
            diff_column = f'{source}_diff'
            panel[diff_column] = panel.groupby(self.id_var)[source].diff()
            self.data[diff_column] = panel[diff_column]

    # SECOND: rebuild the builder so it sees the new columns.
    self.instrument_builder = InstrumentBuilder(self.data, self.id_var, self.time_var)

    def gmm_level_block(diff_column, first_lag):
        # GMM-style level-equation instruments from one differenced column.
        return self.instrument_builder.create_gmm_style_instruments(
            var=diff_column,
            min_lag=first_lag,
            max_lag=self.level_instruments.get('max_lags', 1),
            equation='level',
            collapse=self.collapse
        )

    # THIRD: assemble the level-equation instrument sets.
    level_sets = []

    # Lagged dependent variable: differences starting at lag 0.
    for name in lagged_dep_names:
        level_sets.append(gmm_level_block(f'{name}_diff', 0))

    # Exogenous variables instrument themselves (contemporaneous only).
    for name in self.exog_vars:
        level_sets.append(
            self.instrument_builder.create_iv_style_instruments(
                var=name,
                min_lag=0,
                max_lag=0,
                equation='level'
            )
        )

    # Predetermined/endogenous variables: differences starting at lag 1.
    for name in other_vars:
        level_sets.append(gmm_level_block(f'{name}_diff', 1))

    if not level_sets:
        # Nothing to instrument with: return a zero-column set.
        Z_level = InstrumentSet(
            Z=np.empty((len(self.data), 0)),
            variable_names=[],
            instrument_names=[],
            equation='level',
            style='mixed',
            collapsed=False
        )
    else:
        Z_level = self.instrument_builder.combine_instruments(*level_sets)

    return Z_diff, Z_level
|
|
508
|
+
|
|
509
|
+
def _stack_instruments(self,
|
|
510
|
+
Z_diff: InstrumentSet,
|
|
511
|
+
Z_level: InstrumentSet) -> np.ndarray:
|
|
512
|
+
"""
|
|
513
|
+
Stack instruments for System GMM.
|
|
514
|
+
|
|
515
|
+
Creates block-diagonal matrix:
|
|
516
|
+
[ Z_diff 0 ]
|
|
517
|
+
[ 0 Z_level ]
|
|
518
|
+
|
|
519
|
+
Parameters
|
|
520
|
+
----------
|
|
521
|
+
Z_diff : InstrumentSet
|
|
522
|
+
Difference equation instruments
|
|
523
|
+
Z_level : InstrumentSet
|
|
524
|
+
Level equation instruments
|
|
525
|
+
|
|
526
|
+
Returns
|
|
527
|
+
-------
|
|
528
|
+
np.ndarray
|
|
529
|
+
Stacked instrument matrix
|
|
530
|
+
"""
|
|
531
|
+
n_obs = Z_diff.n_obs
|
|
532
|
+
|
|
533
|
+
# Create block diagonal matrix
|
|
534
|
+
n_instruments_total = Z_diff.n_instruments + Z_level.n_instruments
|
|
535
|
+
|
|
536
|
+
Z_stacked = np.zeros((2 * n_obs, n_instruments_total))
|
|
537
|
+
|
|
538
|
+
# Fill difference block
|
|
539
|
+
Z_stacked[:n_obs, :Z_diff.n_instruments] = Z_diff.Z
|
|
540
|
+
|
|
541
|
+
# Fill level block
|
|
542
|
+
Z_stacked[n_obs:, Z_diff.n_instruments:] = Z_level.Z
|
|
543
|
+
|
|
544
|
+
return Z_stacked
|
|
545
|
+
|
|
546
|
+
def _compute_diff_hansen(self,
|
|
547
|
+
residuals: np.ndarray,
|
|
548
|
+
Z_diff: InstrumentSet,
|
|
549
|
+
Z_level: InstrumentSet,
|
|
550
|
+
W_full: np.ndarray,
|
|
551
|
+
n_params: int):
|
|
552
|
+
"""
|
|
553
|
+
Compute Difference-in-Hansen test for level instruments.
|
|
554
|
+
|
|
555
|
+
Tests the validity of level equation instruments by comparing
|
|
556
|
+
Hansen J statistics with and without level instruments.
|
|
557
|
+
|
|
558
|
+
Parameters
|
|
559
|
+
----------
|
|
560
|
+
residuals : np.ndarray
|
|
561
|
+
Residuals from full system
|
|
562
|
+
Z_diff : InstrumentSet
|
|
563
|
+
Difference instruments
|
|
564
|
+
Z_level : InstrumentSet
|
|
565
|
+
Level instruments
|
|
566
|
+
W_full : np.ndarray
|
|
567
|
+
Weight matrix from full system
|
|
568
|
+
n_params : int
|
|
569
|
+
Number of parameters
|
|
570
|
+
|
|
571
|
+
Returns
|
|
572
|
+
-------
|
|
573
|
+
TestResult
|
|
574
|
+
Difference-in-Hansen test result
|
|
575
|
+
"""
|
|
576
|
+
# Full system instruments
|
|
577
|
+
Z_full = self._stack_instruments(Z_diff, Z_level)
|
|
578
|
+
|
|
579
|
+
# Subset system (difference only)
|
|
580
|
+
n_obs = Z_diff.n_obs
|
|
581
|
+
Z_subset = np.zeros((2 * n_obs, Z_diff.n_instruments))
|
|
582
|
+
Z_subset[:n_obs, :] = Z_diff.Z
|
|
583
|
+
# Level equations get same instruments as difference (for subset comparison)
|
|
584
|
+
Z_subset[n_obs:, :] = Z_diff.Z
|
|
585
|
+
|
|
586
|
+
# Compute weight matrix for subset
|
|
587
|
+
# (simplified - in practice should re-estimate)
|
|
588
|
+
W_subset = W_full[:Z_diff.n_instruments, :Z_diff.n_instruments]
|
|
589
|
+
|
|
590
|
+
# Compute Difference-in-Hansen test
|
|
591
|
+
diff_hansen = self.tester.difference_in_hansen(
|
|
592
|
+
residuals=residuals,
|
|
593
|
+
Z_full=Z_full,
|
|
594
|
+
Z_subset=Z_subset,
|
|
595
|
+
W_full=W_full,
|
|
596
|
+
W_subset=W_subset,
|
|
597
|
+
n_params=n_params,
|
|
598
|
+
subset_name='level instruments'
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
return diff_hansen
|
|
602
|
+
|
|
603
|
+
def summary(self) -> str:
    """
    Return the summary of the fitted model.

    Returns
    -------
    str
        Output of ``self.results.summary`` titled
        'System GMM (Blundell-Bond)'

    Raises
    ------
    ValueError
        If ``self.results`` is None, i.e. the model has not been fit.
    """
    fitted = self.results
    if fitted is not None:
        return fitted.summary(title='System GMM (Blundell-Bond)')

    raise ValueError("Model has not been fit yet. Call fit() first.")
|
|
616
|
+
|
|
617
|
+
def __repr__(self) -> str:
|
|
618
|
+
"""Representation of the model."""
|
|
619
|
+
status = "fitted" if self.results is not None else "not fitted"
|
|
620
|
+
return (f"SystemGMM(dep_var='{self.dep_var}', lags={self.lags}, "
|
|
621
|
+
f"status='{status}')")
|