panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Difference GMM Estimator
|
|
3
|
+
=========================
|
|
4
|
+
|
|
5
|
+
Arellano-Bond (1991) Difference GMM estimator for dynamic panel data models.
|
|
6
|
+
|
|
7
|
+
Classes
|
|
8
|
+
-------
|
|
9
|
+
DifferenceGMM : Arellano-Bond Difference GMM estimator
|
|
10
|
+
|
|
11
|
+
References
|
|
12
|
+
----------
|
|
13
|
+
.. [1] Arellano, M., & Bond, S. (1991). "Some Tests of Specification for Panel
|
|
14
|
+
Data: Monte Carlo Evidence and an Application to Employment Equations."
|
|
15
|
+
Review of Economic Studies, 58(2), 277-297.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from typing import Union, List, Optional, Dict
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from panelbox.gmm.results import GMMResults, TestResult
|
|
22
|
+
from panelbox.gmm.instruments import InstrumentBuilder, InstrumentSet
|
|
23
|
+
from panelbox.gmm.estimator import GMMEstimator
|
|
24
|
+
from panelbox.gmm.tests import GMMTests
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DifferenceGMM:
    """
    Arellano-Bond (1991) Difference GMM estimator.

    Eliminates fixed effects through first-differencing and uses lagged
    levels as instruments for the differenced equation.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data in long format
    dep_var : str
        Name of dependent variable
    lags : Union[int, List[int]]
        Lags of dependent variable to include (e.g., 1 or [1, 2])
    id_var : str
        Name of cross-sectional identifier (default: 'id')
    time_var : str
        Name of time variable (default: 'year')
    exog_vars : List[str], optional
        List of strictly exogenous variables
    endogenous_vars : List[str], optional
        List of endogenous variables (excluding lagged dependent)
    predetermined_vars : List[str], optional
        List of predetermined variables
    time_dummies : bool
        Include time dummies (default: True)
    collapse : bool
        Collapse instruments to avoid proliferation (default: False)
    two_step : bool
        Use two-step GMM (default: True)
    robust : bool
        Use robust variance matrix with Windmeijer correction (default: True)
    gmm_type : str
        GMM estimation type: 'one_step', 'two_step', or 'iterative' (default: 'two_step')

    Attributes
    ----------
    data : pd.DataFrame
        Panel data
    params : pd.Series
        Estimated coefficients (after fitting)
    results : GMMResults
        Full results object (after fitting)

    Examples
    --------
    **Basic example with employment data:**

    >>> import pandas as pd
    >>> from panelbox.gmm import DifferenceGMM
    >>>
    >>> # Load panel data (firms over time)
    >>> data = pd.read_csv('panel_data.csv')
    >>>
    >>> # Estimate Difference GMM
    >>> model = DifferenceGMM(
    ...     data=data,
    ...     dep_var='employment',
    ...     lags=1,  # Include employment_{t-1}
    ...     id_var='firm_id',
    ...     time_var='year',
    ...     exog_vars=['wages', 'capital'],
    ...     time_dummies=True,
    ...     collapse=True,   # Recommended to avoid instrument proliferation
    ...     two_step=True,   # Two-step with Windmeijer correction
    ...     robust=True      # Robust standard errors
    ... )
    >>>
    >>> # Fit and get results
    >>> results = model.fit()
    >>> print(results.summary())
    >>>
    >>> # Access coefficients
    >>> print(f"Persistence: {results.params['L1.employment']:.3f}")
    >>> print(f"Wage effect: {results.params['wages']:.3f}")

    **Interpreting diagnostic tests:**

    >>> # Check if estimation is valid
    >>> if results.ar2_test.pvalue > 0.10:
    ...     print("✓ Moment conditions valid")
    ...
    >>> if 0.10 < results.hansen_j.pvalue < 0.25:
    ...     print("✓ Instruments appear valid")
    ...
    >>> if results.instrument_ratio < 1.0:
    ...     print("✓ Instrument count appropriate")

    **With predetermined and endogenous variables:**

    >>> # Some variables may not be strictly exogenous
    >>> model = DifferenceGMM(
    ...     data=data,
    ...     dep_var='output',
    ...     lags=1,
    ...     exog_vars=['policy_var'],          # Strictly exogenous
    ...     predetermined_vars=['capital'],    # Instruments: t-2 and earlier
    ...     endogenous_vars=['labor'],         # Instruments: t-3 and earlier
    ...     collapse=True,
    ...     two_step=True
    ... )
    >>> results = model.fit()

    **For unbalanced panels:**

    >>> # Always use collapse=True and avoid many time dummies
    >>> model = DifferenceGMM(
    ...     data=unbalanced_data,
    ...     dep_var='y',
    ...     lags=1,
    ...     exog_vars=['x1', 'x2'],
    ...     time_dummies=False,  # Or use linear trend
    ...     collapse=True,       # Essential for unbalanced panels
    ...     two_step=True
    ... )
    >>> results = model.fit()
    >>> print(f"Retained {results.nobs}/{len(unbalanced_data)} observations")

    Notes
    -----
    Transformation: First-differences to eliminate fixed effects
        Δy_{it} = γ Δy_{i,t-1} + β' Δx_{it} + Δε_{it}

    Instruments: Lags of levels for differenced equations
    - Strictly exogenous: all lags and leads
    - Predetermined: lags t-2 and earlier
    - Endogenous: lags t-3 and earlier

    References
    ----------
    Arellano, M., & Bond, S. (1991). Review of Economic Studies, 58(2), 277-297.
    """
|
|
160
|
+
|
|
161
|
+
def __init__(self,
|
|
162
|
+
data: pd.DataFrame,
|
|
163
|
+
dep_var: str,
|
|
164
|
+
lags: Union[int, List[int]],
|
|
165
|
+
id_var: str = 'id',
|
|
166
|
+
time_var: str = 'year',
|
|
167
|
+
exog_vars: Optional[List[str]] = None,
|
|
168
|
+
endogenous_vars: Optional[List[str]] = None,
|
|
169
|
+
predetermined_vars: Optional[List[str]] = None,
|
|
170
|
+
time_dummies: bool = True,
|
|
171
|
+
collapse: bool = False,
|
|
172
|
+
two_step: bool = True,
|
|
173
|
+
robust: bool = True,
|
|
174
|
+
gmm_type: str = 'two_step'):
|
|
175
|
+
"""Initialize Difference GMM model."""
|
|
176
|
+
self.data = data.copy()
|
|
177
|
+
self.dep_var = dep_var
|
|
178
|
+
self.lags = [lags] if isinstance(lags, int) else lags
|
|
179
|
+
self.id_var = id_var
|
|
180
|
+
self.time_var = time_var
|
|
181
|
+
self.exog_vars = exog_vars or []
|
|
182
|
+
self.endogenous_vars = endogenous_vars or []
|
|
183
|
+
self.predetermined_vars = predetermined_vars or []
|
|
184
|
+
self.time_dummies = time_dummies
|
|
185
|
+
self.collapse = collapse
|
|
186
|
+
self.two_step = two_step
|
|
187
|
+
self.robust = robust
|
|
188
|
+
self.gmm_type = gmm_type
|
|
189
|
+
|
|
190
|
+
# Initialize components
|
|
191
|
+
self.instrument_builder = InstrumentBuilder(data, id_var, time_var)
|
|
192
|
+
self.estimator = GMMEstimator()
|
|
193
|
+
self.tester = GMMTests()
|
|
194
|
+
|
|
195
|
+
# Results (populated after fit)
|
|
196
|
+
self.results = None
|
|
197
|
+
self.params = None
|
|
198
|
+
|
|
199
|
+
# Validate inputs
|
|
200
|
+
self._validate_inputs()
|
|
201
|
+
|
|
202
|
+
def _validate_inputs(self):
|
|
203
|
+
"""Validate model inputs."""
|
|
204
|
+
import warnings
|
|
205
|
+
|
|
206
|
+
# Check dep_var exists
|
|
207
|
+
if self.dep_var not in self.data.columns:
|
|
208
|
+
raise ValueError(f"Dependent variable '{self.dep_var}' not found in data")
|
|
209
|
+
|
|
210
|
+
# Check id_var and time_var exist
|
|
211
|
+
if self.id_var not in self.data.columns:
|
|
212
|
+
raise ValueError(f"ID variable '{self.id_var}' not found in data")
|
|
213
|
+
if self.time_var not in self.data.columns:
|
|
214
|
+
raise ValueError(f"Time variable '{self.time_var}' not found in data")
|
|
215
|
+
|
|
216
|
+
# Check exogenous variables exist
|
|
217
|
+
for var in self.exog_vars + self.endogenous_vars + self.predetermined_vars:
|
|
218
|
+
if var not in self.data.columns:
|
|
219
|
+
raise ValueError(f"Variable '{var}' not found in data")
|
|
220
|
+
|
|
221
|
+
# Check gmm_type is valid
|
|
222
|
+
valid_types = ['one_step', 'two_step', 'iterative']
|
|
223
|
+
if self.gmm_type not in valid_types:
|
|
224
|
+
raise ValueError(f"gmm_type must be one of {valid_types}")
|
|
225
|
+
|
|
226
|
+
# If gmm_type is specified, override two_step flag
|
|
227
|
+
if self.gmm_type == 'one_step':
|
|
228
|
+
self.two_step = False
|
|
229
|
+
elif self.gmm_type == 'two_step':
|
|
230
|
+
self.two_step = True
|
|
231
|
+
|
|
232
|
+
# Check for unbalanced panel + time dummies issue
|
|
233
|
+
if self.time_dummies:
|
|
234
|
+
is_unbalanced, balance_rate = self._check_panel_balance()
|
|
235
|
+
if is_unbalanced:
|
|
236
|
+
n_time_periods = self.data[self.time_var].nunique()
|
|
237
|
+
n_dummies = n_time_periods - 1
|
|
238
|
+
|
|
239
|
+
if n_dummies >= 5 and balance_rate < 0.80:
|
|
240
|
+
warnings.warn(
|
|
241
|
+
f"\nUnbalanced panel detected ({balance_rate*100:.0f}% balanced) with "
|
|
242
|
+
f"{n_dummies} time dummies.\n"
|
|
243
|
+
f"This may result in very few observations being retained.\n\n"
|
|
244
|
+
f"Recommendations:\n"
|
|
245
|
+
f" 1. Set time_dummies=False and add a linear trend\n"
|
|
246
|
+
f" 2. Use only subset of key time dummies\n"
|
|
247
|
+
f" 3. Ensure collapse=True (currently: {self.collapse})\n\n"
|
|
248
|
+
f"See examples/gmm/unbalanced_panel_guide.py for details.",
|
|
249
|
+
UserWarning
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
# Check collapse recommendation
|
|
253
|
+
if not self.collapse:
|
|
254
|
+
warnings.warn(
|
|
255
|
+
"\nRecommendation: Set collapse=True to avoid instrument proliferation.\n"
|
|
256
|
+
"This is especially important for unbalanced panels.",
|
|
257
|
+
UserWarning
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
def _check_panel_balance(self):
|
|
261
|
+
"""
|
|
262
|
+
Check if panel data is balanced.
|
|
263
|
+
|
|
264
|
+
Returns
|
|
265
|
+
-------
|
|
266
|
+
tuple
|
|
267
|
+
(is_unbalanced: bool, balance_rate: float)
|
|
268
|
+
"""
|
|
269
|
+
obs_per_unit = self.data.groupby(self.id_var).size()
|
|
270
|
+
max_periods = obs_per_unit.max()
|
|
271
|
+
|
|
272
|
+
# Panel is balanced if all units have same number of periods
|
|
273
|
+
is_balanced = (obs_per_unit == max_periods).all()
|
|
274
|
+
|
|
275
|
+
# Balance rate: proportion of units with max periods
|
|
276
|
+
balance_rate = (obs_per_unit == max_periods).mean()
|
|
277
|
+
|
|
278
|
+
return not is_balanced, balance_rate
|
|
279
|
+
|
|
280
|
+
def fit(self) -> GMMResults:
|
|
281
|
+
"""
|
|
282
|
+
Estimate the Difference GMM model.
|
|
283
|
+
|
|
284
|
+
Returns
|
|
285
|
+
-------
|
|
286
|
+
GMMResults
|
|
287
|
+
Estimation results including coefficients, tests, and diagnostics
|
|
288
|
+
|
|
289
|
+
Raises
|
|
290
|
+
------
|
|
291
|
+
ValueError
|
|
292
|
+
If model specification is invalid
|
|
293
|
+
RuntimeError
|
|
294
|
+
If estimation fails
|
|
295
|
+
|
|
296
|
+
Notes
|
|
297
|
+
-----
|
|
298
|
+
Estimation procedure:
|
|
299
|
+
1. Transform data to first-differences
|
|
300
|
+
2. Generate instruments (lags of levels)
|
|
301
|
+
3. Estimate GMM (one-step, two-step, or iterative)
|
|
302
|
+
4. Compute specification tests
|
|
303
|
+
5. Return results object
|
|
304
|
+
"""
|
|
305
|
+
# Step 1: Transform data
|
|
306
|
+
y_diff, X_diff, ids, times = self._transform_data()
|
|
307
|
+
|
|
308
|
+
# Step 1.5: Recreate InstrumentBuilder with updated data (includes lagged vars)
|
|
309
|
+
self.instrument_builder = InstrumentBuilder(self.data, self.id_var, self.time_var)
|
|
310
|
+
|
|
311
|
+
# Step 2: Generate instruments
|
|
312
|
+
Z = self._generate_instruments()
|
|
313
|
+
|
|
314
|
+
# Step 2.5: Pre-clean instruments for unbalanced panels
|
|
315
|
+
# Remove instrument columns that have excessive NaNs
|
|
316
|
+
Z_matrix = Z.Z.copy()
|
|
317
|
+
|
|
318
|
+
# First, remove columns that are all NaN
|
|
319
|
+
not_all_nan = ~np.isnan(Z_matrix).all(axis=0)
|
|
320
|
+
Z_matrix = Z_matrix[:, not_all_nan]
|
|
321
|
+
|
|
322
|
+
# Then, remove columns with >90% NaN (too few valid observations)
|
|
323
|
+
nan_fraction = np.isnan(Z_matrix).mean(axis=0)
|
|
324
|
+
mostly_valid = nan_fraction < 0.9
|
|
325
|
+
Z_matrix = Z_matrix[:, mostly_valid]
|
|
326
|
+
|
|
327
|
+
# Finally, replace any remaining NaNs with 0
|
|
328
|
+
# This is reasonable: NaN means instrument not available, contributes 0 to moment conditions
|
|
329
|
+
Z_matrix = np.nan_to_num(Z_matrix, nan=0.0)
|
|
330
|
+
|
|
331
|
+
# Step 3: Estimate GMM
|
|
332
|
+
if self.gmm_type == 'one_step':
|
|
333
|
+
beta, W, residuals = self.estimator.one_step(y_diff, X_diff, Z_matrix)
|
|
334
|
+
vcov = self._compute_one_step_vcov(X_diff, Z_matrix, residuals, W)
|
|
335
|
+
converged = True
|
|
336
|
+
elif self.gmm_type == 'two_step':
|
|
337
|
+
beta, vcov, W, residuals = self.estimator.two_step(
|
|
338
|
+
y_diff, X_diff, Z_matrix, robust=self.robust
|
|
339
|
+
)
|
|
340
|
+
converged = True
|
|
341
|
+
else: # iterative
|
|
342
|
+
beta, vcov, W, converged = self.estimator.iterative(
|
|
343
|
+
y_diff, X_diff, Z_matrix
|
|
344
|
+
)
|
|
345
|
+
residuals = y_diff - X_diff @ beta
|
|
346
|
+
|
|
347
|
+
# Step 4: Compute standard errors and t-statistics
|
|
348
|
+
beta = beta.flatten() # Ensure beta is 1D
|
|
349
|
+
std_errors = np.sqrt(np.diag(vcov))
|
|
350
|
+
tvalues = beta / std_errors
|
|
351
|
+
from scipy import stats
|
|
352
|
+
pvalues = 2 * (1 - stats.norm.cdf(np.abs(tvalues)))
|
|
353
|
+
|
|
354
|
+
# Step 5: Get variable names
|
|
355
|
+
var_names = self._get_variable_names()
|
|
356
|
+
|
|
357
|
+
# Step 6: Compute specification tests
|
|
358
|
+
hansen = self.tester.hansen_j_test(
|
|
359
|
+
residuals, Z_matrix, W, len(beta)
|
|
360
|
+
)
|
|
361
|
+
sargan = self.tester.sargan_test(
|
|
362
|
+
residuals, Z_matrix, len(beta)
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
# For AR tests, we need clean data without NaN
|
|
366
|
+
residuals_flat = residuals.flatten() if residuals.ndim > 1 else residuals
|
|
367
|
+
valid_mask = ~np.isnan(residuals_flat)
|
|
368
|
+
ar1 = self.tester.arellano_bond_ar_test(
|
|
369
|
+
residuals_flat[valid_mask], ids[valid_mask], order=1
|
|
370
|
+
)
|
|
371
|
+
ar2 = self.tester.arellano_bond_ar_test(
|
|
372
|
+
residuals_flat[valid_mask], ids[valid_mask], order=2
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
# Step 7: Create results object
|
|
376
|
+
self.results = GMMResults(
|
|
377
|
+
params=pd.Series(beta, index=var_names),
|
|
378
|
+
std_errors=pd.Series(std_errors, index=var_names),
|
|
379
|
+
tvalues=pd.Series(tvalues, index=var_names),
|
|
380
|
+
pvalues=pd.Series(pvalues, index=var_names),
|
|
381
|
+
nobs=int(np.sum(valid_mask)),
|
|
382
|
+
n_groups=self.instrument_builder.n_groups,
|
|
383
|
+
n_instruments=Z_matrix.shape[1], # Use actual number of instruments after cleaning
|
|
384
|
+
n_params=len(beta),
|
|
385
|
+
hansen_j=hansen,
|
|
386
|
+
sargan=sargan,
|
|
387
|
+
ar1_test=ar1,
|
|
388
|
+
ar2_test=ar2,
|
|
389
|
+
vcov=vcov,
|
|
390
|
+
weight_matrix=W,
|
|
391
|
+
converged=converged,
|
|
392
|
+
two_step=self.two_step,
|
|
393
|
+
windmeijer_corrected=self.robust and self.two_step,
|
|
394
|
+
model_type='difference',
|
|
395
|
+
transformation='fd',
|
|
396
|
+
residuals=residuals
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
self.params = self.results.params
|
|
400
|
+
|
|
401
|
+
# Post-estimation warning for low observation retention
|
|
402
|
+
retention_rate = self.results.nobs / len(self.data)
|
|
403
|
+
if retention_rate < 0.30:
|
|
404
|
+
import warnings
|
|
405
|
+
warnings.warn(
|
|
406
|
+
f"\nLow observation retention: {self.results.nobs}/{len(self.data)} "
|
|
407
|
+
f"({retention_rate*100:.1f}%).\n"
|
|
408
|
+
f"Many observations were dropped due to insufficient valid instruments.\n\n"
|
|
409
|
+
f"Recommendations:\n"
|
|
410
|
+
f" 1. Simplify specification (fewer variables/lags)\n"
|
|
411
|
+
f" 2. Set time_dummies=False (or use linear trend)\n"
|
|
412
|
+
f" 3. Ensure collapse=True (currently: {self.collapse})\n"
|
|
413
|
+
f" 4. Check data for excessive missing values\n\n"
|
|
414
|
+
f"See examples/gmm/unbalanced_panel_guide.py for detailed guidance.",
|
|
415
|
+
UserWarning
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
return self.results
|
|
419
|
+
|
|
420
|
+
def _transform_data(self) -> tuple:
|
|
421
|
+
"""
|
|
422
|
+
Transform data to first-differences.
|
|
423
|
+
|
|
424
|
+
Returns
|
|
425
|
+
-------
|
|
426
|
+
y_diff : np.ndarray
|
|
427
|
+
Differenced dependent variable
|
|
428
|
+
X_diff : np.ndarray
|
|
429
|
+
Differenced regressors
|
|
430
|
+
ids : np.ndarray
|
|
431
|
+
ID variable
|
|
432
|
+
times : np.ndarray
|
|
433
|
+
Time variable
|
|
434
|
+
"""
|
|
435
|
+
# Sort data
|
|
436
|
+
df = self.data.sort_values([self.id_var, self.time_var]).copy()
|
|
437
|
+
|
|
438
|
+
# Create lagged dependent variable
|
|
439
|
+
for lag in self.lags:
|
|
440
|
+
lag_name = f'{self.dep_var}_L{lag}'
|
|
441
|
+
df[lag_name] = df.groupby(self.id_var)[self.dep_var].shift(lag)
|
|
442
|
+
# Also add to self.data for instrument generation
|
|
443
|
+
self.data[lag_name] = df[lag_name]
|
|
444
|
+
|
|
445
|
+
# Build regressor list
|
|
446
|
+
regressors = []
|
|
447
|
+
for lag in self.lags:
|
|
448
|
+
regressors.append(f'{self.dep_var}_L{lag}')
|
|
449
|
+
regressors.extend(self.exog_vars)
|
|
450
|
+
regressors.extend(self.endogenous_vars)
|
|
451
|
+
regressors.extend(self.predetermined_vars)
|
|
452
|
+
|
|
453
|
+
# Add time dummies if requested
|
|
454
|
+
if self.time_dummies:
|
|
455
|
+
time_dummies = pd.get_dummies(df[self.time_var], prefix='year', drop_first=True)
|
|
456
|
+
for col in time_dummies.columns:
|
|
457
|
+
df[col] = time_dummies[col]
|
|
458
|
+
regressors.append(col)
|
|
459
|
+
|
|
460
|
+
# First-difference transformation
|
|
461
|
+
df['y_diff'] = df.groupby(self.id_var)[self.dep_var].diff()
|
|
462
|
+
|
|
463
|
+
X_diff_dict = {}
|
|
464
|
+
for var in regressors:
|
|
465
|
+
X_diff_dict[var] = df.groupby(self.id_var)[var].diff()
|
|
466
|
+
|
|
467
|
+
# Extract arrays, ensuring float64 dtype
|
|
468
|
+
y_diff = df['y_diff'].values.reshape(-1, 1).astype(np.float64)
|
|
469
|
+
X_diff = np.column_stack([X_diff_dict[var].values for var in regressors]).astype(np.float64)
|
|
470
|
+
ids = df[self.id_var].values
|
|
471
|
+
times = df[self.time_var].values
|
|
472
|
+
|
|
473
|
+
return y_diff, X_diff, ids, times
|
|
474
|
+
|
|
475
|
+
def _generate_instruments(self) -> InstrumentSet:
|
|
476
|
+
"""
|
|
477
|
+
Generate instrument matrix.
|
|
478
|
+
|
|
479
|
+
Returns
|
|
480
|
+
-------
|
|
481
|
+
InstrumentSet
|
|
482
|
+
Combined instrument set
|
|
483
|
+
"""
|
|
484
|
+
instrument_sets = []
|
|
485
|
+
|
|
486
|
+
# Instruments for lagged dependent variable (GMM-style)
|
|
487
|
+
# For Δy_{i,t-lag}, use levels y_{i,t-lag-1}, y_{i,t-lag-2}, ... as instruments
|
|
488
|
+
for lag in self.lags:
|
|
489
|
+
# min_lag for instruments should be lag+1 (e.g., for L1.y use y_{t-2}, y_{t-3}, ...)
|
|
490
|
+
Z_lag = self.instrument_builder.create_gmm_style_instruments(
|
|
491
|
+
var=self.dep_var,
|
|
492
|
+
min_lag=lag + 1, # For L1.y, use y_{t-2} and earlier
|
|
493
|
+
max_lag=99, # All available lags
|
|
494
|
+
equation='diff',
|
|
495
|
+
collapse=self.collapse
|
|
496
|
+
)
|
|
497
|
+
instrument_sets.append(Z_lag)
|
|
498
|
+
|
|
499
|
+
# Instruments for strictly exogenous variables (IV-style, all lags)
|
|
500
|
+
for var in self.exog_vars:
|
|
501
|
+
Z_exog = self.instrument_builder.create_iv_style_instruments(
|
|
502
|
+
var=var,
|
|
503
|
+
min_lag=0, # Current and all lags
|
|
504
|
+
max_lag=0, # Just current for simplicity (can extend)
|
|
505
|
+
equation='diff'
|
|
506
|
+
)
|
|
507
|
+
instrument_sets.append(Z_exog)
|
|
508
|
+
|
|
509
|
+
# Instruments for predetermined variables (GMM-style, lag 2+)
|
|
510
|
+
for var in self.predetermined_vars:
|
|
511
|
+
Z_pred = self.instrument_builder.create_gmm_style_instruments(
|
|
512
|
+
var=var,
|
|
513
|
+
min_lag=2, # t-2 and earlier
|
|
514
|
+
max_lag=99,
|
|
515
|
+
equation='diff',
|
|
516
|
+
collapse=self.collapse
|
|
517
|
+
)
|
|
518
|
+
instrument_sets.append(Z_pred)
|
|
519
|
+
|
|
520
|
+
# Instruments for endogenous variables (GMM-style, lag 3+)
|
|
521
|
+
for var in self.endogenous_vars:
|
|
522
|
+
Z_endog = self.instrument_builder.create_gmm_style_instruments(
|
|
523
|
+
var=var,
|
|
524
|
+
min_lag=3, # t-3 and earlier
|
|
525
|
+
max_lag=99,
|
|
526
|
+
equation='diff',
|
|
527
|
+
collapse=self.collapse
|
|
528
|
+
)
|
|
529
|
+
instrument_sets.append(Z_endog)
|
|
530
|
+
|
|
531
|
+
# Combine all instruments
|
|
532
|
+
Z_combined = self.instrument_builder.combine_instruments(*instrument_sets)
|
|
533
|
+
|
|
534
|
+
return Z_combined
|
|
535
|
+
|
|
536
|
+
def _compute_one_step_vcov(self,
|
|
537
|
+
X: np.ndarray,
|
|
538
|
+
Z: np.ndarray,
|
|
539
|
+
residuals: np.ndarray,
|
|
540
|
+
W: np.ndarray) -> np.ndarray:
|
|
541
|
+
"""
|
|
542
|
+
Compute variance-covariance matrix for one-step GMM.
|
|
543
|
+
|
|
544
|
+
Parameters
|
|
545
|
+
----------
|
|
546
|
+
X : np.ndarray
|
|
547
|
+
Regressors
|
|
548
|
+
Z : np.ndarray
|
|
549
|
+
Instruments
|
|
550
|
+
residuals : np.ndarray
|
|
551
|
+
Residuals
|
|
552
|
+
W : np.ndarray
|
|
553
|
+
Weight matrix
|
|
554
|
+
|
|
555
|
+
Returns
|
|
556
|
+
-------
|
|
557
|
+
np.ndarray
|
|
558
|
+
Variance-covariance matrix
|
|
559
|
+
"""
|
|
560
|
+
# Ensure arrays are float64
|
|
561
|
+
X = np.asarray(X, dtype=np.float64)
|
|
562
|
+
Z = np.asarray(Z, dtype=np.float64)
|
|
563
|
+
residuals = np.asarray(residuals, dtype=np.float64)
|
|
564
|
+
W = np.asarray(W, dtype=np.float64)
|
|
565
|
+
|
|
566
|
+
# Remove missing values
|
|
567
|
+
valid_mask = ~np.isnan(residuals.flatten())
|
|
568
|
+
X_clean = X[valid_mask]
|
|
569
|
+
Z_clean = Z[valid_mask]
|
|
570
|
+
resid_clean = residuals[valid_mask]
|
|
571
|
+
|
|
572
|
+
# Robust variance: (X'Z W Z'X)^{-1} (X'Z W Ω W Z'X) (X'Z W Z'X)^{-1}
|
|
573
|
+
# where Ω = Z' diag(ε²) Z
|
|
574
|
+
|
|
575
|
+
XtZ = X_clean.T @ Z_clean
|
|
576
|
+
ZtX = Z_clean.T @ X_clean
|
|
577
|
+
|
|
578
|
+
A = XtZ @ W @ ZtX
|
|
579
|
+
try:
|
|
580
|
+
A_inv = np.linalg.inv(A)
|
|
581
|
+
except np.linalg.LinAlgError:
|
|
582
|
+
A_inv = np.linalg.pinv(A)
|
|
583
|
+
|
|
584
|
+
# Compute Omega
|
|
585
|
+
Omega = np.diag(resid_clean.flatten() ** 2)
|
|
586
|
+
ZtOmegaZ = Z_clean.T @ Omega @ Z_clean
|
|
587
|
+
|
|
588
|
+
# Robust variance
|
|
589
|
+
B = XtZ @ W @ ZtOmegaZ @ W @ ZtX
|
|
590
|
+
vcov = A_inv @ B @ A_inv
|
|
591
|
+
|
|
592
|
+
return vcov
|
|
593
|
+
|
|
594
|
+
def _get_variable_names(self) -> List[str]:
|
|
595
|
+
"""
|
|
596
|
+
Get list of variable names in order.
|
|
597
|
+
|
|
598
|
+
Returns
|
|
599
|
+
-------
|
|
600
|
+
List[str]
|
|
601
|
+
Variable names
|
|
602
|
+
"""
|
|
603
|
+
var_names = []
|
|
604
|
+
|
|
605
|
+
# Lagged dependent variable
|
|
606
|
+
for lag in self.lags:
|
|
607
|
+
var_names.append(f'L{lag}.{self.dep_var}')
|
|
608
|
+
|
|
609
|
+
# Other variables
|
|
610
|
+
var_names.extend(self.exog_vars)
|
|
611
|
+
var_names.extend(self.endogenous_vars)
|
|
612
|
+
var_names.extend(self.predetermined_vars)
|
|
613
|
+
|
|
614
|
+
# Time dummies
|
|
615
|
+
if self.time_dummies:
|
|
616
|
+
time_periods = sorted(self.data[self.time_var].unique())[1:] # Drop first
|
|
617
|
+
for t in time_periods:
|
|
618
|
+
var_names.append(f'year_{t}')
|
|
619
|
+
|
|
620
|
+
return var_names
|
|
621
|
+
|
|
622
|
+
def summary(self) -> str:
|
|
623
|
+
"""
|
|
624
|
+
Print model summary.
|
|
625
|
+
|
|
626
|
+
Returns
|
|
627
|
+
-------
|
|
628
|
+
str
|
|
629
|
+
Summary string
|
|
630
|
+
|
|
631
|
+
Raises
|
|
632
|
+
------
|
|
633
|
+
ValueError
|
|
634
|
+
If model has not been fit yet
|
|
635
|
+
"""
|
|
636
|
+
if self.results is None:
|
|
637
|
+
raise ValueError("Model has not been fit yet. Call fit() first.")
|
|
638
|
+
|
|
639
|
+
return self.results.summary(title='Difference GMM (Arellano-Bond)')
|
|
640
|
+
|
|
641
|
+
def __repr__(self) -> str:
|
|
642
|
+
"""Representation of the model."""
|
|
643
|
+
status = "fitted" if self.results is not None else "not fitted"
|
|
644
|
+
return (f"DifferenceGMM(dep_var='{self.dep_var}', lags={self.lags}, "
|
|
645
|
+
f"status='{status}')")
|