panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Instrument Generation for GMM
|
|
3
|
+
==============================
|
|
4
|
+
|
|
5
|
+
Tools for generating and managing instrument matrices for GMM estimation.
|
|
6
|
+
|
|
7
|
+
Classes
|
|
8
|
+
-------
|
|
9
|
+
InstrumentSet : Container for instrument matrices
|
|
10
|
+
InstrumentBuilder : Generates instrument matrices following xtabond2 rules
|
|
11
|
+
|
|
12
|
+
References
|
|
13
|
+
----------
|
|
14
|
+
.. [1] Roodman, D. (2009). "How to do xtabond2: An Introduction to Difference
|
|
15
|
+
and System GMM in Stata." Stata Journal, 9(1), 86-136.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from typing import List, Optional, Dict, Tuple
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
from enum import Enum
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class InstrumentStyle(Enum):
    """
    Layout used when generating instrument columns.

    Members
    -------
    IV
        IV-style: one column per lag.
    GMM
        GMM-style: separate columns per time period.
    """

    IV = "iv"
    GMM = "gmm"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class EquationType(Enum):
    """
    Equation that a set of instruments is built for.

    Members
    -------
    DIFF
        Differenced equation.
    LEVEL
        Level equation.
    """

    DIFF = "diff"
    LEVEL = "level"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class InstrumentSet:
    """
    Container for instrument matrices.

    Attributes
    ----------
    Z : np.ndarray
        Instrument matrix (T*N x n_instruments)
    variable_names : List[str]
        Names of instrumented variables
    instrument_names : List[str]
        Names of instrument columns
    equation : str
        Equation type ('diff' or 'level')
    style : str
        Instrument style ('iv' or 'gmm')
    collapsed : bool
        Whether instruments are collapsed

    Notes
    -----
    Equality is defined explicitly: the comparison that ``@dataclass`` would
    generate evaluates ``Z == other.Z`` in a boolean context, which raises
    ``ValueError`` for arrays with more than one element. Two sets are equal
    when all metadata fields match and the matrices have the same shape and
    the same entries, with NaN treated as equal to NaN (missing instruments
    are stored as NaN).
    """

    Z: np.ndarray
    variable_names: List[str] = field(default_factory=list)
    instrument_names: List[str] = field(default_factory=list)
    equation: str = 'diff'
    style: str = 'gmm'
    collapsed: bool = False

    @property
    def n_instruments(self) -> int:
        """Number of instruments (columns of ``Z``); 0 when ``Z`` is None."""
        if self.Z is None:
            return 0
        # A 1-D array is a single instrument column (np.column_stack treats
        # it the same way); indexing shape[1] on it would raise IndexError.
        return self.Z.shape[1] if self.Z.ndim > 1 else 1

    @property
    def n_obs(self) -> int:
        """Number of observations (rows of ``Z``); 0 when ``Z`` is None."""
        return self.Z.shape[0] if self.Z is not None else 0

    def __eq__(self, other: object) -> bool:
        """Compare metadata field by field and ``Z`` element-wise (NaN == NaN)."""
        if other.__class__ is not self.__class__:
            return NotImplemented

        same_metadata = (
            self.variable_names == other.variable_names
            and self.instrument_names == other.instrument_names
            and self.equation == other.equation
            and self.style == other.style
            and self.collapsed == other.collapsed
        )
        if not same_metadata:
            return False

        if self.Z is None or other.Z is None:
            return self.Z is None and other.Z is None

        left = np.asarray(self.Z)
        right = np.asarray(other.Z)
        if left.shape != right.shape:
            return False
        try:
            return bool(np.array_equal(left, right, equal_nan=True))
        except TypeError:
            # dtypes without NaN support (e.g. object) cannot be NaN-checked.
            return bool(np.array_equal(left, right))

    def __repr__(self) -> str:
        return (f"InstrumentSet(n_instruments={self.n_instruments}, "
                f"n_obs={self.n_obs}, equation='{self.equation}', "
                f"style='{self.style}', collapsed={self.collapsed})")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class InstrumentBuilder:
    """
    Generates instrument matrices for GMM estimation.

    Follows Stata xtabond2 instrument generation rules.

    Lagged values are located by matching ``time - lag`` against the values
    of the time variable within the same unit, so the time variable must
    support subtracting an integer (e.g. integer years or period numbers).
    Missing instruments are stored as NaN.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data in long format
    id_var : str
        Name of cross-sectional identifier
    time_var : str
        Name of time variable

    Attributes
    ----------
    data : pd.DataFrame
        Copy of the panel data, sorted by ``id_var`` then ``time_var``
    id_var : str
        Cross-sectional identifier
    time_var : str
        Time variable
    groups : np.ndarray
        Unique values of ``id_var``
    n_groups : int
        Number of cross-sectional units
    time_periods : np.ndarray
        Unique time periods (sorted)
    n_periods : int
        Number of unique time periods

    Examples
    --------
    >>> builder = InstrumentBuilder(data, id_var='id', time_var='year')
    >>> # IV-style instruments
    >>> Z_iv = builder.create_iv_style_instruments('x', min_lag=2, max_lag=4)
    >>> # GMM-style instruments
    >>> Z_gmm = builder.create_gmm_style_instruments('y', min_lag=2, max_lag=99)
    >>> # Collapsed GMM-style
    >>> Z_collapsed = builder.create_gmm_style_instruments(
    ...     'y', min_lag=2, max_lag=99, collapse=True
    ... )
    """

    def __init__(self,
                 data: pd.DataFrame,
                 id_var: str,
                 time_var: str):
        """Initialize instrument builder."""
        self.data = data.copy()
        self.id_var = id_var
        self.time_var = time_var

        # Ensure data is sorted
        self.data = self.data.sort_values([id_var, time_var])

        # Extract groups and time periods
        self.groups = self.data[id_var].unique()
        self.n_groups = len(self.groups)
        self.time_periods = np.sort(self.data[time_var].unique())
        self.n_periods = len(self.time_periods)

    def _build_row_lookup(self) -> Dict[Tuple[object, object], int]:
        """
        Map each (unit, time) pair to the position of its row in ``self.data``.

        When a pair occurs more than once, the first row wins. Building this
        dictionary once replaces a full-length boolean mask per lookup.
        """
        ids = self.data[self.id_var].values
        times = self.data[self.time_var].values
        lookup: Dict[Tuple[object, object], int] = {}
        for row, key in enumerate(zip(ids, times)):
            lookup.setdefault(key, row)
        return lookup

    @staticmethod
    def _lagged_instrument(var_data: np.ndarray,
                           lookup: Dict[Tuple[object, object], int],
                           unit,
                           time,
                           lag: int,
                           equation: str):
        """
        Return the instrument value for one observation and one lag.

        For ``equation == 'diff'`` this is the level x_{i,t-lag}; for any
        other value it is the difference x_{i,t-lag} - x_{i,t-lag-1}.
        NaN is returned when a required row does not exist.
        """
        row = lookup.get((unit, time - lag))
        if row is None:
            return np.nan
        if equation == 'diff':
            return var_data[row]

        # Level equation: differences are used as instruments, which also
        # requires the observation one period before the lagged one.
        previous_row = lookup.get((unit, time - lag - 1))
        if previous_row is None:
            return np.nan
        return var_data[row] - var_data[previous_row]

    def create_iv_style_instruments(self,
                                    var: str,
                                    min_lag: int,
                                    max_lag: int,
                                    equation: str = 'diff') -> "InstrumentSet":
        """
        Create IV-style instruments (one column per lag).

        IV-style instruments create one column for each lag, with observations
        placed appropriately for each time period.

        Parameters
        ----------
        var : str
            Variable to instrument
        min_lag : int
            Minimum lag to use (e.g., 2 means t-2)
        max_lag : int
            Maximum lag to use (e.g., 4 means t-4)
        equation : str
            'diff' for differenced equation, 'level' for level equation

        Returns
        -------
        InstrumentSet
            IV-style instrument set

        Raises
        ------
        ValueError
            If ``max_lag`` is more than one below ``min_lag`` (the lag range
            would have a negative number of columns).

        Examples
        --------
        >>> # gmm(x, lag(2 4)) in IV-style creates 3 columns: x_{t-2}, x_{t-3}, x_{t-4}
        >>> Z = builder.create_iv_style_instruments('x', min_lag=2, max_lag=4)

        Notes
        -----
        For equation='diff', instruments levels: x_{i,t-k}
        For equation='level', instruments differences: Δx_{i,t-k}
        """
        n_lags = max_lag - min_lag + 1
        if n_lags < 0:
            raise ValueError(
                f"Invalid lag range: min_lag={min_lag}, max_lag={max_lag}"
            )
        lags = range(min_lag, max_lag + 1)

        # Get variable data
        var_data = self.data[var].values
        ids = self.data[self.id_var].values
        times = self.data[self.time_var].values
        lookup = self._build_row_lookup()

        # Create instrument names
        prefix = '' if equation == 'diff' else 'D.'
        instrument_names = [f"{prefix}{var}_L{lag}" for lag in lags]

        # Fill instrument matrix; cells without a lagged value stay NaN
        Z = np.full((len(self.data), n_lags), np.nan)
        for i, (unit, time) in enumerate(zip(ids, times)):
            for j, lag in enumerate(lags):
                Z[i, j] = self._lagged_instrument(
                    var_data, lookup, unit, time, lag, equation
                )

        return InstrumentSet(
            Z=Z,
            variable_names=[var],
            instrument_names=instrument_names,
            equation=equation,
            style='iv',
            collapsed=False
        )

    def create_gmm_style_instruments(self,
                                     var: str,
                                     min_lag: int,
                                     max_lag: Optional[int] = None,
                                     equation: str = 'diff',
                                     collapse: bool = False) -> "InstrumentSet":
        """
        Create GMM-style instruments (separate column per time period).

        GMM-style instruments create a separate column for each available lag
        in each time period, leading to instrument proliferation unless collapsed.

        Parameters
        ----------
        var : str
            Variable to instrument
        min_lag : int
            Minimum lag to use
        max_lag : int, optional
            Maximum lag to use (None = all available)
        equation : str
            'diff' for differenced equation, 'level' for level equation
        collapse : bool
            Whether to collapse instruments to avoid proliferation

        Returns
        -------
        InstrumentSet
            GMM-style instrument set

        Examples
        --------
        >>> # Without collapse: Creates many columns (one per time*lag)
        >>> Z = builder.create_gmm_style_instruments('x', min_lag=2, max_lag=99)
        >>> # With collapse: Creates one column per lag
        >>> Z_collapsed = builder.create_gmm_style_instruments(
        ...     'x', min_lag=2, max_lag=99, collapse=True
        ... )

        Notes
        -----
        Collapse mode (Roodman 2009 recommendation):
        - Reduces instrument count from O(T²) to O(T)
        - Uses one column per lag, shared by all time periods, instead of
          a separate column per (time period, lag) pair
        - Helps avoid overfitting and weak instruments
        """
        if collapse:
            return self._create_gmm_collapsed(var, min_lag, max_lag, equation)
        return self._create_gmm_standard(var, min_lag, max_lag, equation)

    def _create_gmm_standard(self,
                             var: str,
                             min_lag: int,
                             max_lag: Optional[int],
                             equation: str) -> "InstrumentSet":
        """
        Create GMM-style instruments without collapse.

        One column is produced per (time period, lag) pair. A column is
        filled only on the rows observed at its time period; every other
        row stays NaN.
        """
        var_data = self.data[var].values
        ids = self.data[self.id_var].values
        times = self.data[self.time_var].values
        n_obs = len(self.data)
        lookup = self._build_row_lookup()

        # Determine actual max_lag
        if max_lag is None:
            max_lag = int(1e6)  # Effectively infinite

        # Build instruments time period by time period
        Z_list = []
        instrument_names = []

        for t_idx, t in enumerate(self.time_periods):
            # The period at sorted position t_idx has t_idx earlier periods,
            # so lags deeper than t_idx are never generated for it. Early
            # periods (t_idx < min_lag) get an empty lag range.
            deepest_lag = min(max_lag, t_idx)
            rows_at_t = np.flatnonzero(times == t)

            for lag in range(min_lag, deepest_lag + 1):
                col = np.full(n_obs, np.nan)
                for i in rows_at_t:
                    col[i] = self._lagged_instrument(
                        var_data, lookup, ids[i], t, lag, equation
                    )
                Z_list.append(col)
                instrument_names.append(f"{var}_t{t}_L{lag}")

        # Stack into matrix
        Z = np.column_stack(Z_list) if Z_list else np.empty((n_obs, 0))

        return InstrumentSet(
            Z=Z,
            variable_names=[var],
            instrument_names=instrument_names,
            equation=equation,
            style='gmm',
            collapsed=False
        )

    def _analyze_lag_availability(self,
                                  var: str,
                                  min_lag: int,
                                  max_lag: int,
                                  min_coverage: float = 0.10) -> List[int]:
        """
        Analyze which lags have sufficient data coverage.

        For unbalanced panels, some lags may be available for very few
        observations. This method identifies lags with sufficient coverage.

        Parameters
        ----------
        var : str
            Variable to analyze
        min_lag : int
            Minimum lag to consider
        max_lag : int
            Maximum lag to consider
        min_coverage : float
            Minimum fraction of observations that must have valid lagged values
            (default: 0.10 = 10%)

        Returns
        -------
        List[int]
            Lags with sufficient coverage; empty when there are no
            observations at all.
        """
        n_obs = len(self.data)
        if n_obs == 0:
            # No observations: coverage is undefined, so no lag qualifies.
            return []

        var_data = self.data[var].values
        ids = self.data[self.id_var].values
        times = self.data[self.time_var].values
        lookup = self._build_row_lookup()

        valid_lags = []

        for lag in range(min_lag, max_lag + 1):
            # Count how many observations would have valid lagged values
            n_valid = 0
            for unit, time in zip(ids, times):
                row = lookup.get((unit, time - lag))
                # pd.isna also handles non-float dtypes, unlike np.isnan
                if row is not None and not pd.isna(var_data[row]):
                    n_valid += 1

            # Include lag if coverage is sufficient
            if n_valid / n_obs >= min_coverage:
                valid_lags.append(lag)

        return valid_lags

    def _create_gmm_collapsed(self,
                              var: str,
                              min_lag: int,
                              max_lag: Optional[int],
                              equation: str) -> "InstrumentSet":
        """
        Create collapsed GMM-style instruments.

        Collapse creates one column per lag that is shared by all time
        periods: row i of the column for lag k holds the lag-k instrument of
        observation i (NaN when unavailable). This dramatically reduces the
        instrument count compared with one column per (period, lag) pair.

        For unbalanced panels, automatically filters out lags with very low
        data coverage (< 10% of observations). If no lag passes the filter,
        a UserWarning is issued and ``min_lag`` (plus ``min_lag + 1`` when it
        is within range) is used anyway.
        """
        var_data = self.data[var].values
        ids = self.data[self.id_var].values
        times = self.data[self.time_var].values
        n_obs = len(self.data)

        # Determine actual max_lag based on data
        # Maximum possible lag is n_periods - 1
        actual_max_lag = self.n_periods - 1

        if max_lag is None:
            max_lag = actual_max_lag
        else:
            # Limit max_lag to what's actually available
            max_lag = min(max_lag, actual_max_lag)

        # Smart selection: Filter lags with sufficient data coverage
        # This helps with unbalanced panels by excluding mostly-NaN lags
        possible_lags = self._analyze_lag_availability(
            var, min_lag, max_lag, min_coverage=0.10
        )

        # If no lags meet the coverage threshold, use at least min_lag and min_lag+1
        if not possible_lags and min_lag <= max_lag:
            import warnings
            warnings.warn(
                f"No lags for variable '{var}' meet the 10% coverage threshold. "
                f"Using lags {min_lag} and {min(min_lag+1, max_lag)} anyway.",
                UserWarning
            )
            possible_lags = [min_lag]
            if min_lag + 1 <= max_lag:
                possible_lags.append(min_lag + 1)

        lookup = self._build_row_lookup()
        Z_list = []
        instrument_names = []

        for lag in possible_lags:
            col = np.full(n_obs, np.nan)

            # For each observation, get lagged value if available
            for i, (unit, time) in enumerate(zip(ids, times)):
                col[i] = self._lagged_instrument(
                    var_data, lookup, unit, time, lag, equation
                )

            Z_list.append(col)
            instrument_names.append(f"{var}_L{lag}_collapsed")

        # Stack into matrix
        Z = np.column_stack(Z_list) if Z_list else np.empty((n_obs, 0))

        return InstrumentSet(
            Z=Z,
            variable_names=[var],
            instrument_names=instrument_names,
            equation=equation,
            style='gmm',
            collapsed=True
        )

    def combine_instruments(self, *instrument_sets: "InstrumentSet") -> "InstrumentSet":
        """
        Combine multiple instrument sets.

        The matrices are placed side by side. The result takes its
        ``equation`` from the first set, and is always labelled
        ``style='mixed'`` and ``collapsed=False``.

        Parameters
        ----------
        *instrument_sets : InstrumentSet
            Instrument sets to combine

        Returns
        -------
        InstrumentSet
            Combined instrument set

        Raises
        ------
        ValueError
            If no instrument set is given, or if the sets do not all have
            the same number of observations.

        Examples
        --------
        >>> Z_gmm = builder.create_gmm_style_instruments('y', 2, 99, collapse=True)
        >>> Z_iv = builder.create_iv_style_instruments('x', 2, 4)
        >>> Z_combined = builder.combine_instruments(Z_gmm, Z_iv)
        """
        if not instrument_sets:
            raise ValueError("Must provide at least one instrument set")

        # Fail with a readable message instead of numpy's shape error
        row_counts = sorted({iset.n_obs for iset in instrument_sets})
        if len(row_counts) > 1:
            raise ValueError(
                "Instrument sets must have the same number of observations, "
                f"got {row_counts}"
            )

        # Combine matrices
        Z_combined = np.column_stack([iset.Z for iset in instrument_sets])

        # Combine names
        var_names = []
        inst_names = []
        for iset in instrument_sets:
            var_names.extend(iset.variable_names)
            inst_names.extend(iset.instrument_names)

        return InstrumentSet(
            Z=Z_combined,
            variable_names=var_names,
            instrument_names=inst_names,
            equation=instrument_sets[0].equation,
            style='mixed',
            collapsed=False
        )

    def instrument_count_analysis(self, Z: "InstrumentSet") -> pd.DataFrame:
        """
        Analyze instrument count.

        Parameters
        ----------
        Z : InstrumentSet
            Instrument set to analyze

        Returns
        -------
        pd.DataFrame
            Single-column frame indexed by statistic name. 'Instrument ratio'
            is instruments per group (NaN when there are no groups), and
            'Warning' flags the case of more instruments than groups.

        Examples
        --------
        >>> Z = builder.create_gmm_style_instruments('y', 2, 99)
        >>> analysis = builder.instrument_count_analysis(Z)
        >>> print(analysis)
        """
        # Guard the ratio: a builder over empty data has zero groups
        if self.n_groups:
            ratio = Z.n_instruments / self.n_groups
        else:
            ratio = float('nan')

        analysis = {
            'Total instruments': Z.n_instruments,
            'Observations': Z.n_obs,
            'Groups': self.n_groups,
            'Instrument ratio': ratio,
            'Style': Z.style,
            'Collapsed': Z.collapsed,
            'Variables': ', '.join(Z.variable_names)
        }

        # Warning if too many instruments
        if Z.n_instruments > self.n_groups:
            analysis['Warning'] = f"Too many instruments ({Z.n_instruments} > {self.n_groups} groups)"
        else:
            analysis['Warning'] = 'OK'

        return pd.DataFrame([analysis]).T

    def get_valid_obs_mask(self, Z: "InstrumentSet") -> np.ndarray:
        """
        Get mask of valid observations (non-missing instruments).

        Parameters
        ----------
        Z : InstrumentSet
            Instrument set

        Returns
        -------
        np.ndarray
            Boolean mask of valid observations
        """
        # Valid if at least one instrument is non-missing
        return ~np.all(np.isnan(Z.Z), axis=1)
|