panelbox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. panelbox/__init__.py +67 -0
  2. panelbox/__version__.py +14 -0
  3. panelbox/cli/__init__.py +0 -0
  4. panelbox/cli/{commands}/__init__.py +0 -0
  5. panelbox/core/__init__.py +0 -0
  6. panelbox/core/base_model.py +164 -0
  7. panelbox/core/formula_parser.py +318 -0
  8. panelbox/core/panel_data.py +387 -0
  9. panelbox/core/results.py +366 -0
  10. panelbox/datasets/__init__.py +0 -0
  11. panelbox/datasets/{data}/__init__.py +0 -0
  12. panelbox/gmm/__init__.py +65 -0
  13. panelbox/gmm/difference_gmm.py +645 -0
  14. panelbox/gmm/estimator.py +562 -0
  15. panelbox/gmm/instruments.py +580 -0
  16. panelbox/gmm/results.py +550 -0
  17. panelbox/gmm/system_gmm.py +621 -0
  18. panelbox/gmm/tests.py +535 -0
  19. panelbox/models/__init__.py +11 -0
  20. panelbox/models/dynamic/__init__.py +0 -0
  21. panelbox/models/iv/__init__.py +0 -0
  22. panelbox/models/static/__init__.py +13 -0
  23. panelbox/models/static/fixed_effects.py +516 -0
  24. panelbox/models/static/pooled_ols.py +298 -0
  25. panelbox/models/static/random_effects.py +512 -0
  26. panelbox/report/__init__.py +61 -0
  27. panelbox/report/asset_manager.py +410 -0
  28. panelbox/report/css_manager.py +472 -0
  29. panelbox/report/exporters/__init__.py +15 -0
  30. panelbox/report/exporters/html_exporter.py +440 -0
  31. panelbox/report/exporters/latex_exporter.py +510 -0
  32. panelbox/report/exporters/markdown_exporter.py +446 -0
  33. panelbox/report/renderers/__init__.py +11 -0
  34. panelbox/report/renderers/static/__init__.py +0 -0
  35. panelbox/report/renderers/static_validation_renderer.py +341 -0
  36. panelbox/report/report_manager.py +502 -0
  37. panelbox/report/template_manager.py +337 -0
  38. panelbox/report/transformers/__init__.py +0 -0
  39. panelbox/report/transformers/static/__init__.py +0 -0
  40. panelbox/report/validation_transformer.py +449 -0
  41. panelbox/standard_errors/__init__.py +0 -0
  42. panelbox/templates/__init__.py +0 -0
  43. panelbox/templates/assets/css/base_styles.css +382 -0
  44. panelbox/templates/assets/css/report_components.css +747 -0
  45. panelbox/templates/assets/js/tab-navigation.js +161 -0
  46. panelbox/templates/assets/js/utils.js +276 -0
  47. panelbox/templates/common/footer.html +24 -0
  48. panelbox/templates/common/header.html +44 -0
  49. panelbox/templates/common/meta.html +5 -0
  50. panelbox/templates/validation/interactive/index.html +272 -0
  51. panelbox/templates/validation/interactive/partials/charts.html +58 -0
  52. panelbox/templates/validation/interactive/partials/methodology.html +201 -0
  53. panelbox/templates/validation/interactive/partials/overview.html +146 -0
  54. panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
  55. panelbox/templates/validation/interactive/partials/test_results.html +231 -0
  56. panelbox/utils/__init__.py +0 -0
  57. panelbox/utils/formatting.py +172 -0
  58. panelbox/utils/matrix_ops.py +233 -0
  59. panelbox/utils/statistical.py +173 -0
  60. panelbox/validation/__init__.py +58 -0
  61. panelbox/validation/base.py +175 -0
  62. panelbox/validation/cointegration/__init__.py +0 -0
  63. panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
  64. panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
  65. panelbox/validation/cross_sectional_dependence/frees.py +297 -0
  66. panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
  67. panelbox/validation/heteroskedasticity/__init__.py +13 -0
  68. panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
  69. panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
  70. panelbox/validation/heteroskedasticity/white.py +208 -0
  71. panelbox/validation/instruments/__init__.py +0 -0
  72. panelbox/validation/robustness/__init__.py +0 -0
  73. panelbox/validation/serial_correlation/__init__.py +13 -0
  74. panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
  75. panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
  76. panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
  77. panelbox/validation/specification/__init__.py +16 -0
  78. panelbox/validation/specification/chow.py +273 -0
  79. panelbox/validation/specification/hausman.py +264 -0
  80. panelbox/validation/specification/mundlak.py +331 -0
  81. panelbox/validation/specification/reset.py +273 -0
  82. panelbox/validation/unit_root/__init__.py +0 -0
  83. panelbox/validation/validation_report.py +257 -0
  84. panelbox/validation/validation_suite.py +401 -0
  85. panelbox-0.2.0.dist-info/METADATA +337 -0
  86. panelbox-0.2.0.dist-info/RECORD +90 -0
  87. panelbox-0.2.0.dist-info/WHEEL +5 -0
  88. panelbox-0.2.0.dist-info/entry_points.txt +2 -0
  89. panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
  90. panelbox-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,645 @@
1
+ """
2
+ Difference GMM Estimator
3
+ =========================
4
+
5
+ Arellano-Bond (1991) Difference GMM estimator for dynamic panel data models.
6
+
7
+ Classes
8
+ -------
9
+ DifferenceGMM : Arellano-Bond Difference GMM estimator
10
+
11
+ References
12
+ ----------
13
+ .. [1] Arellano, M., & Bond, S. (1991). "Some Tests of Specification for Panel
14
+ Data: Monte Carlo Evidence and an Application to Employment Equations."
15
+ Review of Economic Studies, 58(2), 277-297.
16
+ """
17
+
18
+ from typing import Union, List, Optional, Dict
19
+ import numpy as np
20
+ import pandas as pd
21
+ from panelbox.gmm.results import GMMResults, TestResult
22
+ from panelbox.gmm.instruments import InstrumentBuilder, InstrumentSet
23
+ from panelbox.gmm.estimator import GMMEstimator
24
+ from panelbox.gmm.tests import GMMTests
25
+
26
+
27
class DifferenceGMM:
    """
    Arellano-Bond (1991) Difference GMM estimator.

    Eliminates fixed effects through first-differencing and uses lagged
    levels as instruments for the differenced equation.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data in long format
    dep_var : str
        Name of dependent variable
    lags : Union[int, List[int]]
        Lags of dependent variable to include (e.g., 1 or [1, 2]).
        Each lag must be a positive integer.
    id_var : str
        Name of cross-sectional identifier (default: 'id')
    time_var : str
        Name of time variable (default: 'year')
    exog_vars : List[str], optional
        List of strictly exogenous variables
    endogenous_vars : List[str], optional
        List of endogenous variables (excluding lagged dependent)
    predetermined_vars : List[str], optional
        List of predetermined variables
    time_dummies : bool
        Include time dummies (default: True)
    collapse : bool
        Collapse instruments to avoid proliferation (default: False)
    two_step : bool
        Use two-step GMM (default: True). Overridden by ``gmm_type`` when
        that is 'one_step' or 'two_step'; a warning is issued if
        ``two_step=False`` is overridden this way.
    robust : bool
        Use robust variance matrix with Windmeijer correction (default: True)
    gmm_type : str
        GMM estimation type: 'one_step', 'two_step', or 'iterative' (default: 'two_step')

    Attributes
    ----------
    data : pd.DataFrame
        Panel data (a private copy; lagged dependent-variable columns are
        added to it when the model is fitted)
    params : pd.Series
        Estimated coefficients (after fitting)
    results : GMMResults
        Full results object (after fitting)

    Examples
    --------
    **Basic example with employment data:**

    >>> import pandas as pd
    >>> from panelbox.gmm import DifferenceGMM
    >>>
    >>> # Load panel data (firms over time)
    >>> data = pd.read_csv('panel_data.csv')
    >>>
    >>> # Estimate Difference GMM
    >>> model = DifferenceGMM(
    ...     data=data,
    ...     dep_var='employment',
    ...     lags=1,                    # Include employment_{t-1}
    ...     id_var='firm_id',
    ...     time_var='year',
    ...     exog_vars=['wages', 'capital'],
    ...     time_dummies=True,
    ...     collapse=True,             # Recommended to avoid instrument proliferation
    ...     two_step=True,             # Two-step with Windmeijer correction
    ...     robust=True                # Robust standard errors
    ... )
    >>>
    >>> # Fit and get results
    >>> results = model.fit()
    >>> print(results.summary())
    >>>
    >>> # Access coefficients
    >>> print(f"Persistence: {results.params['L1.employment']:.3f}")
    >>> print(f"Wage effect: {results.params['wages']:.3f}")

    **Interpreting diagnostic tests:**

    >>> # Check if estimation is valid
    >>> if results.ar2_test.pvalue > 0.10:
    ...     print("✓ Moment conditions valid")
    ...
    >>> if 0.10 < results.hansen_j.pvalue < 0.25:
    ...     print("✓ Instruments appear valid")
    ...
    >>> if results.instrument_ratio < 1.0:
    ...     print("✓ Instrument count appropriate")

    **With predetermined and endogenous variables:**

    >>> # Some variables may not be strictly exogenous
    >>> model = DifferenceGMM(
    ...     data=data,
    ...     dep_var='output',
    ...     lags=1,
    ...     exog_vars=['policy_var'],          # Strictly exogenous
    ...     predetermined_vars=['capital'],    # Instruments: t-2 and earlier
    ...     endogenous_vars=['labor'],         # Instruments: t-3 and earlier
    ...     collapse=True,
    ...     two_step=True
    ... )
    >>> results = model.fit()

    **For unbalanced panels:**

    >>> # Always use collapse=True and avoid many time dummies
    >>> model = DifferenceGMM(
    ...     data=unbalanced_data,
    ...     dep_var='y',
    ...     lags=1,
    ...     exog_vars=['x1', 'x2'],
    ...     time_dummies=False,        # Or use linear trend
    ...     collapse=True,             # Essential for unbalanced panels
    ...     two_step=True
    ... )
    >>> results = model.fit()
    >>> print(f"Retained {results.nobs}/{len(unbalanced_data)} observations")

    Notes
    -----
    Transformation: First-differences to eliminate fixed effects
        Δy_{it} = γ Δy_{i,t-1} + β' Δx_{it} + Δε_{it}

    Instruments: Lags of levels for differenced equations
        - Lagged dependent variable L{k}.y: levels dated t-(k+1) and earlier
        - Strictly exogenous: current-period value only (IV-style)
        - Predetermined: lags t-2 and earlier
        - Endogenous: lags t-3 and earlier

    Lags and first differences are taken over consecutive rows of each unit
    after sorting by time, so gaps in the time variable are not detected.

    References
    ----------
    Arellano, M., & Bond, S. (1991). Review of Economic Studies, 58(2), 277-297.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 dep_var: str,
                 lags: Union[int, List[int]],
                 id_var: str = 'id',
                 time_var: str = 'year',
                 exog_vars: Optional[List[str]] = None,
                 endogenous_vars: Optional[List[str]] = None,
                 predetermined_vars: Optional[List[str]] = None,
                 time_dummies: bool = True,
                 collapse: bool = False,
                 two_step: bool = True,
                 robust: bool = True,
                 gmm_type: str = 'two_step'):
        """Initialize Difference GMM model."""
        # Work on a private copy: fitting adds lagged columns to this frame.
        self.data = data.copy()
        self.dep_var = dep_var
        # A scalar lag (Python or NumPy integer) becomes a one-element list;
        # any other iterable is copied so the caller's list is never aliased.
        self.lags = [lags] if isinstance(lags, (int, np.integer)) else list(lags)
        self.id_var = id_var
        self.time_var = time_var
        self.exog_vars = exog_vars or []
        self.endogenous_vars = endogenous_vars or []
        self.predetermined_vars = predetermined_vars or []
        self.time_dummies = time_dummies
        self.collapse = collapse
        self.two_step = two_step
        self.robust = robust
        self.gmm_type = gmm_type

        # Results (populated after fit)
        self.results = None
        self.params = None

        # Validate before building helper objects so that a bad column name
        # surfaces as the ValueError raised by _validate_inputs.
        self._validate_inputs()

        # Initialize components. The builder is created from the private copy
        # (not the caller's frame) and is rebuilt in fit() once the lagged
        # columns exist.
        self.instrument_builder = InstrumentBuilder(self.data, id_var, time_var)
        self.estimator = GMMEstimator()
        self.tester = GMMTests()

    def _validate_inputs(self):
        """
        Validate model inputs and reconcile ``two_step`` with ``gmm_type``.

        Raises
        ------
        ValueError
            If a referenced column is missing from the data, a lag is not a
            positive integer, or ``gmm_type`` is not a recognised value.

        Warns
        -----
        UserWarning
            If ``two_step=False`` is overridden by ``gmm_type='two_step'``, if
            an unbalanced panel is combined with many time dummies, or if
            ``collapse`` is False.
        """
        import warnings

        columns = self.data.columns

        # Check dep_var exists
        if self.dep_var not in columns:
            raise ValueError(f"Dependent variable '{self.dep_var}' not found in data")

        # Check id_var and time_var exist
        if self.id_var not in columns:
            raise ValueError(f"ID variable '{self.id_var}' not found in data")
        if self.time_var not in columns:
            raise ValueError(f"Time variable '{self.time_var}' not found in data")

        # Check all regressors exist
        for var in self.exog_vars + self.endogenous_vars + self.predetermined_vars:
            if var not in columns:
                raise ValueError(f"Variable '{var}' not found in data")

        # A lag of zero would make the dependent variable its own regressor and
        # a negative lag would use future values; bool is excluded explicitly
        # because it is a subclass of int.
        for lag in self.lags:
            is_integer = isinstance(lag, (int, np.integer)) and not isinstance(lag, bool)
            if not is_integer or lag < 1:
                raise ValueError(f"lags must be positive integers, got {lag!r}")

        # Check gmm_type is valid
        valid_types = ['one_step', 'two_step', 'iterative']
        if self.gmm_type not in valid_types:
            raise ValueError(f"gmm_type must be one of {valid_types}")

        # gmm_type takes precedence over the two_step flag. 'iterative' leaves
        # the flag as passed.
        if self.gmm_type == 'one_step':
            self.two_step = False
        elif self.gmm_type == 'two_step':
            if not self.two_step:
                # Make the override visible instead of silently ignoring the flag.
                warnings.warn(
                    "two_step=False is ignored because gmm_type='two_step'; "
                    "pass gmm_type='one_step' to request one-step estimation.",
                    UserWarning
                )
            self.two_step = True

        # Check for unbalanced panel + time dummies issue
        if self.time_dummies:
            is_unbalanced, balance_rate = self._check_panel_balance()
            if is_unbalanced:
                n_time_periods = self.data[self.time_var].nunique()
                n_dummies = n_time_periods - 1

                if n_dummies >= 5 and balance_rate < 0.80:
                    warnings.warn(
                        f"\nUnbalanced panel detected ({balance_rate*100:.0f}% balanced) with "
                        f"{n_dummies} time dummies.\n"
                        f"This may result in very few observations being retained.\n\n"
                        f"Recommendations:\n"
                        f"  1. Set time_dummies=False and add a linear trend\n"
                        f"  2. Use only subset of key time dummies\n"
                        f"  3. Ensure collapse=True (currently: {self.collapse})\n\n"
                        f"See examples/gmm/unbalanced_panel_guide.py for details.",
                        UserWarning
                    )

        # Check collapse recommendation
        if not self.collapse:
            warnings.warn(
                "\nRecommendation: Set collapse=True to avoid instrument proliferation.\n"
                "This is especially important for unbalanced panels.",
                UserWarning
            )

    def _check_panel_balance(self):
        """
        Check if panel data is balanced.

        A unit counts as complete when it has as many rows as the unit with
        the most rows; the panel is balanced when every unit is complete.

        Returns
        -------
        tuple
            (is_unbalanced: bool, balance_rate: float), where balance_rate is
            the share of units that are complete.
        """
        obs_per_unit = self.data.groupby(self.id_var).size()
        is_complete = obs_per_unit == obs_per_unit.max()

        return (not is_complete.all()), is_complete.mean()

    @staticmethod
    def _clean_instrument_matrix(Z: np.ndarray,
                                 max_nan_fraction: float = 0.9) -> np.ndarray:
        """
        Drop instrument columns that are mostly missing and zero-fill the rest.

        Parameters
        ----------
        Z : np.ndarray
            Raw two-dimensional instrument matrix, possibly containing NaN
        max_nan_fraction : float
            Columns whose share of NaN entries is at or above this threshold
            are removed (default: 0.9). Columns that are entirely NaN are
            therefore always removed.

        Returns
        -------
        np.ndarray
            New matrix with the retained columns; remaining NaNs are replaced
            by 0 so that an unavailable instrument contributes nothing to the
            moment conditions. The input is not modified.
        """
        Z = np.asarray(Z, dtype=np.float64)
        nan_fraction = np.isnan(Z).mean(axis=0)
        keep = nan_fraction < max_nan_fraction
        return np.nan_to_num(Z[:, keep], nan=0.0)

    def fit(self) -> "GMMResults":
        """
        Estimate the Difference GMM model.

        Returns
        -------
        GMMResults
            Estimation results including coefficients, tests, and diagnostics

        Raises
        ------
        ValueError
            If model specification is invalid (including the case where no
            usable instrument column remains after cleaning)
        RuntimeError
            If estimation fails

        Notes
        -----
        Estimation procedure:
        1. Transform data to first-differences
        2. Generate instruments (lags of levels)
        3. Estimate GMM (one-step, two-step, or iterative)
        4. Compute specification tests
        5. Return results object
        """
        import warnings
        from scipy import stats

        # Step 1: Transform data (also adds the lagged columns to self.data)
        y_diff, X_diff, ids, _times = self._transform_data()

        # Step 1.5: Recreate InstrumentBuilder with updated data (includes lagged vars)
        self.instrument_builder = InstrumentBuilder(self.data, self.id_var, self.time_var)

        # Step 2: Generate instruments
        # NOTE(review): y_diff/X_diff follow the (id, time)-sorted row order;
        # this presumes the instrument rows use the same order - confirm
        # against InstrumentBuilder.
        Z = self._generate_instruments()

        # Step 2.5: Pre-clean instruments for unbalanced panels
        Z_matrix = self._clean_instrument_matrix(Z.Z)
        if Z_matrix.shape[1] == 0:
            raise ValueError(
                "No usable instruments remain after removing columns that are "
                "(almost) entirely missing; check the lag structure and the "
                "data for missing values."
            )

        # Step 3: Estimate GMM
        if self.gmm_type == 'one_step':
            beta, W, residuals = self.estimator.one_step(y_diff, X_diff, Z_matrix)
            vcov = self._compute_one_step_vcov(X_diff, Z_matrix, residuals, W)
            converged = True
        elif self.gmm_type == 'two_step':
            beta, vcov, W, residuals = self.estimator.two_step(
                y_diff, X_diff, Z_matrix, robust=self.robust
            )
            converged = True
        else:  # iterative
            beta, vcov, W, converged = self.estimator.iterative(
                y_diff, X_diff, Z_matrix
            )
            # Force beta into a column so the subtraction stays (n, 1); a 1-D
            # beta would otherwise broadcast against y_diff into an (n, n) array.
            residuals = y_diff - X_diff @ np.reshape(beta, (-1, 1))

        # Step 4: Compute standard errors and t-statistics
        beta = beta.flatten()  # Ensure beta is 1D
        std_errors = np.sqrt(np.diag(vcov))
        tvalues = beta / std_errors
        # Survival function instead of 1 - cdf: keeps precision in the far
        # tails, where 1 - cdf rounds to exactly zero.
        pvalues = 2 * stats.norm.sf(np.abs(tvalues))

        # Step 5: Get variable names
        var_names = self._get_variable_names()

        # Step 6: Compute specification tests
        n_params = len(beta)
        hansen = self.tester.hansen_j_test(residuals, Z_matrix, W, n_params)
        sargan = self.tester.sargan_test(residuals, Z_matrix, n_params)

        # For AR tests, we need clean data without NaN
        residuals_flat = residuals.ravel()
        valid_mask = ~np.isnan(residuals_flat)
        ar1 = self.tester.arellano_bond_ar_test(
            residuals_flat[valid_mask], ids[valid_mask], order=1
        )
        ar2 = self.tester.arellano_bond_ar_test(
            residuals_flat[valid_mask], ids[valid_mask], order=2
        )

        # Step 7: Create results object
        self.results = GMMResults(
            params=pd.Series(beta, index=var_names),
            std_errors=pd.Series(std_errors, index=var_names),
            tvalues=pd.Series(tvalues, index=var_names),
            pvalues=pd.Series(pvalues, index=var_names),
            nobs=int(np.sum(valid_mask)),
            n_groups=self.instrument_builder.n_groups,
            n_instruments=Z_matrix.shape[1],  # Use actual number of instruments after cleaning
            n_params=n_params,
            hansen_j=hansen,
            sargan=sargan,
            ar1_test=ar1,
            ar2_test=ar2,
            vcov=vcov,
            weight_matrix=W,
            converged=converged,
            two_step=self.two_step,
            windmeijer_corrected=self.robust and self.two_step,
            model_type='difference',
            transformation='fd',
            residuals=residuals
        )

        self.params = self.results.params

        # Post-estimation warning for low observation retention
        n_total = len(self.data)
        retention_rate = self.results.nobs / n_total if n_total else 0.0
        if retention_rate < 0.30:
            warnings.warn(
                f"\nLow observation retention: {self.results.nobs}/{n_total} "
                f"({retention_rate*100:.1f}%).\n"
                f"Many observations were dropped due to insufficient valid instruments.\n\n"
                f"Recommendations:\n"
                f"  1. Simplify specification (fewer variables/lags)\n"
                f"  2. Set time_dummies=False (or use linear trend)\n"
                f"  3. Ensure collapse=True (currently: {self.collapse})\n"
                f"  4. Check data for excessive missing values\n\n"
                f"See examples/gmm/unbalanced_panel_guide.py for detailed guidance.",
                UserWarning
            )

        return self.results

    def _transform_data(self) -> tuple:
        """
        Transform data to first-differences.

        Rows are sorted by unit and time first. Lags and differences are then
        taken over consecutive rows within each unit, so a gap in the time
        variable is treated like an adjacent period. As a side effect, one
        ``{dep_var}_L{lag}`` column per lag is added to ``self.data`` for use
        during instrument generation.

        Returns
        -------
        y_diff : np.ndarray
            Differenced dependent variable, shape (n, 1)
        X_diff : np.ndarray
            Differenced regressors, one column per regressor in the order:
            lagged dependent variable(s), exogenous, endogenous,
            predetermined, time dummies
        ids : np.ndarray
            ID variable (sorted order)
        times : np.ndarray
            Time variable (sorted order)

        Raises
        ------
        ValueError
            If the specification contains no regressors at all
        """
        # Sort data
        df = self.data.sort_values([self.id_var, self.time_var]).copy()

        # Create lagged dependent variable
        regressors = []
        for lag in self.lags:
            lag_name = f'{self.dep_var}_L{lag}'
            df[lag_name] = df.groupby(self.id_var)[self.dep_var].shift(lag)
            # Also add to self.data for instrument generation (aligned on index)
            self.data[lag_name] = df[lag_name]
            regressors.append(lag_name)

        # Build the rest of the regressor list
        regressors.extend(self.exog_vars)
        regressors.extend(self.endogenous_vars)
        regressors.extend(self.predetermined_vars)

        # Add time dummies if requested
        if self.time_dummies:
            # dtype=float: newer pandas returns boolean dummies by default,
            # and differencing must yield numeric -1/0/1 values.
            dummies = pd.get_dummies(
                df[self.time_var], prefix='year', drop_first=True, dtype=float
            )
            for col in dummies.columns:
                df[col] = dummies[col]
                regressors.append(col)

        if not regressors:
            raise ValueError(
                "Model has no regressors: specify at least one lag, variable, "
                "or time dummy."
            )

        # First-difference transformation; one groupby is reused for every
        # column now that the frame is no longer modified.
        by_unit = df.groupby(self.id_var)
        y_diff = by_unit[self.dep_var].diff().values.reshape(-1, 1).astype(np.float64)
        X_diff = np.column_stack(
            [by_unit[var].diff().values for var in regressors]
        ).astype(np.float64)

        ids = df[self.id_var].values
        times = df[self.time_var].values

        return y_diff, X_diff, ids, times

    def _generate_instruments(self) -> "InstrumentSet":
        """
        Generate instrument matrix.

        Instrument sets are built and combined in this order: lagged
        dependent variable(s), strictly exogenous, predetermined, endogenous.

        Returns
        -------
        InstrumentSet
            Combined instrument set
        """
        builder = self.instrument_builder

        # GMM-style specifications as (variable, minimum lag) pairs.
        # For the regressor L{lag}.y the first usable level is y_{t-lag-1}
        # (e.g. for L1.y use y_{t-2}, y_{t-3}, ...).
        dep_specs = [(self.dep_var, lag + 1) for lag in self.lags]
        # Predetermined variables: levels dated t-2 and earlier.
        predetermined_specs = [(var, 2) for var in self.predetermined_vars]
        # Endogenous variables: levels dated t-3 and earlier.
        endogenous_specs = [(var, 3) for var in self.endogenous_vars]

        def gmm_style(var, min_lag):
            return builder.create_gmm_style_instruments(
                var=var,
                min_lag=min_lag,
                max_lag=99,  # All available lags
                equation='diff',
                collapse=self.collapse
            )

        instrument_sets = [gmm_style(var, min_lag) for var, min_lag in dep_specs]

        # Strictly exogenous variables: IV-style, current period only
        # (min_lag = max_lag = 0).
        instrument_sets.extend(
            builder.create_iv_style_instruments(
                var=var,
                min_lag=0,
                max_lag=0,
                equation='diff'
            )
            for var in self.exog_vars
        )

        instrument_sets.extend(
            gmm_style(var, min_lag)
            for var, min_lag in predetermined_specs + endogenous_specs
        )

        # Combine all instruments
        return builder.combine_instruments(*instrument_sets)

    def _compute_one_step_vcov(self,
                               X: np.ndarray,
                               Z: np.ndarray,
                               residuals: np.ndarray,
                               W: np.ndarray) -> np.ndarray:
        """
        Compute variance-covariance matrix for one-step GMM.

        Implements the sandwich
        ``(X'Z W Z'X)^{-1} (X'Z W Ω W Z'X) (X'Z W Z'X)^{-1}`` with
        ``Ω = Z' diag(ε²) Z``, using only rows whose residual is not NaN.

        NOTE(review): Ω is built observation by observation (no clustering by
        unit); confirm this is the intended robust estimator for differenced
        errors.

        Parameters
        ----------
        X : np.ndarray
            Regressors, one row per observation
        Z : np.ndarray
            Instruments, one row per observation
        residuals : np.ndarray
            Residuals, shape (n,) or (n, 1); NaN marks unusable rows
        W : np.ndarray
            Weight matrix

        Returns
        -------
        np.ndarray
            Variance-covariance matrix
        """
        # Ensure arrays are float64
        X = np.asarray(X, dtype=np.float64)
        Z = np.asarray(Z, dtype=np.float64)
        resid = np.asarray(residuals, dtype=np.float64).ravel()
        W = np.asarray(W, dtype=np.float64)

        # Remove missing values
        valid_mask = ~np.isnan(resid)
        X_clean = X[valid_mask]
        Z_clean = Z[valid_mask]
        resid_sq = resid[valid_mask] ** 2

        XtZ = X_clean.T @ Z_clean
        ZtX = XtZ.T

        A = XtZ @ W @ ZtX
        try:
            A_inv = np.linalg.inv(A)
        except np.linalg.LinAlgError:
            # Fall back to the pseudo-inverse when A is singular.
            A_inv = np.linalg.pinv(A)

        # Z' diag(ε²) Z via row scaling: same result as forming the n x n
        # diagonal matrix, without allocating it.
        ZtOmegaZ = (Z_clean * resid_sq[:, np.newaxis]).T @ Z_clean

        # Robust variance
        B = XtZ @ W @ ZtOmegaZ @ W @ ZtX
        return A_inv @ B @ A_inv

    def _get_variable_names(self) -> List[str]:
        """
        Get list of variable names in order.

        The order matches the regressor columns built by ``_transform_data``:
        ``L{lag}.{dep_var}`` for each lag, then exogenous, endogenous and
        predetermined variables, then ``year_{t}`` for every time period
        except the first (sorted) one when time dummies are enabled.

        Returns
        -------
        List[str]
            Variable names
        """
        # Lagged dependent variable
        var_names = [f'L{lag}.{self.dep_var}' for lag in self.lags]

        # Other variables
        var_names.extend(self.exog_vars)
        var_names.extend(self.endogenous_vars)
        var_names.extend(self.predetermined_vars)

        # Time dummies
        if self.time_dummies:
            time_periods = sorted(self.data[self.time_var].unique())[1:]  # Drop first
            var_names.extend(f'year_{t}' for t in time_periods)

        return var_names

    def summary(self) -> str:
        """
        Return the model summary.

        Returns
        -------
        str
            Summary string produced by the results object

        Raises
        ------
        ValueError
            If model has not been fit yet
        """
        if self.results is None:
            raise ValueError("Model has not been fit yet. Call fit() first.")

        return self.results.summary(title='Difference GMM (Arellano-Bond)')

    def __repr__(self) -> str:
        """Representation of the model."""
        status = "fitted" if self.results is not None else "not fitted"
        return (f"DifferenceGMM(dep_var='{self.dep_var}', lags={self.lags}, "
                f"status='{status}')")