panelbox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. panelbox/__init__.py +67 -0
  2. panelbox/__version__.py +14 -0
  3. panelbox/cli/__init__.py +0 -0
  4. panelbox/cli/{commands}/__init__.py +0 -0
  5. panelbox/core/__init__.py +0 -0
  6. panelbox/core/base_model.py +164 -0
  7. panelbox/core/formula_parser.py +318 -0
  8. panelbox/core/panel_data.py +387 -0
  9. panelbox/core/results.py +366 -0
  10. panelbox/datasets/__init__.py +0 -0
  11. panelbox/datasets/{data}/__init__.py +0 -0
  12. panelbox/gmm/__init__.py +65 -0
  13. panelbox/gmm/difference_gmm.py +645 -0
  14. panelbox/gmm/estimator.py +562 -0
  15. panelbox/gmm/instruments.py +580 -0
  16. panelbox/gmm/results.py +550 -0
  17. panelbox/gmm/system_gmm.py +621 -0
  18. panelbox/gmm/tests.py +535 -0
  19. panelbox/models/__init__.py +11 -0
  20. panelbox/models/dynamic/__init__.py +0 -0
  21. panelbox/models/iv/__init__.py +0 -0
  22. panelbox/models/static/__init__.py +13 -0
  23. panelbox/models/static/fixed_effects.py +516 -0
  24. panelbox/models/static/pooled_ols.py +298 -0
  25. panelbox/models/static/random_effects.py +512 -0
  26. panelbox/report/__init__.py +61 -0
  27. panelbox/report/asset_manager.py +410 -0
  28. panelbox/report/css_manager.py +472 -0
  29. panelbox/report/exporters/__init__.py +15 -0
  30. panelbox/report/exporters/html_exporter.py +440 -0
  31. panelbox/report/exporters/latex_exporter.py +510 -0
  32. panelbox/report/exporters/markdown_exporter.py +446 -0
  33. panelbox/report/renderers/__init__.py +11 -0
  34. panelbox/report/renderers/static/__init__.py +0 -0
  35. panelbox/report/renderers/static_validation_renderer.py +341 -0
  36. panelbox/report/report_manager.py +502 -0
  37. panelbox/report/template_manager.py +337 -0
  38. panelbox/report/transformers/__init__.py +0 -0
  39. panelbox/report/transformers/static/__init__.py +0 -0
  40. panelbox/report/validation_transformer.py +449 -0
  41. panelbox/standard_errors/__init__.py +0 -0
  42. panelbox/templates/__init__.py +0 -0
  43. panelbox/templates/assets/css/base_styles.css +382 -0
  44. panelbox/templates/assets/css/report_components.css +747 -0
  45. panelbox/templates/assets/js/tab-navigation.js +161 -0
  46. panelbox/templates/assets/js/utils.js +276 -0
  47. panelbox/templates/common/footer.html +24 -0
  48. panelbox/templates/common/header.html +44 -0
  49. panelbox/templates/common/meta.html +5 -0
  50. panelbox/templates/validation/interactive/index.html +272 -0
  51. panelbox/templates/validation/interactive/partials/charts.html +58 -0
  52. panelbox/templates/validation/interactive/partials/methodology.html +201 -0
  53. panelbox/templates/validation/interactive/partials/overview.html +146 -0
  54. panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
  55. panelbox/templates/validation/interactive/partials/test_results.html +231 -0
  56. panelbox/utils/__init__.py +0 -0
  57. panelbox/utils/formatting.py +172 -0
  58. panelbox/utils/matrix_ops.py +233 -0
  59. panelbox/utils/statistical.py +173 -0
  60. panelbox/validation/__init__.py +58 -0
  61. panelbox/validation/base.py +175 -0
  62. panelbox/validation/cointegration/__init__.py +0 -0
  63. panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
  64. panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
  65. panelbox/validation/cross_sectional_dependence/frees.py +297 -0
  66. panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
  67. panelbox/validation/heteroskedasticity/__init__.py +13 -0
  68. panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
  69. panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
  70. panelbox/validation/heteroskedasticity/white.py +208 -0
  71. panelbox/validation/instruments/__init__.py +0 -0
  72. panelbox/validation/robustness/__init__.py +0 -0
  73. panelbox/validation/serial_correlation/__init__.py +13 -0
  74. panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
  75. panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
  76. panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
  77. panelbox/validation/specification/__init__.py +16 -0
  78. panelbox/validation/specification/chow.py +273 -0
  79. panelbox/validation/specification/hausman.py +264 -0
  80. panelbox/validation/specification/mundlak.py +331 -0
  81. panelbox/validation/specification/reset.py +273 -0
  82. panelbox/validation/unit_root/__init__.py +0 -0
  83. panelbox/validation/validation_report.py +257 -0
  84. panelbox/validation/validation_suite.py +401 -0
  85. panelbox-0.2.0.dist-info/METADATA +337 -0
  86. panelbox-0.2.0.dist-info/RECORD +90 -0
  87. panelbox-0.2.0.dist-info/WHEEL +5 -0
  88. panelbox-0.2.0.dist-info/entry_points.txt +2 -0
  89. panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
  90. panelbox-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,580 @@
1
+ """
2
+ Instrument Generation for GMM
3
+ ==============================
4
+
5
+ Tools for generating and managing instrument matrices for GMM estimation.
6
+
7
+ Classes
8
+ -------
9
+ InstrumentSet : Container for instrument matrices
10
+ InstrumentBuilder : Generates instrument matrices following xtabond2 rules
11
+
12
+ References
13
+ ----------
14
+ .. [1] Roodman, D. (2009). "How to do xtabond2: An Introduction to Difference
15
+ and System GMM in Stata." Stata Journal, 9(1), 86-136.
16
+ """
17
+
18
+ from dataclasses import dataclass, field
19
+ from typing import List, Optional, Dict, Tuple
20
+ import numpy as np
21
+ import pandas as pd
22
+ from enum import Enum
23
+
24
+
25
class InstrumentStyle(Enum):
    """Layouts in which an instrument matrix can be generated."""

    # IV-style: a single column for each lag distance.
    IV = "iv"
    # GMM-style: a dedicated column for each time period.
    GMM = "gmm"
29
+
30
+
31
class EquationType(Enum):
    """Equations for which instruments can be generated."""

    # Equation in first differences.
    DIFF = "diff"
    # Equation in levels.
    LEVEL = "level"
35
+
36
+
37
@dataclass
class InstrumentSet:
    """
    Container for instrument matrices.

    Attributes
    ----------
    Z : np.ndarray
        Instrument matrix (T*N x n_instruments). A 1-D array is interpreted
        as a single instrument column, which is how ``np.column_stack``
        treats it when instrument sets are combined.
    variable_names : List[str]
        Names of instrumented variables
    instrument_names : List[str]
        Names of instrument columns
    equation : str
        Equation type ('diff' or 'level')
    style : str
        Instrument style ('iv' or 'gmm')
    collapsed : bool
        Whether instruments are collapsed
    """

    Z: np.ndarray
    variable_names: List[str] = field(default_factory=list)
    instrument_names: List[str] = field(default_factory=list)
    equation: str = 'diff'
    style: str = 'gmm'
    collapsed: bool = False

    def _dims(self) -> Tuple[int, int]:
        """Return ``(n_obs, n_instruments)`` without assuming ``Z`` is 2-D."""
        if self.Z is None:
            return 0, 0
        shape = np.shape(self.Z)
        if not shape:
            # A scalar stacks into a 1 x 1 matrix.
            return 1, 1
        if len(shape) == 1:
            # Indexing shape[1] on a vector would raise IndexError (and make
            # __repr__ unusable); a vector is one instrument column.
            return int(shape[0]), 1
        return int(shape[0]), int(shape[1])

    @property
    def n_instruments(self) -> int:
        """Number of instruments (columns of ``Z``); 0 when ``Z`` is None."""
        return self._dims()[1]

    @property
    def n_obs(self) -> int:
        """Number of observations (rows of ``Z``); 0 when ``Z`` is None."""
        return self._dims()[0]

    def __repr__(self) -> str:
        return (f"InstrumentSet(n_instruments={self.n_instruments}, "
                f"n_obs={self.n_obs}, equation='{self.equation}', "
                f"style='{self.style}', collapsed={self.collapsed})")
79
+
80
+
81
class InstrumentBuilder:
    """
    Generates instrument matrices for GMM estimation.

    Follows Stata xtabond2 instrument generation rules.

    Lagged observations are located by time *value* (``time - lag``) within
    the same cross-sectional unit, so the time variable must support
    subtraction of an integer.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data in long format
    id_var : str
        Name of cross-sectional identifier
    time_var : str
        Name of time variable

    Attributes
    ----------
    data : pd.DataFrame
        Panel data, sorted by ``id_var`` then ``time_var``
    id_var : str
        Cross-sectional identifier
    time_var : str
        Time variable
    groups : np.ndarray
        Unique cross-sectional identifiers
    n_groups : int
        Number of cross-sectional units
    time_periods : np.ndarray
        Unique time periods (sorted)
    n_periods : int
        Number of unique time periods

    Examples
    --------
    >>> builder = InstrumentBuilder(data, id_var='id', time_var='year')
    >>> # IV-style instruments
    >>> Z_iv = builder.create_iv_style_instruments('x', min_lag=2, max_lag=4)
    >>> # GMM-style instruments
    >>> Z_gmm = builder.create_gmm_style_instruments('y', min_lag=2, max_lag=99)
    >>> # Collapsed GMM-style
    >>> Z_collapsed = builder.create_gmm_style_instruments(
    ...     'y', min_lag=2, max_lag=99, collapse=True
    ... )
    """

    def __init__(self,
                 data: pd.DataFrame,
                 id_var: str,
                 time_var: str):
        """Initialize instrument builder."""
        self.id_var = id_var
        self.time_var = time_var

        # sort_values returns a new frame, so the caller's data is untouched.
        self.data = data.sort_values([id_var, time_var])

        # Extract groups and time periods
        self.groups = self.data[id_var].unique()
        self.n_groups = len(self.groups)
        self.time_periods = np.sort(self.data[time_var].unique())
        self.n_periods = len(self.time_periods)

    # ------------------------------------------------------------------
    # Shared lag-lookup helpers
    # ------------------------------------------------------------------

    def _panel_arrays(self, var: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Return (values as float, unit ids, times) in the row order of ``self.data``."""
        values = np.asarray(self.data[var].values, dtype=float)
        ids = self.data[self.id_var].values
        times = self.data[self.time_var].values
        return values, ids, times

    @staticmethod
    def _index_rows(ids: np.ndarray,
                    times: np.ndarray) -> Dict[Tuple[object, object], int]:
        """Map each (id, time) pair to the position of its first row."""
        row_of: Dict[Tuple[object, object], int] = {}
        for pos, key in enumerate(zip(ids.tolist(), times.tolist())):
            # setdefault keeps the first occurrence when a pair is duplicated.
            row_of.setdefault(key, pos)
        return row_of

    @staticmethod
    def _lagged_rows(row_of: Dict[Tuple[object, object], int],
                     ids: np.ndarray,
                     times: np.ndarray,
                     lag: int) -> np.ndarray:
        """Return, per row, the position of the same unit at ``time - lag`` (-1 if absent)."""
        wanted = zip(ids.tolist(), (times - lag).tolist())
        return np.fromiter(
            (row_of.get(key, -1) for key in wanted),
            dtype=np.intp,
            count=len(ids),
        )

    def _lag_column(self,
                    values: np.ndarray,
                    row_of: Dict[Tuple[object, object], int],
                    ids: np.ndarray,
                    times: np.ndarray,
                    lag: int,
                    equation: str) -> np.ndarray:
        """
        Build one instrument column for a single lag distance.

        For ``equation == 'diff'`` the column holds the lagged level
        ``x_{i,t-lag}``; for any other value it holds the lagged difference
        ``x_{i,t-lag} - x_{i,t-lag-1}``. Rows whose required lagged
        observations do not exist stay NaN.
        """
        column = np.full(len(values), np.nan)
        source = self._lagged_rows(row_of, ids, times, lag)
        usable = source >= 0
        if equation == 'diff':
            column[usable] = values[source[usable]]
        else:
            # Level equation: the difference needs both t-lag and t-lag-1.
            previous = self._lagged_rows(row_of, ids, times, lag + 1)
            usable &= previous >= 0
            column[usable] = values[source[usable]] - values[previous[usable]]
        return column

    def _lag_matrix(self, var: str, lags, equation: str) -> np.ndarray:
        """Return an (n_obs x len(lags)) matrix with one lag column per entry of ``lags``."""
        values, ids, times = self._panel_arrays(var)
        lags = list(lags)
        matrix = np.full((len(values), len(lags)), np.nan)
        if matrix.size == 0:
            return matrix

        # One dictionary replaces a full scan of the panel for every row.
        row_of = self._index_rows(ids, times)
        for col_idx, lag in enumerate(lags):
            matrix[:, col_idx] = self._lag_column(
                values, row_of, ids, times, lag, equation
            )
        return matrix

    # ------------------------------------------------------------------
    # Public builders
    # ------------------------------------------------------------------

    def create_iv_style_instruments(self,
                                    var: str,
                                    min_lag: int,
                                    max_lag: int,
                                    equation: str = 'diff') -> "InstrumentSet":
        """
        Create IV-style instruments (one column per lag).

        IV-style instruments create one column for each lag, with observations
        placed appropriately for each time period.

        Parameters
        ----------
        var : str
            Variable to instrument
        min_lag : int
            Minimum lag to use (e.g., 2 means t-2)
        max_lag : int
            Maximum lag to use (e.g., 4 means t-4)
        equation : str
            'diff' for differenced equation, 'level' for level equation

        Returns
        -------
        InstrumentSet
            IV-style instrument set. If ``max_lag < min_lag`` the set has
            zero columns.

        Examples
        --------
        >>> # gmm(x, lag(2 4)) in IV-style creates 3 columns: x_{t-2}, x_{t-3}, x_{t-4}
        >>> Z = builder.create_iv_style_instruments('x', min_lag=2, max_lag=4)

        Notes
        -----
        For equation='diff', instruments levels: x_{i,t-k}
        For equation='level', instruments differences: Δx_{i,t-k}
        """
        lags = range(min_lag, max_lag + 1)
        prefix = '' if equation == 'diff' else 'D.'
        instrument_names = [f"{prefix}{var}_L{lag}" for lag in lags]

        return InstrumentSet(
            Z=self._lag_matrix(var, lags, equation),
            variable_names=[var],
            instrument_names=instrument_names,
            equation=equation,
            style='iv',
            collapsed=False
        )

    def create_gmm_style_instruments(self,
                                     var: str,
                                     min_lag: int,
                                     max_lag: Optional[int] = None,
                                     equation: str = 'diff',
                                     collapse: bool = False) -> "InstrumentSet":
        """
        Create GMM-style instruments (separate column per time period).

        GMM-style instruments create a separate column for each available lag
        in each time period, leading to instrument proliferation unless collapsed.

        Parameters
        ----------
        var : str
            Variable to instrument
        min_lag : int
            Minimum lag to use
        max_lag : int, optional
            Maximum lag to use (None = all available)
        equation : str
            'diff' for differenced equation, 'level' for level equation
        collapse : bool
            Whether to collapse instruments to avoid proliferation

        Returns
        -------
        InstrumentSet
            GMM-style instrument set

        Examples
        --------
        >>> # Without collapse: Creates many columns (one per time*lag)
        >>> Z = builder.create_gmm_style_instruments('x', min_lag=2, max_lag=99)
        >>> # With collapse: Creates one column per lag
        >>> Z_collapsed = builder.create_gmm_style_instruments(
        ...     'x', min_lag=2, max_lag=99, collapse=True
        ... )

        Notes
        -----
        Collapse mode (Roodman 2009 recommendation):
        - Reduces instrument count from O(T²) to O(T)
        - Uses one column per lag distance instead of one per (period, lag)
        - Helps avoid overfitting and weak instruments
        """
        if collapse:
            return self._create_gmm_collapsed(var, min_lag, max_lag, equation)
        return self._create_gmm_standard(var, min_lag, max_lag, equation)

    def _create_gmm_standard(self,
                             var: str,
                             min_lag: int,
                             max_lag: Optional[int],
                             equation: str) -> "InstrumentSet":
        """
        Create GMM-style instruments without collapse.

        For the period at sorted position ``k`` in ``self.time_periods`` the
        lags ``min_lag .. min(max_lag, k)`` each get their own column, which
        is non-missing only on rows observed in that period. Columns are
        ordered by period, then by lag.
        """
        values, ids, times = self._panel_arrays(var)
        n_obs = len(values)
        row_of = self._index_rows(ids, times)

        # A lag column does not depend on the period, so compute each once.
        lag_columns: Dict[int, np.ndarray] = {}
        Z_list = []
        instrument_names = []

        for t_idx, t in enumerate(self.time_periods):
            # At position t_idx at most t_idx earlier periods exist.
            last_lag = t_idx if max_lag is None else min(max_lag, t_idx)
            if last_lag < min_lag:
                continue

            in_period = times == t
            for lag in range(min_lag, last_lag + 1):
                if lag not in lag_columns:
                    lag_columns[lag] = self._lag_column(
                        values, row_of, ids, times, lag, equation
                    )
                # Keep the lagged value only for rows observed at time t.
                Z_list.append(np.where(in_period, lag_columns[lag], np.nan))
                instrument_names.append(f"{var}_t{t}_L{lag}")

        # Stack into matrix
        Z = np.column_stack(Z_list) if Z_list else np.empty((n_obs, 0))

        return InstrumentSet(
            Z=Z,
            variable_names=[var],
            instrument_names=instrument_names,
            equation=equation,
            style='gmm',
            collapsed=False
        )

    def _analyze_lag_availability(self,
                                  var: str,
                                  min_lag: int,
                                  max_lag: int,
                                  min_coverage: float = 0.10) -> List[int]:
        """
        Analyze which lags have sufficient data coverage.

        For unbalanced panels, some lags may be available for very few
        observations. This method identifies lags with sufficient coverage.

        Parameters
        ----------
        var : str
            Variable to analyze
        min_lag : int
            Minimum lag to consider
        max_lag : int
            Maximum lag to consider
        min_coverage : float
            Minimum fraction of observations that must have valid lagged values
            (default: 0.10 = 10%)

        Returns
        -------
        List[int]
            Lags with sufficient coverage (empty for an empty panel)
        """
        values, ids, times = self._panel_arrays(var)
        n_obs = len(values)
        if n_obs == 0:
            # No rows means no coverage; also avoids dividing by zero below.
            return []

        row_of = self._index_rows(ids, times)
        valid_lags = []

        for lag in range(min_lag, max_lag + 1):
            source = self._lagged_rows(row_of, ids, times, lag)
            found = source[source >= 0]
            # A lag counts for a row only if the lagged value is non-missing.
            n_valid = int(np.count_nonzero(~np.isnan(values[found])))
            if n_valid / n_obs >= min_coverage:
                valid_lags.append(lag)

        return valid_lags

    def _create_gmm_collapsed(self,
                              var: str,
                              min_lag: int,
                              max_lag: Optional[int],
                              equation: str) -> "InstrumentSet":
        """
        Create collapsed GMM-style instruments.

        Collapse creates one column per lag distance; each row carries its own
        lagged value regardless of the period it belongs to. This dramatically
        reduces instrument count.

        ``max_lag`` is capped at ``n_periods - 1``. For unbalanced panels,
        lags with very low data coverage (< 10% of observations) are dropped;
        if that removes every lag, ``min_lag`` (and ``min_lag + 1`` when
        allowed) are used anyway and a ``UserWarning`` is issued.
        """
        import warnings

        # Maximum possible lag is n_periods - 1
        longest_lag = self.n_periods - 1
        max_lag = longest_lag if max_lag is None else min(max_lag, longest_lag)

        # Filter lags with sufficient data coverage (helps unbalanced panels)
        possible_lags = self._analyze_lag_availability(
            var, min_lag, max_lag, min_coverage=0.10
        )

        if not possible_lags and min_lag <= max_lag:
            possible_lags = [min_lag]
            if min_lag + 1 <= max_lag:
                possible_lags.append(min_lag + 1)
            # Name exactly the lags that are used (the old text could say
            # "lags 2 and 2" when only one lag was available).
            noun = 'lags' if len(possible_lags) > 1 else 'lag'
            used = ' and '.join(str(lag) for lag in possible_lags)
            warnings.warn(
                f"No lags for variable '{var}' meet the 10% coverage threshold. "
                f"Using {noun} {used} anyway.",
                UserWarning
            )

        instrument_names = [f"{var}_L{lag}_collapsed" for lag in possible_lags]

        return InstrumentSet(
            Z=self._lag_matrix(var, possible_lags, equation),
            variable_names=[var],
            instrument_names=instrument_names,
            equation=equation,
            style='gmm',
            collapsed=True
        )

    def combine_instruments(self, *instrument_sets: "InstrumentSet") -> "InstrumentSet":
        """
        Combine multiple instrument sets.

        Matrices are concatenated column-wise in the order given. The result
        takes its ``equation`` from the first set, and is always labelled
        ``style='mixed'`` and ``collapsed=False``.

        Parameters
        ----------
        *instrument_sets : InstrumentSet
            Instrument sets to combine

        Returns
        -------
        InstrumentSet
            Combined instrument set

        Raises
        ------
        ValueError
            If no instrument set is given, or if the sets do not all have the
            same number of rows.

        Examples
        --------
        >>> Z_gmm = builder.create_gmm_style_instruments('y', 2, 99, collapse=True)
        >>> Z_iv = builder.create_iv_style_instruments('x', 2, 4)
        >>> Z_combined = builder.combine_instruments(Z_gmm, Z_iv)
        """
        if not instrument_sets:
            raise ValueError("Must provide at least one instrument set")

        matrices = [np.atleast_1d(np.asarray(iset.Z)) for iset in instrument_sets]

        # Fail with a readable message instead of NumPy's generic shape error.
        row_counts = sorted({m.shape[0] for m in matrices})
        if len(row_counts) > 1:
            raise ValueError(
                "Instrument sets must have the same number of observations; "
                f"got row counts {row_counts}"
            )

        # Combine matrices
        Z_combined = np.column_stack(matrices)

        # Combine names
        var_names = []
        inst_names = []
        for iset in instrument_sets:
            var_names.extend(iset.variable_names)
            inst_names.extend(iset.instrument_names)

        return InstrumentSet(
            Z=Z_combined,
            variable_names=var_names,
            instrument_names=inst_names,
            equation=instrument_sets[0].equation,
            style='mixed',
            collapsed=False
        )

    def instrument_count_analysis(self, Z: "InstrumentSet") -> pd.DataFrame:
        """
        Analyze instrument count.

        Parameters
        ----------
        Z : InstrumentSet
            Instrument set to analyze

        Returns
        -------
        pd.DataFrame
            Single-column frame indexed by statistic name. 'Instrument ratio'
            is instruments per group (NaN when the panel has no groups), and
            'Warning' flags an instrument count larger than the group count.

        Examples
        --------
        >>> Z = builder.create_gmm_style_instruments('y', 2, 99)
        >>> analysis = builder.instrument_count_analysis(Z)
        >>> print(analysis)
        """
        n_instruments = Z.n_instruments

        # An empty panel has no groups; report NaN rather than dividing by zero.
        if self.n_groups:
            ratio = n_instruments / self.n_groups
        else:
            ratio = float('nan')

        analysis = {
            'Total instruments': n_instruments,
            'Observations': Z.n_obs,
            'Groups': self.n_groups,
            'Instrument ratio': ratio,
            'Style': Z.style,
            'Collapsed': Z.collapsed,
            'Variables': ', '.join(Z.variable_names)
        }

        # Warning if too many instruments
        if n_instruments > self.n_groups:
            analysis['Warning'] = f"Too many instruments ({n_instruments} > {self.n_groups} groups)"
        else:
            analysis['Warning'] = 'OK'

        return pd.DataFrame([analysis]).T

    def get_valid_obs_mask(self, Z: "InstrumentSet") -> np.ndarray:
        """
        Get mask of valid observations (non-missing instruments).

        Parameters
        ----------
        Z : InstrumentSet
            Instrument set

        Returns
        -------
        np.ndarray
            Boolean mask of valid observations; a row is valid when at least
            one of its instruments is non-missing.
        """
        all_missing = np.isnan(Z.Z).all(axis=1)
        return ~all_missing