panelbox-0.2.0-py3-none-any.whl → panelbox-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. panelbox/__init__.py +41 -0
  2. panelbox/__version__.py +13 -1
  3. panelbox/core/formula_parser.py +9 -2
  4. panelbox/core/panel_data.py +1 -1
  5. panelbox/datasets/__init__.py +39 -0
  6. panelbox/datasets/load.py +334 -0
  7. panelbox/gmm/difference_gmm.py +63 -15
  8. panelbox/gmm/estimator.py +46 -5
  9. panelbox/gmm/system_gmm.py +136 -21
  10. panelbox/models/static/__init__.py +4 -0
  11. panelbox/models/static/between.py +434 -0
  12. panelbox/models/static/first_difference.py +494 -0
  13. panelbox/models/static/fixed_effects.py +80 -11
  14. panelbox/models/static/pooled_ols.py +80 -11
  15. panelbox/models/static/random_effects.py +52 -10
  16. panelbox/standard_errors/__init__.py +119 -0
  17. panelbox/standard_errors/clustered.py +386 -0
  18. panelbox/standard_errors/comparison.py +528 -0
  19. panelbox/standard_errors/driscoll_kraay.py +386 -0
  20. panelbox/standard_errors/newey_west.py +324 -0
  21. panelbox/standard_errors/pcse.py +358 -0
  22. panelbox/standard_errors/robust.py +324 -0
  23. panelbox/standard_errors/utils.py +390 -0
  24. panelbox/validation/__init__.py +6 -0
  25. panelbox/validation/robustness/__init__.py +51 -0
  26. panelbox/validation/robustness/bootstrap.py +933 -0
  27. panelbox/validation/robustness/checks.py +143 -0
  28. panelbox/validation/robustness/cross_validation.py +538 -0
  29. panelbox/validation/robustness/influence.py +364 -0
  30. panelbox/validation/robustness/jackknife.py +457 -0
  31. panelbox/validation/robustness/outliers.py +529 -0
  32. panelbox/validation/robustness/sensitivity.py +809 -0
  33. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
  34. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
  35. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
  36. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
  37. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
panelbox/gmm/system_gmm.py

@@ -248,27 +248,51 @@ class SystemGMM(DifferenceGMM):
         # Step 4: Stack equations
         y_stacked = np.vstack([y_diff, y_level])
         X_stacked = np.vstack([X_diff, X_level])
-        Z_stacked = self._stack_instruments(Z_diff, Z_level)
+        Z_stacked_raw = self._stack_instruments(Z_diff, Z_level)
+
+        # Clean instrument matrix before estimation
+        # Remove observations and columns with NaNs
+        valid_mask = self._get_valid_mask_system(y_stacked, X_stacked, Z_stacked_raw)
+        y_stacked_clean = y_stacked[valid_mask]
+        X_stacked_clean = X_stacked[valid_mask]
+        Z_stacked_clean = Z_stacked_raw[valid_mask]
+
+        # Remove instrument columns with remaining NaNs
+        valid_instrument_cols = ~np.isnan(Z_stacked_clean).any(axis=0)
+        if not valid_instrument_cols.any():
+            raise ValueError("No valid instrument columns in System GMM. Check data quality.")
+        Z_stacked_clean = Z_stacked_clean[:, valid_instrument_cols]
+
+        # For tests later, keep track of the full stacked residuals
+        residuals_full = np.full_like(y_stacked, np.nan)

         # Repeat ids and times for stacked system
         ids_stacked = np.concatenate([ids, ids])
         times_stacked = np.concatenate([times, times])

-        # Step 5: Estimate GMM on stacked system
+        # Step 5: Estimate GMM on stacked system (using cleaned data)
         if self.gmm_type == 'one_step':
-            beta, W, residuals = self.estimator.one_step(y_stacked, X_stacked, Z_stacked)
-            vcov = self._compute_one_step_vcov(X_stacked, Z_stacked, residuals, W)
+            beta, W, residuals_clean = self.estimator.one_step(
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean
+            )
+            vcov = self._compute_one_step_vcov(X_stacked_clean, Z_stacked_clean, residuals_clean, W)
             converged = True
         elif self.gmm_type == 'two_step':
-            beta, vcov, W, residuals = self.estimator.two_step(
-                y_stacked, X_stacked, Z_stacked, robust=self.robust
+            beta, vcov, W, residuals_clean = self.estimator.two_step(
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean, robust=self.robust
             )
             converged = True
         else:  # iterative
             beta, vcov, W, converged = self.estimator.iterative(
-                y_stacked, X_stacked, Z_stacked
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean
             )
-            residuals = y_stacked - X_stacked @ beta
+            residuals_clean = y_stacked_clean - X_stacked_clean @ beta
+
+        # Fill residuals in full array
+        if residuals_full.ndim > 1:
+            residuals_full[valid_mask] = residuals_clean.reshape(-1, 1)
+        else:
+            residuals_full[valid_mask] = residuals_clean.flatten()

         # Ensure beta is 1D for pandas Series
         beta = beta.flatten()
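
The cleaning step introduced above follows a standard NumPy masking pattern: build a boolean row mask from y, X, and Z, subset all three matrices with it, then drop instrument columns that still contain NaNs. A minimal standalone sketch of that pattern on toy data (names like valid_rows are illustrative, not panelbox API):

import numpy as np

# Toy stacked system: 6 observations, 2 regressors, 3 instruments
y = np.array([1.0, 2.0, np.nan, 4.0, 5.0, 6.0])
X = np.array([[1, 2], [2, 3], [3, 4], [np.nan, 5], [5, 6], [6, 7]], dtype=float)
Z = np.array([[1, np.nan, 1], [1, 2, np.nan], [1, 3, 1],
              [1, 4, 1], [1, 5, np.nan], [1, 6, 1]], dtype=float)

# Row mask: keep observations where y and all regressors are observed
valid_rows = ~np.isnan(y) & ~np.isnan(X).any(axis=1)
y_c, X_c, Z_c = y[valid_rows], X[valid_rows], Z[valid_rows]

# Column mask: drop instrument columns that still contain NaNs
valid_cols = ~np.isnan(Z_c).any(axis=0)
Z_c = Z_c[:, valid_cols]

print(y_c.shape, X_c.shape, Z_c.shape)  # (4,) (4, 2) (4, 1)

Keeping the full-length residuals_full array (NaN outside the mask) lets downstream tests index residuals by their original stacked positions, which is why the diff scatters the cleaned residuals back via residuals_full[valid_mask].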
@@ -285,19 +309,19 @@ class SystemGMM(DifferenceGMM):
         # Step 8: Compute specification tests
         n_params = len(beta)

-        # Hansen J-test on full system
+        # Hansen J-test on full system (use cleaned data)
         hansen = self.tester.hansen_j_test(
-            residuals, Z_stacked, W, n_params
+            residuals_clean, Z_stacked_clean, W, n_params
         )

         # Sargan test
         sargan = self.tester.sargan_test(
-            residuals, Z_stacked, n_params
+            residuals_clean, Z_stacked_clean, n_params
         )

         # AR tests (on difference residuals only)
         n_diff = len(y_diff)
-        residuals_diff_only = residuals[:n_diff]
+        residuals_diff_only = residuals_full[:n_diff]
         ids_diff_only = ids_stacked[:n_diff]  # Use stacked ids, first half

         valid_mask_diff = ~np.isnan(residuals_diff_only.flatten())
@@ -312,12 +336,18 @@ class SystemGMM(DifferenceGMM):
         )

         # Difference-in-Hansen test for level instruments
-        diff_hansen = self._compute_diff_hansen(
-            residuals, Z_diff, Z_level, W, n_params
-        )
+        # Note: Disabled when instrument columns are filtered due to dimension mismatches
+        # This is a known limitation when dealing with sparse instrument coverage
+        try:
+            diff_hansen = self._compute_diff_hansen(
+                residuals_full, Z_diff, Z_level, W, n_params
+            )
+        except (ValueError, np.linalg.LinAlgError):
+            # If dimensions don't match (due to column filtering), skip test
+            diff_hansen = None

         # Step 9: Create results object
-        valid_mask = ~np.isnan(residuals.flatten())
+        valid_mask_results = ~np.isnan(residuals_full.flatten())
         self.results = GMMResults(
             params=pd.Series(beta, index=var_names),
             std_errors=pd.Series(std_errors, index=var_names),
@@ -325,7 +355,7 @@ class SystemGMM(DifferenceGMM):
             pvalues=pd.Series(pvalues, index=var_names),
             nobs=int(np.sum(valid_mask)),
             n_groups=self.instrument_builder.n_groups,
-            n_instruments=Z_stacked.shape[1],
+            n_instruments=Z_stacked_clean.shape[1],
             n_params=n_params,
             hansen_j=hansen,
             sargan=sargan,
@@ -339,7 +369,7 @@ class SystemGMM(DifferenceGMM):
             windmeijer_corrected=self.robust and self.two_step,
             model_type='system',
             transformation='fd',
-            residuals=residuals
+            residuals=residuals_full
         )

         self.params = self.results.params
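
For context on what hansen_j_test consumes here: with residuals û, instruments Z (N×L), and weighting matrix W, the textbook Hansen J statistic is J = N · ḡ′Wḡ with ḡ = Z′û/N, compared against a χ² distribution with L − k degrees of freedom. A hedged sketch of that computation (hansen_j is an illustrative helper, not the panelbox API; the package's tester may use a different scaling convention):

import numpy as np
from scipy import stats

def hansen_j(resid: np.ndarray, Z: np.ndarray, W: np.ndarray, n_params: int):
    """Hansen J statistic for overidentifying restrictions (textbook form)."""
    n = len(resid)
    g_bar = Z.T @ resid / n          # average moment conditions, shape (L,)
    j_stat = n * g_bar @ W @ g_bar   # quadratic form in the weighting matrix
    df = Z.shape[1] - n_params       # number of overidentifying restrictions
    p_value = stats.chi2.sf(j_stat, df) if df > 0 else np.nan
    return j_stat, df, p_value

This also shows why the diff must pass the cleaned residuals and instruments together: Z and û enter only through Z′û, so their row dimensions have to agree.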
@@ -530,19 +560,104 @@ class SystemGMM(DifferenceGMM):
         """
         n_obs = Z_diff.n_obs

+        # Filter out invalid instrument columns (all NaN or insufficient coverage)
+        # For difference instruments
+        Z_diff_clean = self._filter_invalid_columns(Z_diff.Z, min_coverage=0.10)
+
+        # For level instruments
+        Z_level_clean = self._filter_invalid_columns(Z_level.Z, min_coverage=0.10)
+
         # Create block diagonal matrix
-        n_instruments_total = Z_diff.n_instruments + Z_level.n_instruments
+        n_instruments_total = Z_diff_clean.shape[1] + Z_level_clean.shape[1]

         Z_stacked = np.zeros((2 * n_obs, n_instruments_total))

         # Fill difference block
-        Z_stacked[:n_obs, :Z_diff.n_instruments] = Z_diff.Z
+        Z_stacked[:n_obs, :Z_diff_clean.shape[1]] = Z_diff_clean

         # Fill level block
-        Z_stacked[n_obs:, Z_diff.n_instruments:] = Z_level.Z
+        Z_stacked[n_obs:, Z_diff_clean.shape[1]:] = Z_level_clean

         return Z_stacked

+    def _filter_invalid_columns(self, Z: np.ndarray, min_coverage: float = 0.10) -> np.ndarray:
+        """
+        Filter out instrument columns with insufficient coverage.
+
+        Parameters
+        ----------
+        Z : np.ndarray
+            Instrument matrix
+        min_coverage : float
+            Minimum fraction of non-NaN values required (default: 0.10 = 10%)
+
+        Returns
+        -------
+        np.ndarray
+            Filtered instrument matrix with only valid columns
+        """
+        if Z.shape[1] == 0:
+            return Z
+
+        # Count non-NaN values per column
+        n_valid_per_col = (~np.isnan(Z)).sum(axis=0)
+        n_obs = Z.shape[0]
+
+        # Calculate coverage per column
+        coverage = n_valid_per_col / n_obs
+
+        # Keep columns with sufficient coverage
+        valid_cols = coverage >= min_coverage
+
+        # If no columns are valid, return at least one column (all zeros)
+        # This prevents dimension errors, though estimation may fail later
+        if not valid_cols.any():
+            import warnings
+            warnings.warn("No valid instrument columns found. System GMM may fail.")
+            return np.zeros((n_obs, 1))
+
+        return Z[:, valid_cols]
+
+    def _get_valid_mask_system(self,
+                               y: np.ndarray,
+                               X: np.ndarray,
+                               Z: np.ndarray,
+                               min_instruments: Optional[int] = None) -> np.ndarray:
+        """
+        Get mask of observations with sufficient valid data for System GMM.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            Dependent variable
+        X : np.ndarray
+            Regressors
+        Z : np.ndarray
+            Instruments
+        min_instruments : int, optional
+            Minimum number of valid instruments required
+
+        Returns
+        -------
+        np.ndarray
+            Boolean mask of valid observations
+        """
+        y_valid = ~np.isnan(y).any(axis=1) if y.ndim > 1 else ~np.isnan(y)
+        X_valid = ~np.isnan(X).any(axis=1)
+
+        # For instruments, count how many are valid per observation
+        Z_notnan = ~np.isnan(Z)
+        n_valid_instruments = Z_notnan.sum(axis=1)
+
+        # Determine minimum required instruments
+        if min_instruments is None:
+            k = X.shape[1] if X.ndim > 1 else 1
+            min_instruments = k + 1
+
+        Z_valid = n_valid_instruments >= min_instruments
+
+        return y_valid & X_valid & Z_valid
+
     def _compute_diff_hansen(self,
                              residuals: np.ndarray,
                              Z_diff: InstrumentSet,
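
The coverage rule in _filter_invalid_columns is simply a per-column fraction of observed values compared against a threshold. A quick standalone check of that behavior on a toy matrix (not using panelbox):

import numpy as np

Z = np.array([[1.0, np.nan, np.nan],
              [2.0, np.nan, 3.0],
              [3.0, np.nan, np.nan],
              [4.0, np.nan, np.nan]])

coverage = (~np.isnan(Z)).mean(axis=0)  # fraction of non-NaN values per column
print(coverage)                         # [1.   0.   0.25]
print(Z[:, coverage >= 0.10].shape)     # (4, 2): the all-NaN column is dropped

Note the interaction between the two new helpers: _filter_invalid_columns prunes columns before stacking, while _get_valid_mask_system prunes rows afterwards, requiring at least k + 1 observed instruments per observation by default.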
panelbox/models/static/__init__.py

@@ -5,9 +5,13 @@ Static panel models.
 from panelbox.models.static.pooled_ols import PooledOLS
 from panelbox.models.static.fixed_effects import FixedEffects
 from panelbox.models.static.random_effects import RandomEffects
+from panelbox.models.static.between import BetweenEstimator
+from panelbox.models.static.first_difference import FirstDifferenceEstimator

 __all__ = [
     'PooledOLS',
     'FixedEffects',
     'RandomEffects',
+    'BetweenEstimator',
+    'FirstDifferenceEstimator',
 ]
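
With these exports in place, both new estimators are importable from the subpackage; the +41-line change to panelbox/__init__.py suggests (though this diff does not show it) that they are also re-exported at the package root:

from panelbox.models.static import BetweenEstimator, FirstDifferenceEstimator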
panelbox/models/static/between.py

@@ -0,0 +1,434 @@
+"""
+Between estimator for panel data.
+
+This module provides the Between estimator which regresses on group means,
+capturing variation between entities rather than within entities.
+"""
+
+from typing import Optional
+import numpy as np
+import pandas as pd
+
+from panelbox.core.base_model import PanelModel
+from panelbox.core.results import PanelResults
+from panelbox.utils.matrix_ops import (
+    compute_ols,
+    compute_vcov_nonrobust,
+    compute_panel_rsquared
+)
+from panelbox.standard_errors import (
+    robust_covariance,
+    cluster_by_entity,
+    twoway_cluster,
+    driscoll_kraay,
+    newey_west,
+    pcse
+)
+
+
+class BetweenEstimator(PanelModel):
+    """
+    Between estimator for panel data.
+
+    This estimator regresses on group (entity) means, capturing the variation
+    between entities rather than within entities. It answers: "Do entities with
+    higher average X also have higher average Y?"
+
+    The between transformation computes group means:
+        ȳ_i = β x̄_i + α + ū_i
+
+    where bars denote averages over time for each entity i.
+
+    This estimator is useful when:
+    - T (time periods) is small relative to N (entities)
+    - Focus is on cross-sectional (between-entity) variation
+    - Time-invariant characteristics are of interest
+
+    Contrast with Fixed Effects (within estimator):
+    - FE uses deviations from entity means (within variation)
+    - BE uses entity means themselves (between variation)
+
+    Parameters
+    ----------
+    formula : str
+        Model formula in R-style syntax (e.g., "y ~ x1 + x2")
+    data : pd.DataFrame
+        Panel data in long format
+    entity_col : str
+        Name of the column identifying entities
+    time_col : str
+        Name of the column identifying time periods
+    weights : np.ndarray, optional
+        Observation weights (applied to entity means)
+
+    Attributes
+    ----------
+    entity_means : pd.DataFrame, optional
+        Entity-level means (after fitting)
+
+    Examples
+    --------
+    >>> import panelbox as pb
+    >>> import pandas as pd
+    >>>
+    >>> # Load data
+    >>> data = pb.load_grunfeld()
+    >>>
+    >>> # Between estimator
+    >>> be = pb.BetweenEstimator("invest ~ value + capital", data, "firm", "year")
+    >>> results = be.fit(cov_type='robust')
+    >>> print(results.summary())
+    >>>
+    >>> # Compare with Fixed Effects (within)
+    >>> fe = pb.FixedEffects("invest ~ value + capital", data, "firm", "year")
+    >>> results_fe = fe.fit()
+    >>>
+    >>> # BE captures between variation, FE captures within variation
+    >>> print(f"Between R²: {results.rsquared:.4f}")
+    >>> print(f"Within R²: {results_fe.rsquared:.4f}")
+    >>>
+    >>> # Access entity means
+    >>> entity_means = be.entity_means
+    >>> print(entity_means.head())
+
+    Notes
+    -----
+    The Between estimator:
+    1. Computes entity-level means for all variables
+    2. Runs OLS on the N entity means (not NT observations)
+    3. Reports R² as the between R² (variation explained across entities)
+
+    Degrees of freedom:
+    - N observations (one per entity)
+    - k parameters (slopes + intercept)
+    - df_resid = N - k
+
+    Standard errors:
+    - All SE types are supported (robust, clustered, etc.)
+    - Applied to the N entity-level observations
+    - Clustering by time is possible if needed
+
+    References
+    ----------
+    .. [1] Wooldridge, J. M. (2010). Econometric Analysis of Cross Section
+           and Panel Data. MIT Press. Section 10.2.2.
+    .. [2] Baltagi, B. H. (2013). Econometric Analysis of Panel Data.
+           Wiley. Chapter 2.
+    """
+
+    def __init__(
+        self,
+        formula: str,
+        data: pd.DataFrame,
+        entity_col: str,
+        time_col: str,
+        weights: Optional[np.ndarray] = None
+    ):
+        super().__init__(formula, data, entity_col, time_col, weights)
+
+        # Entity means (computed after fitting)
+        self.entity_means: Optional[pd.DataFrame] = None
+
+    def fit(
+        self,
+        cov_type: str = 'nonrobust',
+        **cov_kwds
+    ) -> PanelResults:
+        """
+        Fit the Between estimator.
+
+        Parameters
+        ----------
+        cov_type : str, default='nonrobust'
+            Type of covariance estimator:
+            - 'nonrobust': Classical standard errors
+            - 'robust' or 'hc1': Heteroskedasticity-robust (HC1)
+            - 'hc0', 'hc2', 'hc3': Other HC variants
+            - 'clustered': Cluster-robust (by entity by default, or custom)
+            - 'twoway': Two-way clustered (entity and time at group level)
+            - 'driscoll_kraay': Driscoll-Kraay (spatial/temporal dependence)
+            - 'newey_west': Newey-West HAC
+            - 'pcse': Panel-Corrected Standard Errors
+        **cov_kwds
+            Additional arguments for covariance estimation:
+            - cluster_col: For custom clustering
+            - max_lags: For Driscoll-Kraay and Newey-West
+            - kernel: For HAC estimators ('bartlett', 'parzen', 'quadratic_spectral')
+
+        Returns
+        -------
+        PanelResults
+            Fitted model results
+
+        Examples
+        --------
+        >>> # Classical standard errors
+        >>> results = model.fit(cov_type='nonrobust')
+
+        >>> # Heteroskedasticity-robust
+        >>> results = model.fit(cov_type='robust')
+        >>> results = model.fit(cov_type='hc3')
+
+        >>> # Cluster-robust
+        >>> results = model.fit(cov_type='clustered')
+
+        >>> # Driscoll-Kraay
+        >>> results = model.fit(cov_type='driscoll_kraay', max_lags=3)
+        """
+        # Build design matrices from original data
+        y_orig, X_orig = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get variable names
+        var_names = self.formula_parser.get_variable_names(self.data.data)
+
+        # Get entity and time identifiers
+        entities = self.data.data[self.data.entity_col].values
+        times = self.data.data[self.data.time_col].values
+
+        # Compute entity means (between transformation)
+        unique_entities = np.unique(entities)
+        n_entities = len(unique_entities)
+        k = X_orig.shape[1]
+
+        # Initialize arrays for entity means
+        y_between = np.zeros(n_entities)
+        X_between = np.zeros((n_entities, k))
+
+        # Compute means for each entity
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            y_between[i] = y_orig[mask].mean()
+            X_between[i] = X_orig[mask].mean(axis=0)
+
+        # Store entity means for user access
+        entity_means_dict = {'entity': unique_entities}
+
+        # Add dependent variable mean
+        dep_var_name = self.formula_parser.dependent
+        entity_means_dict[dep_var_name] = y_between
+
+        # Add independent variable means (excluding intercept)
+        for j, var_name in enumerate(var_names):
+            if var_name != 'Intercept':
+                # Find the corresponding column in X_orig
+                # var_names includes 'Intercept' if present, so adjust index
+                if 'Intercept' in var_names:
+                    X_col_idx = j
+                else:
+                    X_col_idx = j
+                entity_means_dict[var_name] = X_between[:, X_col_idx]
+
+        self.entity_means = pd.DataFrame(entity_means_dict)
+
+        # Estimate coefficients on entity means (OLS)
+        beta, resid, fitted = compute_ols(y_between, X_between, self.weights)
+
+        # Degrees of freedom
+        n = n_entities  # Number of entity-level observations
+        df_model = k - 1 if 'Intercept' in var_names else k  # Slopes only
+        df_resid = n - k
+
+        # Ensure df_resid is positive
+        if df_resid <= 0:
+            raise ValueError(
+                f"Insufficient degrees of freedom: df_resid = {df_resid}. "
+                f"n_entities={n}, k={k}. Need more entities than parameters."
+            )
+
+        # Compute covariance matrix
+        cov_type_lower = cov_type.lower()
+
+        if cov_type_lower == 'nonrobust':
+            vcov = compute_vcov_nonrobust(X_between, resid, df_resid)
+
+        elif cov_type_lower in ['robust', 'hc0', 'hc1', 'hc2', 'hc3']:
+            # Map 'robust' to 'hc1'
+            method = 'HC1' if cov_type_lower == 'robust' else cov_type_lower.upper()
+            result = robust_covariance(X_between, resid, method=method)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'clustered':
+            # For between estimator, clustering is less common but supported
+            # Default: cluster by entity (though each entity appears once)
+            # Could cluster by another grouping variable if specified
+            cluster_col = cov_kwds.get('cluster_col', None)
+            if cluster_col is None:
+                # Each entity is its own cluster - equivalent to robust
+                result = robust_covariance(X_between, resid, method='HC1')
+            else:
+                # Use custom clustering variable from entity_means
+                if cluster_col not in self.entity_means.columns:
+                    raise ValueError(f"cluster_col '{cluster_col}' not found in entity means")
+                cluster_ids = self.entity_means[cluster_col].values
+                result = cluster_by_entity(X_between, resid, cluster_ids, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'twoway':
+            # Two-way clustering at entity level
+            # This is unusual for between estimator but technically possible
+            # Would need entity-level time groupings
+            cluster_col1 = cov_kwds.get('cluster_col1', 'entity')
+            cluster_col2 = cov_kwds.get('cluster_col2', None)
+
+            if cluster_col2 is None:
+                raise ValueError("twoway clustering requires cluster_col2 in cov_kwds")
+
+            cluster_ids1 = self.entity_means[cluster_col1].values if cluster_col1 in self.entity_means.columns else unique_entities
+            cluster_ids2 = self.entity_means[cluster_col2].values
+
+            result = twoway_cluster(X_between, resid, cluster_ids1, cluster_ids2, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'driscoll_kraay':
+            # Driscoll-Kraay at entity level
+            # Use entity index as "time" dimension
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = driscoll_kraay(X_between, resid, unique_entities, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'newey_west':
+            # Newey-West HAC
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = newey_west(X_between, resid, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'pcse':
+            # Panel-Corrected Standard Errors
+            # For between estimator, each entity appears once
+            # PCSE is less meaningful but technically computable
+            result = pcse(X_between, resid, unique_entities, unique_entities)
+            vcov = result.cov_matrix
+
+        else:
+            raise ValueError(
+                f"cov_type must be one of: 'nonrobust', 'robust', 'hc0', 'hc1', 'hc2', 'hc3', "
+                f"'clustered', 'twoway', 'driscoll_kraay', 'newey_west', 'pcse', got '{cov_type}'"
+            )
+
+        # Standard errors
+        std_errors = np.sqrt(np.diag(vcov))
+
+        # Compute R-squared measures
+        # For between estimator:
+        # - rsquared = between R² (primary measure)
+        # - within R² = 0 by construction (no within variation used)
+        # - overall R² computed from fitted values mapped back to all observations
+
+        # Between R² (on entity means)
+        tss_between = np.sum((y_between - y_between.mean()) ** 2)
+        ess_between = np.sum(resid ** 2)
+        rsquared_between = 1 - ess_between / tss_between if tss_between > 0 else 0.0
+
+        # Map fitted values back to original observations for overall R²
+        fitted_all = np.zeros(len(y_orig))
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            fitted_all[mask] = fitted[i]
+
+        resid_all = y_orig - fitted_all
+
+        # Overall R² (on all NT observations)
+        tss_overall = np.sum((y_orig - y_orig.mean()) ** 2)
+        ess_overall = np.sum(resid_all ** 2)
+        rsquared_overall = 1 - ess_overall / tss_overall if tss_overall > 0 else 0.0
+
+        # Within R² is not meaningful for between estimator
+        # (would require comparing within variation, which BE ignores)
+        rsquared_within = 0.0
+
+        # Adjusted R-squared (based on between R²)
+        rsquared_adj = 1 - (1 - rsquared_between) * (n - 1) / df_resid
+
+        # Create Series/DataFrame with variable names
+        params = pd.Series(beta.ravel(), index=var_names)
+        std_errors_series = pd.Series(std_errors, index=var_names)
+        cov_params = pd.DataFrame(vcov, index=var_names, columns=var_names)
+
+        # Model information
+        model_info = {
+            'model_type': 'Between Estimator',
+            'formula': self.formula,
+            'cov_type': cov_type,
+            'cov_kwds': cov_kwds,
+            'entity_effects': False,
+            'time_effects': False,
+        }
+
+        # Data information
+        data_info = {
+            'nobs': n,  # Number of entity-level observations
+            'n_entities': self.data.n_entities,
+            'n_periods': self.data.n_periods,
+            'df_model': df_model,
+            'df_resid': df_resid,
+            'entity_index': unique_entities,
+            'time_index': None,  # Not applicable for between estimator
+        }
+
+        # R-squared dictionary
+        rsquared_dict = {
+            'rsquared': rsquared_between,  # For BE, R² = between R²
+            'rsquared_adj': rsquared_adj,
+            'rsquared_within': rsquared_within,
+            'rsquared_between': rsquared_between,
+            'rsquared_overall': rsquared_overall
+        }
+
+        # Create results object
+        results = PanelResults(
+            params=params,
+            std_errors=std_errors_series,
+            cov_params=cov_params,
+            resid=resid_all,  # Residuals for all observations
+            fittedvalues=fitted_all,  # Fitted values for all observations
+            model_info=model_info,
+            data_info=data_info,
+            rsquared_dict=rsquared_dict,
+            model=self
+        )
+
+        # Store results and update state
+        self._results = results
+        self._fitted = True
+
+        return results
+
+    def _estimate_coefficients(self) -> np.ndarray:
+        """
+        Estimate coefficients (implementation of abstract method).
+
+        Returns
+        -------
+        np.ndarray
+            Estimated coefficients
+        """
+        # Build design matrices
+        y, X = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get entity identifiers
+        entities = self.data.data[self.data.entity_col].values
+
+        # Compute entity means
+        unique_entities = np.unique(entities)
+        n_entities = len(unique_entities)
+        k = X.shape[1]
+
+        y_between = np.zeros(n_entities)
+        X_between = np.zeros((n_entities, k))
+
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            y_between[i] = y[mask].mean()
+            X_between[i] = X[mask].mean(axis=0)
+
+        # OLS on entity means
+        beta, _, _ = compute_ols(y_between, X_between, self.weights)
+        return beta
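
The estimator's core is small enough to sanity-check by hand: average each variable within entity, then run OLS on the N entity means. A hedged, self-contained replication of that logic using only pandas and NumPy (synthetic data, no panelbox imports):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_entities, n_periods = 20, 5
df = pd.DataFrame({
    "firm": np.repeat(np.arange(n_entities), n_periods),
    "x": rng.normal(size=n_entities * n_periods),
})
df["y"] = 2.0 + 1.5 * df["x"] + rng.normal(size=len(df))

# Between transformation: one row of means per entity
means = df.groupby("firm")[["y", "x"]].mean()

# OLS on entity means with an intercept
X = np.column_stack([np.ones(len(means)), means["x"].to_numpy()])
beta, *_ = np.linalg.lstsq(X, means["y"].to_numpy(), rcond=None)
print(beta)  # approximately [2.0, 1.5]

Note the design choice visible in fit(): coefficients and standard errors come from the N entity means, but resid and fittedvalues are broadcast back to all NT observations, so PanelResults stays shape-compatible with the other static estimators.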