panelbox-0.2.0-py3-none-any.whl → panelbox-0.4.0-py3-none-any.whl
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
panelbox/gmm/system_gmm.py
CHANGED
@@ -248,27 +248,51 @@ class SystemGMM(DifferenceGMM):
         # Step 4: Stack equations
         y_stacked = np.vstack([y_diff, y_level])
         X_stacked = np.vstack([X_diff, X_level])
-
+        Z_stacked_raw = self._stack_instruments(Z_diff, Z_level)
+
+        # Clean instrument matrix before estimation
+        # Remove observations and columns with NaNs
+        valid_mask = self._get_valid_mask_system(y_stacked, X_stacked, Z_stacked_raw)
+        y_stacked_clean = y_stacked[valid_mask]
+        X_stacked_clean = X_stacked[valid_mask]
+        Z_stacked_clean = Z_stacked_raw[valid_mask]
+
+        # Remove instrument columns with remaining NaNs
+        valid_instrument_cols = ~np.isnan(Z_stacked_clean).any(axis=0)
+        if not valid_instrument_cols.any():
+            raise ValueError("No valid instrument columns in System GMM. Check data quality.")
+        Z_stacked_clean = Z_stacked_clean[:, valid_instrument_cols]
+
+        # For tests later, keep track of the full stacked residuals
+        residuals_full = np.full_like(y_stacked, np.nan)
 
         # Repeat ids and times for stacked system
         ids_stacked = np.concatenate([ids, ids])
         times_stacked = np.concatenate([times, times])
 
-        # Step 5: Estimate GMM on stacked system
+        # Step 5: Estimate GMM on stacked system (using cleaned data)
         if self.gmm_type == 'one_step':
-            beta, W,
-
+            beta, W, residuals_clean = self.estimator.one_step(
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean
+            )
+            vcov = self._compute_one_step_vcov(X_stacked_clean, Z_stacked_clean, residuals_clean, W)
             converged = True
         elif self.gmm_type == 'two_step':
-            beta, vcov, W,
-
+            beta, vcov, W, residuals_clean = self.estimator.two_step(
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean, robust=self.robust
             )
             converged = True
         else: # iterative
             beta, vcov, W, converged = self.estimator.iterative(
-
+                y_stacked_clean, X_stacked_clean, Z_stacked_clean
             )
-
+            residuals_clean = y_stacked_clean - X_stacked_clean @ beta
+
+        # Fill residuals in full array
+        if residuals_full.ndim > 1:
+            residuals_full[valid_mask] = residuals_clean.reshape(-1, 1)
+        else:
+            residuals_full[valid_mask] = residuals_clean.flatten()
 
         # Ensure beta is 1D for pandas Series
         beta = beta.flatten()
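For context on the pattern above: the new code estimates on NaN-free rows but keeps a full-length residual array so later positional slices (such as the first n_diff rows used for the AR tests) stay aligned with the stacked system. A minimal standalone NumPy sketch of that mask-and-scatter pattern, with toy data and a least-squares stand-in for the GMM step (not panelbox code):

    import numpy as np

    # Toy stacked system: 6 rows, two rows unusable because of NaNs
    y = np.array([[1.0], [2.0], [np.nan], [4.0], [5.0], [6.0]])
    X = np.array([[1.0], [2.0], [3.0], [np.nan], [5.0], [6.0]])

    # Keep only rows where y and every regressor are observed
    valid_mask = ~np.isnan(y).any(axis=1) & ~np.isnan(X).any(axis=1)
    y_clean, X_clean = y[valid_mask], X[valid_mask]

    # Stand-in for the GMM step: any estimator of beta on the clean rows
    beta = np.linalg.lstsq(X_clean, y_clean, rcond=None)[0]

    # Scatter residuals back into a full-length array, NaN elsewhere,
    # so slices like residuals_full[:n_diff] keep their original alignment
    residuals_full = np.full_like(y, np.nan)
    residuals_full[valid_mask] = y_clean - X_clean @ beta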
@@ -285,19 +309,19 @@ class SystemGMM(DifferenceGMM):
         # Step 8: Compute specification tests
         n_params = len(beta)
 
-        # Hansen J-test on full system
+        # Hansen J-test on full system (use cleaned data)
         hansen = self.tester.hansen_j_test(
-
+            residuals_clean, Z_stacked_clean, W, n_params
         )
 
         # Sargan test
         sargan = self.tester.sargan_test(
-
+            residuals_clean, Z_stacked_clean, n_params
         )
 
         # AR tests (on difference residuals only)
         n_diff = len(y_diff)
-        residuals_diff_only =
+        residuals_diff_only = residuals_full[:n_diff]
         ids_diff_only = ids_stacked[:n_diff] # Use stacked ids, first half
 
         valid_mask_diff = ~np.isnan(residuals_diff_only.flatten())
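For context on the hansen_j_test call above: the Hansen J statistic is the minimized GMM objective, a quadratic form of the average moment conditions in the weighting matrix, asymptotically chi-squared with (instrument count minus parameter count) degrees of freedom under valid overidentifying restrictions. A generic textbook-form sketch follows; scaling conventions vary across implementations, so this is not necessarily panelbox's exact computation:

    import numpy as np
    from scipy import stats

    def hansen_j(residuals, Z, W, n_params):
        """J = n * g' W g, where g = Z'u / n are the average moment conditions."""
        n = len(residuals)
        g = Z.T @ residuals / n
        j_stat = float(n * g.T @ W @ g)
        df = Z.shape[1] - n_params  # overidentifying restrictions
        p_value = stats.chi2.sf(j_stat, df)
        return j_stat, df, p_value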
@@ -312,12 +336,18 @@ class SystemGMM(DifferenceGMM):
         )
 
         # Difference-in-Hansen test for level instruments
-
-
-
+        # Note: Disabled when instrument columns are filtered due to dimension mismatches
+        # This is a known limitation when dealing with sparse instrument coverage
+        try:
+            diff_hansen = self._compute_diff_hansen(
+                residuals_full, Z_diff, Z_level, W, n_params
+            )
+        except (ValueError, np.linalg.LinAlgError):
+            # If dimensions don't match (due to column filtering), skip test
+            diff_hansen = None
 
         # Step 9: Create results object
-
+        valid_mask_results = ~np.isnan(residuals_full.flatten())
         self.results = GMMResults(
             params=pd.Series(beta, index=var_names),
             std_errors=pd.Series(std_errors, index=var_names),
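The statistic guarded by the try/except above is, in standard form, a "C statistic": the J statistic of the full instrument set minus the J statistic of the restricted set (here, without the level instruments), compared against chi-squared with degrees of freedom equal to the number of instruments under test. A hedged sketch of that comparison; the internals of _compute_diff_hansen are not shown in this diff:

    from scipy import stats

    def difference_in_hansen(j_full, df_full, j_restricted, df_restricted):
        """C statistic: do the extra (level) instruments pass overidentification?"""
        c_stat = j_full - j_restricted
        df = df_full - df_restricted  # number of instruments being tested
        return c_stat, df, stats.chi2.sf(c_stat, df)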
@@ -325,7 +355,7 @@ class SystemGMM(DifferenceGMM):
             pvalues=pd.Series(pvalues, index=var_names),
             nobs=int(np.sum(valid_mask)),
             n_groups=self.instrument_builder.n_groups,
-            n_instruments=
+            n_instruments=Z_stacked_clean.shape[1],
             n_params=n_params,
             hansen_j=hansen,
             sargan=sargan,
@@ -339,7 +369,7 @@ class SystemGMM(DifferenceGMM):
             windmeijer_corrected=self.robust and self.two_step,
             model_type='system',
             transformation='fd',
-            residuals=
+            residuals=residuals_full
         )
 
         self.params = self.results.params
@@ -530,19 +560,104 @@ class SystemGMM(DifferenceGMM):
         """
         n_obs = Z_diff.n_obs
 
+        # Filter out invalid instrument columns (all NaN or insufficient coverage)
+        # For difference instruments
+        Z_diff_clean = self._filter_invalid_columns(Z_diff.Z, min_coverage=0.10)
+
+        # For level instruments
+        Z_level_clean = self._filter_invalid_columns(Z_level.Z, min_coverage=0.10)
+
         # Create block diagonal matrix
-        n_instruments_total =
+        n_instruments_total = Z_diff_clean.shape[1] + Z_level_clean.shape[1]
 
         Z_stacked = np.zeros((2 * n_obs, n_instruments_total))
 
         # Fill difference block
-        Z_stacked[:n_obs, :
+        Z_stacked[:n_obs, :Z_diff_clean.shape[1]] = Z_diff_clean
 
         # Fill level block
-        Z_stacked[n_obs:,
+        Z_stacked[n_obs:, Z_diff_clean.shape[1]:] = Z_level_clean
 
         return Z_stacked
 
+    def _filter_invalid_columns(self, Z: np.ndarray, min_coverage: float = 0.10) -> np.ndarray:
+        """
+        Filter out instrument columns with insufficient coverage.
+
+        Parameters
+        ----------
+        Z : np.ndarray
+            Instrument matrix
+        min_coverage : float
+            Minimum fraction of non-NaN values required (default: 0.10 = 10%)
+
+        Returns
+        -------
+        np.ndarray
+            Filtered instrument matrix with only valid columns
+        """
+        if Z.shape[1] == 0:
+            return Z
+
+        # Count non-NaN values per column
+        n_valid_per_col = (~np.isnan(Z)).sum(axis=0)
+        n_obs = Z.shape[0]
+
+        # Calculate coverage per column
+        coverage = n_valid_per_col / n_obs
+
+        # Keep columns with sufficient coverage
+        valid_cols = coverage >= min_coverage
+
+        # If no columns are valid, return at least one column (all zeros)
+        # This prevents dimension errors, though estimation may fail later
+        if not valid_cols.any():
+            import warnings
+            warnings.warn("No valid instrument columns found. System GMM may fail.")
+            return np.zeros((n_obs, 1))
+
+        return Z[:, valid_cols]
+
+    def _get_valid_mask_system(self,
+                               y: np.ndarray,
+                               X: np.ndarray,
+                               Z: np.ndarray,
+                               min_instruments: Optional[int] = None) -> np.ndarray:
+        """
+        Get mask of observations with sufficient valid data for System GMM.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            Dependent variable
+        X : np.ndarray
+            Regressors
+        Z : np.ndarray
+            Instruments
+        min_instruments : int, optional
+            Minimum number of valid instruments required
+
+        Returns
+        -------
+        np.ndarray
+            Boolean mask of valid observations
+        """
+        y_valid = ~np.isnan(y).any(axis=1) if y.ndim > 1 else ~np.isnan(y)
+        X_valid = ~np.isnan(X).any(axis=1)
+
+        # For instruments, count how many are valid per observation
+        Z_notnan = ~np.isnan(Z)
+        n_valid_instruments = Z_notnan.sum(axis=1)
+
+        # Determine minimum required instruments
+        if min_instruments is None:
+            k = X.shape[1] if X.ndim > 1 else 1
+            min_instruments = k + 1
+
+        Z_valid = n_valid_instruments >= min_instruments
+
+        return y_valid & X_valid & Z_valid
+
     def _compute_diff_hansen(self,
                              residuals: np.ndarray,
                              Z_diff: InstrumentSet,
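A quick standalone demonstration of the coverage rule that _filter_invalid_columns enforces, using the same 10% default threshold on a toy instrument matrix:

    import numpy as np

    rng = np.random.default_rng(0)
    Z = rng.normal(size=(20, 3))
    Z[:19, 1] = np.nan  # column 1: 5% coverage  -> dropped
    Z[:5, 2] = np.nan   # column 2: 75% coverage -> kept

    coverage = (~np.isnan(Z)).sum(axis=0) / Z.shape[0]
    print(coverage)                      # [1.   0.05 0.75]
    print(Z[:, coverage >= 0.10].shape)  # (20, 2): low-coverage column removed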
panelbox/models/static/__init__.py
CHANGED

@@ -5,9 +5,13 @@ Static panel models.
 from panelbox.models.static.pooled_ols import PooledOLS
 from panelbox.models.static.fixed_effects import FixedEffects
 from panelbox.models.static.random_effects import RandomEffects
+from panelbox.models.static.between import BetweenEstimator
+from panelbox.models.static.first_difference import FirstDifferenceEstimator
 
 __all__ = [
     'PooledOLS',
     'FixedEffects',
     'RandomEffects',
+    'BetweenEstimator',
+    'FirstDifferenceEstimator',
 ]
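With these exports in place, both new estimators can be imported from the static-models subpackage; the docstring in between.py below also uses a pb.BetweenEstimator top-level alias, consistent with the +41-line change to panelbox/__init__.py in the file list:

    from panelbox.models.static import BetweenEstimator, FirstDifferenceEstimator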
panelbox/models/static/between.py
ADDED

@@ -0,0 +1,434 @@
+"""
+Between estimator for panel data.
+
+This module provides the Between estimator which regresses on group means,
+capturing variation between entities rather than within entities.
+"""
+
+from typing import Optional
+import numpy as np
+import pandas as pd
+
+from panelbox.core.base_model import PanelModel
+from panelbox.core.results import PanelResults
+from panelbox.utils.matrix_ops import (
+    compute_ols,
+    compute_vcov_nonrobust,
+    compute_panel_rsquared
+)
+from panelbox.standard_errors import (
+    robust_covariance,
+    cluster_by_entity,
+    twoway_cluster,
+    driscoll_kraay,
+    newey_west,
+    pcse
+)
+
+
+class BetweenEstimator(PanelModel):
+    """
+    Between estimator for panel data.
+
+    This estimator regresses on group (entity) means, capturing the variation
+    between entities rather than within entities. It answers: "Do entities with
+    higher average X also have higher average Y?"
+
+    The between transformation computes group means:
+        ȳ_i = β x̄_i + α + ū_i
+
+    where bars denote averages over time for each entity i.
+
+    This estimator is useful when:
+    - T (time periods) is small relative to N (entities)
+    - Focus is on cross-sectional (between-entity) variation
+    - Time-invariant characteristics are of interest
+
+    Contrast with Fixed Effects (within estimator):
+    - FE uses deviations from entity means (within variation)
+    - BE uses entity means themselves (between variation)
+
+    Parameters
+    ----------
+    formula : str
+        Model formula in R-style syntax (e.g., "y ~ x1 + x2")
+    data : pd.DataFrame
+        Panel data in long format
+    entity_col : str
+        Name of the column identifying entities
+    time_col : str
+        Name of the column identifying time periods
+    weights : np.ndarray, optional
+        Observation weights (applied to entity means)
+
+    Attributes
+    ----------
+    entity_means : pd.DataFrame, optional
+        Entity-level means (after fitting)
+
+    Examples
+    --------
+    >>> import panelbox as pb
+    >>> import pandas as pd
+    >>>
+    >>> # Load data
+    >>> data = pb.load_grunfeld()
+    >>>
+    >>> # Between estimator
+    >>> be = pb.BetweenEstimator("invest ~ value + capital", data, "firm", "year")
+    >>> results = be.fit(cov_type='robust')
+    >>> print(results.summary())
+    >>>
+    >>> # Compare with Fixed Effects (within)
+    >>> fe = pb.FixedEffects("invest ~ value + capital", data, "firm", "year")
+    >>> results_fe = fe.fit()
+    >>>
+    >>> # BE captures between variation, FE captures within variation
+    >>> print(f"Between R²: {results.rsquared:.4f}")
+    >>> print(f"Within R²: {results_fe.rsquared:.4f}")
+    >>>
+    >>> # Access entity means
+    >>> entity_means = be.entity_means
+    >>> print(entity_means.head())
+
+    Notes
+    -----
+    The Between estimator:
+    1. Computes entity-level means for all variables
+    2. Runs OLS on the N entity means (not NT observations)
+    3. Reports R² as the between R² (variation explained across entities)
+
+    Degrees of freedom:
+    - N observations (one per entity)
+    - k parameters (slopes + intercept)
+    - df_resid = N - k
+
+    Standard errors:
+    - All SE types are supported (robust, clustered, etc.)
+    - Applied to the N entity-level observations
+    - Clustering by time is possible if needed
+
+    References
+    ----------
+    .. [1] Wooldridge, J. M. (2010). Econometric Analysis of Cross Section
+       and Panel Data. MIT Press. Section 10.2.2.
+    .. [2] Baltagi, B. H. (2013). Econometric Analysis of Panel Data.
+       Wiley. Chapter 2.
+    """
+
+    def __init__(
+        self,
+        formula: str,
+        data: pd.DataFrame,
+        entity_col: str,
+        time_col: str,
+        weights: Optional[np.ndarray] = None
+    ):
+        super().__init__(formula, data, entity_col, time_col, weights)
+
+        # Entity means (computed after fitting)
+        self.entity_means: Optional[pd.DataFrame] = None
+
+    def fit(
+        self,
+        cov_type: str = 'nonrobust',
+        **cov_kwds
+    ) -> PanelResults:
+        """
+        Fit the Between estimator.
+
+        Parameters
+        ----------
+        cov_type : str, default='nonrobust'
+            Type of covariance estimator:
+            - 'nonrobust': Classical standard errors
+            - 'robust' or 'hc1': Heteroskedasticity-robust (HC1)
+            - 'hc0', 'hc2', 'hc3': Other HC variants
+            - 'clustered': Cluster-robust (by entity by default, or custom)
+            - 'twoway': Two-way clustered (entity and time at group level)
+            - 'driscoll_kraay': Driscoll-Kraay (spatial/temporal dependence)
+            - 'newey_west': Newey-West HAC
+            - 'pcse': Panel-Corrected Standard Errors
+        **cov_kwds
+            Additional arguments for covariance estimation:
+            - cluster_col: For custom clustering
+            - max_lags: For Driscoll-Kraay and Newey-West
+            - kernel: For HAC estimators ('bartlett', 'parzen', 'quadratic_spectral')
+
+        Returns
+        -------
+        PanelResults
+            Fitted model results
+
+        Examples
+        --------
+        >>> # Classical standard errors
+        >>> results = model.fit(cov_type='nonrobust')
+
+        >>> # Heteroskedasticity-robust
+        >>> results = model.fit(cov_type='robust')
+        >>> results = model.fit(cov_type='hc3')
+
+        >>> # Cluster-robust
+        >>> results = model.fit(cov_type='clustered')
+
+        >>> # Driscoll-Kraay
+        >>> results = model.fit(cov_type='driscoll_kraay', max_lags=3)
+        """
+        # Build design matrices from original data
+        y_orig, X_orig = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get variable names
+        var_names = self.formula_parser.get_variable_names(self.data.data)
+
+        # Get entity and time identifiers
+        entities = self.data.data[self.data.entity_col].values
+        times = self.data.data[self.data.time_col].values
+
+        # Compute entity means (between transformation)
+        unique_entities = np.unique(entities)
+        n_entities = len(unique_entities)
+        k = X_orig.shape[1]
+
+        # Initialize arrays for entity means
+        y_between = np.zeros(n_entities)
+        X_between = np.zeros((n_entities, k))
+
+        # Compute means for each entity
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            y_between[i] = y_orig[mask].mean()
+            X_between[i] = X_orig[mask].mean(axis=0)
+
+        # Store entity means for user access
+        entity_means_dict = {'entity': unique_entities}
+
+        # Add dependent variable mean
+        dep_var_name = self.formula_parser.dependent
+        entity_means_dict[dep_var_name] = y_between
+
+        # Add independent variable means (excluding intercept)
+        for j, var_name in enumerate(var_names):
+            if var_name != 'Intercept':
+                # Find the corresponding column in X_orig
+                # var_names includes 'Intercept' if present, so adjust index
+                if 'Intercept' in var_names:
+                    X_col_idx = j
+                else:
+                    X_col_idx = j
+                entity_means_dict[var_name] = X_between[:, X_col_idx]
+
+        self.entity_means = pd.DataFrame(entity_means_dict)
+
+        # Estimate coefficients on entity means (OLS)
+        beta, resid, fitted = compute_ols(y_between, X_between, self.weights)
+
+        # Degrees of freedom
+        n = n_entities  # Number of entity-level observations
+        df_model = k - 1 if 'Intercept' in var_names else k  # Slopes only
+        df_resid = n - k
+
+        # Ensure df_resid is positive
+        if df_resid <= 0:
+            raise ValueError(
+                f"Insufficient degrees of freedom: df_resid = {df_resid}. "
+                f"n_entities={n}, k={k}. Need more entities than parameters."
+            )
+
+        # Compute covariance matrix
+        cov_type_lower = cov_type.lower()
+
+        if cov_type_lower == 'nonrobust':
+            vcov = compute_vcov_nonrobust(X_between, resid, df_resid)
+
+        elif cov_type_lower in ['robust', 'hc0', 'hc1', 'hc2', 'hc3']:
+            # Map 'robust' to 'hc1'
+            method = 'HC1' if cov_type_lower == 'robust' else cov_type_lower.upper()
+            result = robust_covariance(X_between, resid, method=method)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'clustered':
+            # For between estimator, clustering is less common but supported
+            # Default: cluster by entity (though each entity appears once)
+            # Could cluster by another grouping variable if specified
+            cluster_col = cov_kwds.get('cluster_col', None)
+            if cluster_col is None:
+                # Each entity is its own cluster - equivalent to robust
+                result = robust_covariance(X_between, resid, method='HC1')
+            else:
+                # Use custom clustering variable from entity_means
+                if cluster_col not in self.entity_means.columns:
+                    raise ValueError(f"cluster_col '{cluster_col}' not found in entity means")
+                cluster_ids = self.entity_means[cluster_col].values
+                result = cluster_by_entity(X_between, resid, cluster_ids, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'twoway':
+            # Two-way clustering at entity level
+            # This is unusual for between estimator but technically possible
+            # Would need entity-level time groupings
+            cluster_col1 = cov_kwds.get('cluster_col1', 'entity')
+            cluster_col2 = cov_kwds.get('cluster_col2', None)
+
+            if cluster_col2 is None:
+                raise ValueError("twoway clustering requires cluster_col2 in cov_kwds")
+
+            cluster_ids1 = self.entity_means[cluster_col1].values if cluster_col1 in self.entity_means.columns else unique_entities
+            cluster_ids2 = self.entity_means[cluster_col2].values
+
+            result = twoway_cluster(X_between, resid, cluster_ids1, cluster_ids2, df_correction=True)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'driscoll_kraay':
+            # Driscoll-Kraay at entity level
+            # Use entity index as "time" dimension
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = driscoll_kraay(X_between, resid, unique_entities, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'newey_west':
+            # Newey-West HAC
+            max_lags = cov_kwds.get('max_lags', None)
+            kernel = cov_kwds.get('kernel', 'bartlett')
+            result = newey_west(X_between, resid, max_lags=max_lags, kernel=kernel)
+            vcov = result.cov_matrix
+
+        elif cov_type_lower == 'pcse':
+            # Panel-Corrected Standard Errors
+            # For between estimator, each entity appears once
+            # PCSE is less meaningful but technically computable
+            result = pcse(X_between, resid, unique_entities, unique_entities)
+            vcov = result.cov_matrix
+
+        else:
+            raise ValueError(
+                f"cov_type must be one of: 'nonrobust', 'robust', 'hc0', 'hc1', 'hc2', 'hc3', "
+                f"'clustered', 'twoway', 'driscoll_kraay', 'newey_west', 'pcse', got '{cov_type}'"
+            )
+
+        # Standard errors
+        std_errors = np.sqrt(np.diag(vcov))
+
+        # Compute R-squared measures
+        # For between estimator:
+        # - rsquared = between R² (primary measure)
+        # - within R² = 0 by construction (no within variation used)
+        # - overall R² computed from fitted values mapped back to all observations
+
+        # Between R² (on entity means)
+        tss_between = np.sum((y_between - y_between.mean()) ** 2)
+        ess_between = np.sum(resid ** 2)
+        rsquared_between = 1 - ess_between / tss_between if tss_between > 0 else 0.0
+
+        # Map fitted values back to original observations for overall R²
+        fitted_all = np.zeros(len(y_orig))
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            fitted_all[mask] = fitted[i]
+
+        resid_all = y_orig - fitted_all
+
+        # Overall R² (on all NT observations)
+        tss_overall = np.sum((y_orig - y_orig.mean()) ** 2)
+        ess_overall = np.sum(resid_all ** 2)
+        rsquared_overall = 1 - ess_overall / tss_overall if tss_overall > 0 else 0.0
+
+        # Within R² is not meaningful for between estimator
+        # (would require comparing within variation, which BE ignores)
+        rsquared_within = 0.0
+
+        # Adjusted R-squared (based on between R²)
+        rsquared_adj = 1 - (1 - rsquared_between) * (n - 1) / df_resid
+
+        # Create Series/DataFrame with variable names
+        params = pd.Series(beta.ravel(), index=var_names)
+        std_errors_series = pd.Series(std_errors, index=var_names)
+        cov_params = pd.DataFrame(vcov, index=var_names, columns=var_names)
+
+        # Model information
+        model_info = {
+            'model_type': 'Between Estimator',
+            'formula': self.formula,
+            'cov_type': cov_type,
+            'cov_kwds': cov_kwds,
+            'entity_effects': False,
+            'time_effects': False,
+        }
+
+        # Data information
+        data_info = {
+            'nobs': n,  # Number of entity-level observations
+            'n_entities': self.data.n_entities,
+            'n_periods': self.data.n_periods,
+            'df_model': df_model,
+            'df_resid': df_resid,
+            'entity_index': unique_entities,
+            'time_index': None,  # Not applicable for between estimator
+        }
+
+        # R-squared dictionary
+        rsquared_dict = {
+            'rsquared': rsquared_between,  # For BE, R² = between R²
+            'rsquared_adj': rsquared_adj,
+            'rsquared_within': rsquared_within,
+            'rsquared_between': rsquared_between,
+            'rsquared_overall': rsquared_overall
+        }
+
+        # Create results object
+        results = PanelResults(
+            params=params,
+            std_errors=std_errors_series,
+            cov_params=cov_params,
+            resid=resid_all,  # Residuals for all observations
+            fittedvalues=fitted_all,  # Fitted values for all observations
+            model_info=model_info,
+            data_info=data_info,
+            rsquared_dict=rsquared_dict,
+            model=self
+        )
+
+        # Store results and update state
+        self._results = results
+        self._fitted = True
+
+        return results
+
+    def _estimate_coefficients(self) -> np.ndarray:
+        """
+        Estimate coefficients (implementation of abstract method).
+
+        Returns
+        -------
+        np.ndarray
+            Estimated coefficients
+        """
+        # Build design matrices
+        y, X = self.formula_parser.build_design_matrices(
+            self.data.data,
+            return_type='array'
+        )
+
+        # Get entity identifiers
+        entities = self.data.data[self.data.entity_col].values
+
+        # Compute entity means
+        unique_entities = np.unique(entities)
+        n_entities = len(unique_entities)
+        k = X.shape[1]
+
+        y_between = np.zeros(n_entities)
+        X_between = np.zeros((n_entities, k))
+
+        for i, entity in enumerate(unique_entities):
+            mask = entities == entity
+            y_between[i] = y[mask].mean()
+            X_between[i] = X[mask].mean(axis=0)
+
+        # OLS on entity means
+        beta, _, _ = compute_ols(y_between, X_between, self.weights)
+        return beta