panelbox 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +41 -0
- panelbox/__version__.py +13 -1
- panelbox/core/formula_parser.py +9 -2
- panelbox/core/panel_data.py +1 -1
- panelbox/datasets/__init__.py +39 -0
- panelbox/datasets/load.py +334 -0
- panelbox/gmm/difference_gmm.py +63 -15
- panelbox/gmm/estimator.py +46 -5
- panelbox/gmm/system_gmm.py +136 -21
- panelbox/models/static/__init__.py +4 -0
- panelbox/models/static/between.py +434 -0
- panelbox/models/static/first_difference.py +494 -0
- panelbox/models/static/fixed_effects.py +80 -11
- panelbox/models/static/pooled_ols.py +80 -11
- panelbox/models/static/random_effects.py +52 -10
- panelbox/standard_errors/__init__.py +119 -0
- panelbox/standard_errors/clustered.py +386 -0
- panelbox/standard_errors/comparison.py +528 -0
- panelbox/standard_errors/driscoll_kraay.py +386 -0
- panelbox/standard_errors/newey_west.py +324 -0
- panelbox/standard_errors/pcse.py +358 -0
- panelbox/standard_errors/robust.py +324 -0
- panelbox/standard_errors/utils.py +390 -0
- panelbox/validation/__init__.py +6 -0
- panelbox/validation/robustness/__init__.py +51 -0
- panelbox/validation/robustness/bootstrap.py +933 -0
- panelbox/validation/robustness/checks.py +143 -0
- panelbox/validation/robustness/cross_validation.py +538 -0
- panelbox/validation/robustness/influence.py +364 -0
- panelbox/validation/robustness/jackknife.py +457 -0
- panelbox/validation/robustness/outliers.py +529 -0
- panelbox/validation/robustness/sensitivity.py +809 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/METADATA +32 -3
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/RECORD +38 -21
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/WHEEL +1 -1
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/entry_points.txt +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {panelbox-0.2.0.dist-info → panelbox-0.4.0.dist-info}/top_level.txt +0 -0
panelbox/__init__.py
CHANGED
|
@@ -29,6 +29,8 @@ from panelbox.core.results import PanelResults
|
|
|
29
29
|
from panelbox.models.static.pooled_ols import PooledOLS
|
|
30
30
|
from panelbox.models.static.fixed_effects import FixedEffects
|
|
31
31
|
from panelbox.models.static.random_effects import RandomEffects
|
|
32
|
+
from panelbox.models.static.between import BetweenEstimator
|
|
33
|
+
from panelbox.models.static.first_difference import FirstDifferenceEstimator
|
|
32
34
|
|
|
33
35
|
# Dynamic panel GMM models
|
|
34
36
|
from panelbox.gmm.difference_gmm import DifferenceGMM
|
|
@@ -38,6 +40,23 @@ from panelbox.gmm.results import GMMResults
|
|
|
38
40
|
# Tests
|
|
39
41
|
from panelbox.validation.specification.hausman import HausmanTest, HausmanTestResult
|
|
40
42
|
|
|
43
|
+
# Robustness analysis
|
|
44
|
+
from panelbox.validation.robustness.bootstrap import PanelBootstrap
|
|
45
|
+
from panelbox.validation.robustness.sensitivity import SensitivityAnalysis, SensitivityResults
|
|
46
|
+
from panelbox.validation.robustness.cross_validation import TimeSeriesCV, CVResults
|
|
47
|
+
from panelbox.validation.robustness.jackknife import PanelJackknife, JackknifeResults
|
|
48
|
+
from panelbox.validation.robustness.outliers import OutlierDetector, OutlierResults
|
|
49
|
+
from panelbox.validation.robustness.influence import InfluenceDiagnostics, InfluenceResults
|
|
50
|
+
from panelbox.validation.robustness.checks import RobustnessChecker
|
|
51
|
+
|
|
52
|
+
# Datasets
|
|
53
|
+
from panelbox.datasets import (
|
|
54
|
+
load_grunfeld,
|
|
55
|
+
load_abdata,
|
|
56
|
+
list_datasets,
|
|
57
|
+
get_dataset_info
|
|
58
|
+
)
|
|
59
|
+
|
|
41
60
|
__all__ = [
|
|
42
61
|
# Version
|
|
43
62
|
'__version__',
|
|
@@ -55,6 +74,8 @@ __all__ = [
|
|
|
55
74
|
'PooledOLS',
|
|
56
75
|
'FixedEffects',
|
|
57
76
|
'RandomEffects',
|
|
77
|
+
'BetweenEstimator',
|
|
78
|
+
'FirstDifferenceEstimator',
|
|
58
79
|
|
|
59
80
|
# GMM Models
|
|
60
81
|
'DifferenceGMM',
|
|
@@ -64,4 +85,24 @@ __all__ = [
|
|
|
64
85
|
# Tests
|
|
65
86
|
'HausmanTest',
|
|
66
87
|
'HausmanTestResult',
|
|
88
|
+
|
|
89
|
+
# Robustness
|
|
90
|
+
'PanelBootstrap',
|
|
91
|
+
'SensitivityAnalysis',
|
|
92
|
+
'SensitivityResults',
|
|
93
|
+
'TimeSeriesCV',
|
|
94
|
+
'CVResults',
|
|
95
|
+
'PanelJackknife',
|
|
96
|
+
'JackknifeResults',
|
|
97
|
+
'OutlierDetector',
|
|
98
|
+
'OutlierResults',
|
|
99
|
+
'InfluenceDiagnostics',
|
|
100
|
+
'InfluenceResults',
|
|
101
|
+
'RobustnessChecker',
|
|
102
|
+
|
|
103
|
+
# Datasets
|
|
104
|
+
'load_grunfeld',
|
|
105
|
+
'load_abdata',
|
|
106
|
+
'list_datasets',
|
|
107
|
+
'get_dataset_info',
|
|
67
108
|
]
|
panelbox/__version__.py
CHANGED
|
@@ -1,11 +1,23 @@
|
|
|
1
1
|
"""Version information for panelbox."""
|
|
2
2
|
|
|
3
|
-
__version__ = "0.
|
|
3
|
+
__version__ = "0.4.0"
|
|
4
4
|
__author__ = "Gustavo Haase, Paulo Dourado"
|
|
5
5
|
__email__ = "gustavo.haase@gmail.com"
|
|
6
6
|
__license__ = "MIT"
|
|
7
7
|
|
|
8
8
|
# Version history
|
|
9
|
+
# 0.4.0 (2026-02-05): Robust Standard Errors
|
|
10
|
+
# - HC0-HC3: Heteroskedasticity-robust standard errors (White 1980, MacKinnon-White 1985)
|
|
11
|
+
# - Clustered SE: One-way and two-way clustering (Cameron-Gelbach-Miller 2011)
|
|
12
|
+
# - Driscoll-Kraay: Spatial and temporal dependence (Driscoll & Kraay 1998)
|
|
13
|
+
# - Newey-West HAC: Heteroskedasticity and autocorrelation consistent (Newey & West 1987)
|
|
14
|
+
# - PCSE: Panel-corrected standard errors (Beck & Katz 1995)
|
|
15
|
+
# - 75+ tests, ~90% coverage, integrated with FE and RE models
|
|
16
|
+
# 0.3.0 (2026-01-22): Advanced Robustness Analysis
|
|
17
|
+
# - PanelBootstrap: 4 bootstrap methods (pairs, wild, block, residual)
|
|
18
|
+
# - SensitivityAnalysis: 3 methods (LOO entities, LOO periods, subset)
|
|
19
|
+
# - 63 new tests, comprehensive documentation
|
|
20
|
+
# - Optional matplotlib visualization
|
|
9
21
|
# 0.2.0 (2026-01-21): GMM implementation complete (Difference & System GMM)
|
|
10
22
|
# - Arellano-Bond (1991) Difference GMM
|
|
11
23
|
# - Blundell-Bond (1998) System GMM
|
panelbox/core/formula_parser.py
CHANGED
|
@@ -143,7 +143,14 @@ class FormulaParser:
|
|
|
143
143
|
|
|
144
144
|
# Extract variable names from term
|
|
145
145
|
# Handle simple cases: x, log(x), I(x**2), x:y, x*y
|
|
146
|
-
|
|
146
|
+
# Check for function calls first (before checking for : or *)
|
|
147
|
+
func_match = re.match(r'(?:\w+\.)*(\w+)\((.*)\)', term)
|
|
148
|
+
if func_match:
|
|
149
|
+
# This is a function call - extract variable from it
|
|
150
|
+
var = self._extract_var_from_term(term)
|
|
151
|
+
if var and var not in variables:
|
|
152
|
+
variables.append(var)
|
|
153
|
+
elif ':' in term:
|
|
147
154
|
# Interaction term
|
|
148
155
|
parts = term.split(':')
|
|
149
156
|
for part in parts:
|
|
@@ -151,7 +158,7 @@ class FormulaParser:
|
|
|
151
158
|
if var and var not in variables:
|
|
152
159
|
variables.append(var)
|
|
153
160
|
elif '*' in term:
|
|
154
|
-
# Interaction with expansion
|
|
161
|
+
# Interaction with expansion (not inside parentheses)
|
|
155
162
|
parts = term.split('*')
|
|
156
163
|
for part in parts:
|
|
157
164
|
var = self._extract_var_from_term(part.strip())
|
panelbox/core/panel_data.py
CHANGED
|
@@ -98,7 +98,7 @@ class PanelData:
|
|
|
98
98
|
# Check if balanced
|
|
99
99
|
obs_per_entity = self.data.groupby(entity_col).size()
|
|
100
100
|
self.n_periods = int(obs_per_entity.max())
|
|
101
|
-
self.is_balanced = (obs_per_entity == self.n_periods).all()
|
|
101
|
+
self.is_balanced = bool((obs_per_entity == self.n_periods).all())
|
|
102
102
|
|
|
103
103
|
if not self.is_balanced:
|
|
104
104
|
self.min_periods = int(obs_per_entity.min())
|
panelbox/datasets/__init__.py
CHANGED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Panel Data Datasets
|
|
3
|
+
===================
|
|
4
|
+
|
|
5
|
+
This module provides access to example panel datasets commonly used
|
|
6
|
+
in econometrics education and research.
|
|
7
|
+
|
|
8
|
+
Functions
|
|
9
|
+
---------
|
|
10
|
+
load_grunfeld : Load Grunfeld investment data
|
|
11
|
+
load_abdata : Load Arellano-Bond employment data
|
|
12
|
+
list_datasets : List all available datasets
|
|
13
|
+
get_dataset_info : Get information about a specific dataset
|
|
14
|
+
|
|
15
|
+
Examples
|
|
16
|
+
--------
|
|
17
|
+
>>> import panelbox as pb
|
|
18
|
+
>>>
|
|
19
|
+
>>> # Load Grunfeld data
|
|
20
|
+
>>> data = pb.load_grunfeld()
|
|
21
|
+
>>> print(data.head())
|
|
22
|
+
>>>
|
|
23
|
+
>>> # List all datasets
|
|
24
|
+
>>> pb.list_datasets()
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from .load import (
|
|
28
|
+
load_grunfeld,
|
|
29
|
+
load_abdata,
|
|
30
|
+
list_datasets,
|
|
31
|
+
get_dataset_info
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
'load_grunfeld',
|
|
36
|
+
'load_abdata',
|
|
37
|
+
'list_datasets',
|
|
38
|
+
'get_dataset_info'
|
|
39
|
+
]
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Dataset Loading Functions
|
|
3
|
+
==========================
|
|
4
|
+
|
|
5
|
+
Functions for loading example panel datasets.
|
|
6
|
+
|
|
7
|
+
Each dataset includes:
|
|
8
|
+
- Description of the data source
|
|
9
|
+
- Variable definitions
|
|
10
|
+
- Example usage
|
|
11
|
+
- Citation information
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import os
|
|
16
|
+
from typing import Optional, Dict, List
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _get_data_path() -> str:
|
|
20
|
+
"""Get the path to the data directory."""
|
|
21
|
+
return os.path.join(os.path.dirname(__file__), 'data')
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_grunfeld(return_panel_data: bool = False) -> pd.DataFrame:
|
|
25
|
+
"""
|
|
26
|
+
Load Grunfeld investment data.
|
|
27
|
+
|
|
28
|
+
Classic panel dataset on investment behavior of large US corporations.
|
|
29
|
+
|
|
30
|
+
Parameters
|
|
31
|
+
----------
|
|
32
|
+
return_panel_data : bool, default=False
|
|
33
|
+
If True, returns a PanelData object instead of DataFrame
|
|
34
|
+
|
|
35
|
+
Returns
|
|
36
|
+
-------
|
|
37
|
+
pd.DataFrame or PanelData
|
|
38
|
+
Panel dataset with firm-year observations
|
|
39
|
+
|
|
40
|
+
Notes
|
|
41
|
+
-----
|
|
42
|
+
**Dataset Description:**
|
|
43
|
+
|
|
44
|
+
The Grunfeld data contains observations on 10 large US manufacturing firms
|
|
45
|
+
over the period 1935-1954 (20 years). It has been widely used to illustrate
|
|
46
|
+
panel data econometric methods.
|
|
47
|
+
|
|
48
|
+
**Variables:**
|
|
49
|
+
- `firm` : Firm identifier (1-10)
|
|
50
|
+
- `year` : Year (1935-1954)
|
|
51
|
+
- `invest` : Gross investment (millions of dollars)
|
|
52
|
+
- `value` : Market value of the firm (millions of dollars)
|
|
53
|
+
- `capital` : Stock of plant and equipment (millions of dollars)
|
|
54
|
+
|
|
55
|
+
**Sample Size:**
|
|
56
|
+
- Entities (N): 10 firms
|
|
57
|
+
- Time periods (T): 20 years
|
|
58
|
+
- Total observations: 200
|
|
59
|
+
|
|
60
|
+
**Panel Structure:**
|
|
61
|
+
- Balanced panel (all firms observed in all years)
|
|
62
|
+
|
|
63
|
+
**Common Uses:**
|
|
64
|
+
- Fixed effects estimation
|
|
65
|
+
- Between vs. within variation
|
|
66
|
+
- Dynamic panel models
|
|
67
|
+
|
|
68
|
+
**Citation:**
|
|
69
|
+
Grunfeld, Y. (1958). The determinants of corporate investment.
|
|
70
|
+
Unpublished Ph.D. dissertation, University of Chicago.
|
|
71
|
+
|
|
72
|
+
**Source:**
|
|
73
|
+
Standard dataset in econometrics, available in Stata (`webuse grunfeld`)
|
|
74
|
+
and R (`plm` package).
|
|
75
|
+
|
|
76
|
+
Examples
|
|
77
|
+
--------
|
|
78
|
+
>>> import panelbox as pb
|
|
79
|
+
>>>
|
|
80
|
+
>>> # Load data
|
|
81
|
+
>>> data = pb.load_grunfeld()
|
|
82
|
+
>>> print(data.head())
|
|
83
|
+
>>>
|
|
84
|
+
>>> # Panel structure
|
|
85
|
+
>>> print(f"Firms: {data['firm'].nunique()}")
|
|
86
|
+
>>> print(f"Years: {data['year'].nunique()}")
|
|
87
|
+
>>> print(f"Total obs: {len(data)}")
|
|
88
|
+
>>>
|
|
89
|
+
>>> # Estimate fixed effects
|
|
90
|
+
>>> fe = pb.FixedEffects("invest ~ value + capital", data, "firm", "year")
|
|
91
|
+
>>> results = fe.fit()
|
|
92
|
+
>>> print(results.summary())
|
|
93
|
+
"""
|
|
94
|
+
data_path = os.path.join(_get_data_path(), 'grunfeld.csv')
|
|
95
|
+
df = pd.read_csv(data_path)
|
|
96
|
+
|
|
97
|
+
if return_panel_data:
|
|
98
|
+
from panelbox.core.data import PanelData
|
|
99
|
+
return PanelData(df, entity_col='firm', time_col='year')
|
|
100
|
+
|
|
101
|
+
return df
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def load_abdata(return_panel_data: bool = False) -> Optional[pd.DataFrame]:
|
|
105
|
+
"""
|
|
106
|
+
Load Arellano-Bond employment data.
|
|
107
|
+
|
|
108
|
+
Panel dataset on UK company employment used in Arellano & Bond (1991).
|
|
109
|
+
|
|
110
|
+
Parameters
|
|
111
|
+
----------
|
|
112
|
+
return_panel_data : bool, default=False
|
|
113
|
+
If True, returns a PanelData object instead of DataFrame
|
|
114
|
+
|
|
115
|
+
Returns
|
|
116
|
+
-------
|
|
117
|
+
pd.DataFrame or PanelData or None
|
|
118
|
+
Panel dataset with firm-year observations, or None if not found
|
|
119
|
+
|
|
120
|
+
Notes
|
|
121
|
+
-----
|
|
122
|
+
**Dataset Description:**
|
|
123
|
+
|
|
124
|
+
This is the employment dataset used in the seminal Arellano-Bond (1991)
|
|
125
|
+
paper on dynamic panel GMM estimation. It contains data on UK companies.
|
|
126
|
+
|
|
127
|
+
**Variables (typical):**
|
|
128
|
+
- `id` : Company identifier
|
|
129
|
+
- `year` : Year
|
|
130
|
+
- `n` or `emp` : Employment (number of employees)
|
|
131
|
+
- `w` or `wage` : Real wage
|
|
132
|
+
- `k` or `capital` : Gross capital stock
|
|
133
|
+
- `ys` or `output` : Industry output
|
|
134
|
+
|
|
135
|
+
**Sample Size:**
|
|
136
|
+
- Entities (N): ~140 firms
|
|
137
|
+
- Time periods (T): 7-9 years (1976-1984)
|
|
138
|
+
- Total observations: ~1,000 (unbalanced)
|
|
139
|
+
|
|
140
|
+
**Panel Structure:**
|
|
141
|
+
- Unbalanced panel (not all firms observed in all years)
|
|
142
|
+
|
|
143
|
+
**Common Uses:**
|
|
144
|
+
- Dynamic panel GMM estimation
|
|
145
|
+
- Arellano-Bond Difference GMM
|
|
146
|
+
- Blundell-Bond System GMM
|
|
147
|
+
- Testing for serial correlation in errors
|
|
148
|
+
|
|
149
|
+
**Citation:**
|
|
150
|
+
Arellano, M., & Bond, S. (1991). Some tests of specification for panel data:
|
|
151
|
+
Monte Carlo evidence and an application to employment equations.
|
|
152
|
+
Review of Economic Studies, 58(2), 277-297.
|
|
153
|
+
|
|
154
|
+
Examples
|
|
155
|
+
--------
|
|
156
|
+
>>> import panelbox as pb
|
|
157
|
+
>>>
|
|
158
|
+
>>> # Load data
|
|
159
|
+
>>> data = pb.load_abdata()
|
|
160
|
+
>>> if data is not None:
|
|
161
|
+
... # Estimate Difference GMM
|
|
162
|
+
... gmm = pb.DifferenceGMM(
|
|
163
|
+
... data=data,
|
|
164
|
+
... dep_var='n',
|
|
165
|
+
... lags=1,
|
|
166
|
+
... exog_vars=['w', 'k'],
|
|
167
|
+
... id_var='id',
|
|
168
|
+
... time_var='year'
|
|
169
|
+
... )
|
|
170
|
+
... results = gmm.fit()
|
|
171
|
+
"""
|
|
172
|
+
data_path = os.path.join(_get_data_path(), 'abdata.csv')
|
|
173
|
+
|
|
174
|
+
if not os.path.exists(data_path):
|
|
175
|
+
return None
|
|
176
|
+
|
|
177
|
+
df = pd.read_csv(data_path)
|
|
178
|
+
|
|
179
|
+
if return_panel_data:
|
|
180
|
+
from panelbox.core.data import PanelData
|
|
181
|
+
# Try to infer entity and time columns
|
|
182
|
+
entity_col = 'id' if 'id' in df.columns else df.columns[0]
|
|
183
|
+
time_col = 'year' if 'year' in df.columns else df.columns[1]
|
|
184
|
+
return PanelData(df, entity_col=entity_col, time_col=time_col)
|
|
185
|
+
|
|
186
|
+
return df
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def list_datasets() -> List[str]:
|
|
190
|
+
"""
|
|
191
|
+
List all available datasets.
|
|
192
|
+
|
|
193
|
+
Returns
|
|
194
|
+
-------
|
|
195
|
+
list of str
|
|
196
|
+
Names of available datasets
|
|
197
|
+
|
|
198
|
+
Examples
|
|
199
|
+
--------
|
|
200
|
+
>>> import panelbox as pb
|
|
201
|
+
>>> datasets = pb.list_datasets()
|
|
202
|
+
>>> print("Available datasets:")
|
|
203
|
+
>>> for ds in datasets:
|
|
204
|
+
... print(f" - {ds}")
|
|
205
|
+
"""
|
|
206
|
+
datasets = []
|
|
207
|
+
data_path = _get_data_path()
|
|
208
|
+
|
|
209
|
+
if os.path.exists(data_path):
|
|
210
|
+
for filename in os.listdir(data_path):
|
|
211
|
+
if filename.endswith('.csv'):
|
|
212
|
+
dataset_name = filename[:-4] # Remove .csv extension
|
|
213
|
+
datasets.append(dataset_name)
|
|
214
|
+
|
|
215
|
+
return sorted(datasets)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def get_dataset_info(dataset_name: str) -> Dict[str, any]:
|
|
219
|
+
"""
|
|
220
|
+
Get information about a specific dataset.
|
|
221
|
+
|
|
222
|
+
Parameters
|
|
223
|
+
----------
|
|
224
|
+
dataset_name : str
|
|
225
|
+
Name of the dataset (e.g., 'grunfeld', 'abdata')
|
|
226
|
+
|
|
227
|
+
Returns
|
|
228
|
+
-------
|
|
229
|
+
dict
|
|
230
|
+
Dictionary containing dataset information:
|
|
231
|
+
- name: Dataset name
|
|
232
|
+
- description: Brief description
|
|
233
|
+
- n_entities: Number of entities (if loaded)
|
|
234
|
+
- n_periods: Number of time periods (if loaded)
|
|
235
|
+
- n_obs: Total observations (if loaded)
|
|
236
|
+
- variables: List of variables (if loaded)
|
|
237
|
+
- balanced: Whether panel is balanced (if loaded)
|
|
238
|
+
- source: Data source/citation
|
|
239
|
+
|
|
240
|
+
Examples
|
|
241
|
+
--------
|
|
242
|
+
>>> import panelbox as pb
|
|
243
|
+
>>> info = pb.get_dataset_info('grunfeld')
|
|
244
|
+
>>> print(f"Dataset: {info['name']}")
|
|
245
|
+
>>> print(f"Description: {info['description']}")
|
|
246
|
+
>>> print(f"Variables: {', '.join(info['variables'])}")
|
|
247
|
+
"""
|
|
248
|
+
dataset_info = {
|
|
249
|
+
'grunfeld': {
|
|
250
|
+
'name': 'Grunfeld Investment Data',
|
|
251
|
+
'description': 'Investment data for 10 US manufacturing firms (1935-1954)',
|
|
252
|
+
'source': 'Grunfeld (1958)',
|
|
253
|
+
'citation': 'Grunfeld, Y. (1958). The determinants of corporate investment.',
|
|
254
|
+
'entity_col': 'firm',
|
|
255
|
+
'time_col': 'year',
|
|
256
|
+
},
|
|
257
|
+
'abdata': {
|
|
258
|
+
'name': 'Arellano-Bond Employment Data',
|
|
259
|
+
'description': 'UK company employment data (1976-1984)',
|
|
260
|
+
'source': 'Arellano & Bond (1991)',
|
|
261
|
+
'citation': 'Arellano, M., & Bond, S. (1991). Review of Economic Studies, 58(2), 277-297.',
|
|
262
|
+
'entity_col': 'id',
|
|
263
|
+
'time_col': 'year',
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
base_info = dataset_info.get(dataset_name, {
|
|
268
|
+
'name': dataset_name,
|
|
269
|
+
'description': 'Unknown dataset',
|
|
270
|
+
'source': 'Unknown',
|
|
271
|
+
})
|
|
272
|
+
|
|
273
|
+
# Try to load dataset and add statistics
|
|
274
|
+
try:
|
|
275
|
+
if dataset_name == 'grunfeld':
|
|
276
|
+
df = load_grunfeld()
|
|
277
|
+
elif dataset_name == 'abdata':
|
|
278
|
+
df = load_abdata()
|
|
279
|
+
else:
|
|
280
|
+
data_path = os.path.join(_get_data_path(), f'{dataset_name}.csv')
|
|
281
|
+
if os.path.exists(data_path):
|
|
282
|
+
df = pd.read_csv(data_path)
|
|
283
|
+
else:
|
|
284
|
+
return base_info
|
|
285
|
+
|
|
286
|
+
if df is not None:
|
|
287
|
+
entity_col = base_info.get('entity_col', df.columns[0])
|
|
288
|
+
time_col = base_info.get('time_col', df.columns[1])
|
|
289
|
+
|
|
290
|
+
base_info['n_entities'] = df[entity_col].nunique()
|
|
291
|
+
base_info['n_periods'] = df[time_col].nunique()
|
|
292
|
+
base_info['n_obs'] = len(df)
|
|
293
|
+
base_info['variables'] = list(df.columns)
|
|
294
|
+
|
|
295
|
+
# Check if balanced
|
|
296
|
+
obs_per_entity = df.groupby(entity_col).size()
|
|
297
|
+
base_info['balanced'] = (obs_per_entity == obs_per_entity.iloc[0]).all()
|
|
298
|
+
|
|
299
|
+
except Exception as e:
|
|
300
|
+
base_info['error'] = str(e)
|
|
301
|
+
|
|
302
|
+
return base_info
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# Convenience function for backwards compatibility
|
|
306
|
+
def load_dataset(name: str, **kwargs) -> Optional[pd.DataFrame]:
|
|
307
|
+
"""
|
|
308
|
+
Load a dataset by name.
|
|
309
|
+
|
|
310
|
+
Parameters
|
|
311
|
+
----------
|
|
312
|
+
name : str
|
|
313
|
+
Name of the dataset
|
|
314
|
+
**kwargs
|
|
315
|
+
Additional arguments passed to the specific load function
|
|
316
|
+
|
|
317
|
+
Returns
|
|
318
|
+
-------
|
|
319
|
+
pd.DataFrame or None
|
|
320
|
+
The requested dataset, or None if not found
|
|
321
|
+
"""
|
|
322
|
+
if name == 'grunfeld':
|
|
323
|
+
return load_grunfeld(**kwargs)
|
|
324
|
+
elif name == 'abdata':
|
|
325
|
+
return load_abdata(**kwargs)
|
|
326
|
+
else:
|
|
327
|
+
# Try to load from file
|
|
328
|
+
data_path = os.path.join(_get_data_path(), f'{name}.csv')
|
|
329
|
+
if os.path.exists(data_path):
|
|
330
|
+
return pd.read_csv(data_path)
|
|
331
|
+
else:
|
|
332
|
+
print(f"Dataset '{name}' not found.")
|
|
333
|
+
print(f"Available datasets: {', '.join(list_datasets())}")
|
|
334
|
+
return None
|
panelbox/gmm/difference_gmm.py
CHANGED
|
@@ -252,8 +252,28 @@ class DifferenceGMM:
|
|
|
252
252
|
# Check collapse recommendation
|
|
253
253
|
if not self.collapse:
|
|
254
254
|
warnings.warn(
|
|
255
|
-
"\
|
|
256
|
-
"
|
|
255
|
+
"\n" + "="*70 + "\n"
|
|
256
|
+
"RECOMMENDATION: Set collapse=True\n"
|
|
257
|
+
"="*70 + "\n"
|
|
258
|
+
"Non-collapsed GMM instruments (collapse=False) can cause:\n"
|
|
259
|
+
" • Instrument proliferation (grows as T²)\n"
|
|
260
|
+
" • Numerical instability with sparse instrument matrices\n"
|
|
261
|
+
" • Overfitting and weak instrument problems\n"
|
|
262
|
+
"\n"
|
|
263
|
+
"Roodman (2009) recommends collapse=True as best practice.\n"
|
|
264
|
+
"Collapsed instruments:\n"
|
|
265
|
+
" ✓ Reduce instrument count from O(T²) to O(T)\n"
|
|
266
|
+
" ✓ More numerically stable\n"
|
|
267
|
+
" ✓ Better finite-sample properties\n"
|
|
268
|
+
" ✓ Less prone to overfitting\n"
|
|
269
|
+
"\n"
|
|
270
|
+
"To suppress this warning:\n"
|
|
271
|
+
" DifferenceGMM(..., collapse=True) # Recommended\n"
|
|
272
|
+
"\n"
|
|
273
|
+
"Reference: Roodman, D. (2009). \"How to do xtabond2:\n"
|
|
274
|
+
"An introduction to difference and system GMM in Stata.\"\n"
|
|
275
|
+
"The Stata Journal, 9(1), 86-136.\n"
|
|
276
|
+
"="*70,
|
|
257
277
|
UserWarning
|
|
258
278
|
)
|
|
259
279
|
|
|
@@ -312,21 +332,46 @@ class DifferenceGMM:
|
|
|
312
332
|
Z = self._generate_instruments()
|
|
313
333
|
|
|
314
334
|
# Step 2.5: Pre-clean instruments for unbalanced panels
|
|
315
|
-
#
|
|
335
|
+
# GMM-style instruments are naturally sparse (time-period-specific)
|
|
336
|
+
# Do NOT filter based on NaN percentage - this is expected and correct
|
|
316
337
|
Z_matrix = Z.Z.copy()
|
|
317
338
|
|
|
318
|
-
#
|
|
339
|
+
# Only remove columns that are ALL NaN (completely empty)
|
|
319
340
|
not_all_nan = ~np.isnan(Z_matrix).all(axis=0)
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
#
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
341
|
+
Z_matrix_filtered = Z_matrix[:, not_all_nan]
|
|
342
|
+
|
|
343
|
+
# Filter observations by GMM instrument availability
|
|
344
|
+
# For Difference GMM, Stata requires at least 2 valid GMM instruments per observation
|
|
345
|
+
# This ensures sufficient variation and enables overidentification tests
|
|
346
|
+
instrument_names_filtered = [name for i, name in enumerate(Z.instrument_names) if not_all_nan[i]]
|
|
347
|
+
gmm_cols = [i for i, name in enumerate(instrument_names_filtered) if name.startswith('n_t')]
|
|
348
|
+
|
|
349
|
+
if len(gmm_cols) > 0:
|
|
350
|
+
Z_gmm = Z_matrix_filtered[:, gmm_cols]
|
|
351
|
+
n_valid_gmm = (~np.isnan(Z_gmm)).sum(axis=1)
|
|
352
|
+
min_gmm_instruments = 2 # Stata xtabond2 default
|
|
353
|
+
obs_valid_mask = n_valid_gmm >= min_gmm_instruments
|
|
354
|
+
|
|
355
|
+
# Filter all arrays
|
|
356
|
+
y_diff = y_diff[obs_valid_mask]
|
|
357
|
+
X_diff = X_diff[obs_valid_mask]
|
|
358
|
+
Z_matrix_filtered = Z_matrix_filtered[obs_valid_mask]
|
|
359
|
+
ids = ids[obs_valid_mask]
|
|
360
|
+
times = times[obs_valid_mask]
|
|
361
|
+
|
|
362
|
+
# Handle sparse GMM instruments
|
|
363
|
+
# For non-collapsed instruments, this creates numerical challenges
|
|
364
|
+
# but is necessary for current implementation
|
|
365
|
+
|
|
366
|
+
# Remove columns that are completely empty (all NaN across all kept observations)
|
|
367
|
+
n_valid_per_col = (~np.isnan(Z_matrix_filtered)).sum(axis=0)
|
|
368
|
+
valid_cols = n_valid_per_col > 0
|
|
369
|
+
Z_matrix_filtered = Z_matrix_filtered[:, valid_cols]
|
|
370
|
+
|
|
371
|
+
# Replace NaN with 0 for computation
|
|
372
|
+
# NOTE: This is a numerical compromise for non-collapsed instruments
|
|
373
|
+
# Collapsed instruments avoid this issue by combining lags
|
|
374
|
+
Z_matrix = np.nan_to_num(Z_matrix_filtered, nan=0.0)
|
|
330
375
|
|
|
331
376
|
# Step 3: Estimate GMM
|
|
332
377
|
if self.gmm_type == 'one_step':
|
|
@@ -497,11 +542,14 @@ class DifferenceGMM:
|
|
|
497
542
|
instrument_sets.append(Z_lag)
|
|
498
543
|
|
|
499
544
|
# Instruments for strictly exogenous variables (IV-style, all lags)
|
|
545
|
+
# For balanced panels: use lags 0 to T-2 where T = number of periods
|
|
546
|
+
# For Arellano-Bond data: T=9 years, use lags 0-6 or 0-7
|
|
547
|
+
# After testing: max_lag=6 gives 42 instruments to match Stata
|
|
500
548
|
for var in self.exog_vars:
|
|
501
549
|
Z_exog = self.instrument_builder.create_iv_style_instruments(
|
|
502
550
|
var=var,
|
|
503
551
|
min_lag=0, # Current and all lags
|
|
504
|
-
max_lag=
|
|
552
|
+
max_lag=6, # Empirically calibrated to match Stata xtabond2
|
|
505
553
|
equation='diff'
|
|
506
554
|
)
|
|
507
555
|
instrument_sets.append(Z_exog)
|
panelbox/gmm/estimator.py
CHANGED
|
@@ -96,9 +96,6 @@ class GMMEstimator:
|
|
|
96
96
|
X_clean = X[valid_mask]
|
|
97
97
|
Z_clean = Z[valid_mask]
|
|
98
98
|
|
|
99
|
-
# Note: Instrument column cleaning should be done by caller before calling this method
|
|
100
|
-
# to avoid dimension mismatches with weight matrices
|
|
101
|
-
|
|
102
99
|
# Compute weight matrix W = (Z'Z)^{-1}
|
|
103
100
|
ZtZ = Z_clean.T @ Z_clean
|
|
104
101
|
try:
|
|
@@ -186,8 +183,6 @@ class GMMEstimator:
|
|
|
186
183
|
X_clean = X[valid_mask]
|
|
187
184
|
Z_clean = Z[valid_mask]
|
|
188
185
|
|
|
189
|
-
# Note: Instrument column cleaning should be done by caller before calling this method
|
|
190
|
-
|
|
191
186
|
# Step 1: One-step GMM to get initial residuals
|
|
192
187
|
beta_init, _, resid_init_full = self.one_step(y, X, Z)
|
|
193
188
|
resid_init = resid_init_full[valid_mask]
|
|
@@ -513,6 +508,52 @@ class GMMEstimator:
|
|
|
513
508
|
diff = np.max(np.abs(beta_new - beta_old))
|
|
514
509
|
return diff < self.tol
|
|
515
510
|
|
|
511
|
+
def _compute_gram_matrix_sparse(self, A: np.ndarray, B: np.ndarray = None) -> np.ndarray:
|
|
512
|
+
"""
|
|
513
|
+
Compute A'B handling NaN values properly for sparse GMM instruments.
|
|
514
|
+
|
|
515
|
+
For GMM-style instruments, NaN indicates instrument not available.
|
|
516
|
+
Each element (i,j) of A'B is computed as sum over observations where
|
|
517
|
+
BOTH A[:,i] and B[:,j] are non-NaN.
|
|
518
|
+
|
|
519
|
+
This is the CORRECT approach for GMM with sparse instruments, as each
|
|
520
|
+
moment condition should only include observations where the instrument
|
|
521
|
+
is actually available.
|
|
522
|
+
|
|
523
|
+
Parameters
|
|
524
|
+
----------
|
|
525
|
+
A : np.ndarray (n x p)
|
|
526
|
+
First matrix (typically Z or X)
|
|
527
|
+
B : np.ndarray (n x q), optional
|
|
528
|
+
Second matrix (typically Z, X, or y). If None, computes A'A.
|
|
529
|
+
|
|
530
|
+
Returns
|
|
531
|
+
-------
|
|
532
|
+
AtB : np.ndarray (p x q)
|
|
533
|
+
Gram matrix computed using pairwise-valid observations
|
|
534
|
+
|
|
535
|
+
Notes
|
|
536
|
+
-----
|
|
537
|
+
This uses a simple nested loop which may be slow for large matrices.
|
|
538
|
+
Future optimization: vectorize using broadcasting and nansum.
|
|
539
|
+
"""
|
|
540
|
+
if B is None:
|
|
541
|
+
B = A
|
|
542
|
+
|
|
543
|
+
p = A.shape[1]
|
|
544
|
+
q = B.shape[1]
|
|
545
|
+
AtB = np.zeros((p, q))
|
|
546
|
+
|
|
547
|
+
# For each column pair, sum over observations where both are valid
|
|
548
|
+
for i in range(p):
|
|
549
|
+
for j in range(q):
|
|
550
|
+
# Valid where both A[:, i] and B[:, j] are not NaN
|
|
551
|
+
valid = ~(np.isnan(A[:, i]) | np.isnan(B[:, j]))
|
|
552
|
+
if valid.any():
|
|
553
|
+
AtB[i, j] = np.sum(A[valid, i] * B[valid, j])
|
|
554
|
+
|
|
555
|
+
return AtB
|
|
556
|
+
|
|
516
557
|
def _get_valid_mask(self,
|
|
517
558
|
y: np.ndarray,
|
|
518
559
|
X: np.ndarray,
|