panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PanelData - Container for panel data with validation and transformations.
|
|
3
|
+
|
|
4
|
+
This module provides the core PanelData class for handling panel datasets.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Optional, Union, List
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PanelData:
    """
    Container for panel data with validation and transformations.

    This class provides a structured way to work with panel data, ensuring
    proper validation of the panel structure and offering common transformations
    used in panel econometrics.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data in long format (one row per entity-time observation)
    entity_col : str
        Name of the column identifying entities (e.g., 'firm', 'country', 'id')
    time_col : str
        Name of the column identifying time periods (e.g., 'year', 'quarter', 'time')

    Attributes
    ----------
    data : pd.DataFrame
        Copy of the input data, sorted by entity and then time, with a fresh
        integer index
    entity_col : str
        Entity identifier column name
    time_col : str
        Time identifier column name
    is_balanced : bool
        Whether the panel is balanced (all entities have same number of
        observations)
    n_entities : int
        Number of unique entities
    n_periods : int
        Number of observations of the entity with the most observations
    min_periods : int
        Number of observations of the entity with the fewest observations
    avg_periods : float
        Average number of observations per entity
    n_obs : int
        Total number of observations
    entities : np.ndarray
        Array of unique entity identifiers
    time_periods : np.ndarray
        Array of unique time periods

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame.
    ValueError
        If ``entity_col`` or ``time_col`` is not a column of ``data``, or if
        ``data`` contains no observation with a valid entity identifier.

    Examples
    --------
    >>> import pandas as pd
    >>> import panelbox as pb
    >>>
    >>> # Create sample panel data
    >>> data = pd.DataFrame({
    ...     'firm': [1, 1, 1, 2, 2, 2],
    ...     'year': [2020, 2021, 2022, 2020, 2021, 2022],
    ...     'invest': [100, 110, 115, 200, 210, 220],
    ...     'value': [1000, 1100, 1200, 2000, 2100, 2200]
    ... })
    >>>
    >>> # Create PanelData object
    >>> panel = pb.PanelData(data, entity_col='firm', time_col='year')
    >>> print(panel.summary())
    """

    def __init__(
        self,
        data: pd.DataFrame,
        entity_col: str,
        time_col: str
    ):
        # Validate inputs
        if not isinstance(data, pd.DataFrame):
            raise TypeError("data must be a pandas DataFrame")

        if entity_col not in data.columns:
            raise ValueError(f"entity_col '{entity_col}' not found in data columns")

        if time_col not in data.columns:
            raise ValueError(f"time_col '{time_col}' not found in data columns")

        # Store identifiers and a private, sorted copy of the data so that the
        # caller's frame is never mutated and group-wise shifts/diffs follow
        # time order within each entity.
        self.entity_col = entity_col
        self.time_col = time_col
        self.data = (
            data.copy()
            .sort_values([entity_col, time_col])
            .reset_index(drop=True)
        )

        # Compute panel structure
        self.entities = self.data[entity_col].unique()
        self.time_periods = self.data[time_col].unique()
        self.n_entities = len(self.entities)
        self.n_obs = len(self.data)

        obs_per_entity = self.data.groupby(entity_col).size()
        if obs_per_entity.empty:
            # Without this guard int(NaN) below fails with an opaque message.
            raise ValueError(
                "data must contain at least one observation with a "
                "non-missing entity identifier"
            )

        # Balance is judged on observation counts per entity only.
        self.n_periods = int(obs_per_entity.max())
        self.min_periods = int(obs_per_entity.min())
        self.avg_periods = float(obs_per_entity.mean())
        # Wrap in bool() so callers get a plain Python bool, not numpy.bool_.
        self.is_balanced = bool((obs_per_entity == self.n_periods).all())

    def _resolve_variables(
        self,
        variables: Optional[Union[str, List[str]]]
    ) -> List[str]:
        """Turn a variable selection into a validated list of column names."""
        if variables is None:
            # Default: every numeric column except the two identifiers.
            identifiers = (self.entity_col, self.time_col)
            numeric_cols = self.data.select_dtypes(include=[np.number]).columns
            selected = [col for col in numeric_cols if col not in identifiers]
        elif isinstance(variables, str):
            selected = [variables]
        else:
            selected = list(variables)

        for var in selected:
            if var not in self.data.columns:
                raise ValueError(f"Variable '{var}' not found in data")

        return selected

    def _shifted(
        self,
        variable: str,
        orders: Union[int, List[int]],
        prefix: str,
        label: str,
        direction: int
    ) -> pd.DataFrame:
        """Add within-entity shifted copies of ``variable`` (shared by lag/lead)."""
        if variable not in self.data.columns:
            raise ValueError(f"Variable '{variable}' not found in data")

        # Accept NumPy integer scalars as well as built-in ints.
        if isinstance(orders, (int, np.integer)):
            orders = [orders]

        result = self.data.copy()
        grouped = result.groupby(self.entity_col)[variable]

        for order in orders:
            if isinstance(order, np.integer):
                order = int(order)
            if order < 1:
                raise ValueError(f"{label} order must be >= 1")

            # Shifting inside each entity group keeps values from leaking
            # across entity boundaries.
            result[f'{prefix}{order}.{variable}'] = grouped.shift(direction * order)

        return result

    def demeaning(
        self,
        variables: Optional[Union[str, List[str]]] = None,
        method: str = 'entity'
    ) -> pd.DataFrame:
        """
        Remove means from variables (within transformation).

        This is the core transformation for fixed effects estimation.

        Parameters
        ----------
        variables : str or list of str, optional
            Variables to demean. If None, demeans all numeric columns
            except entity and time identifiers.
        method : str, default='entity'
            Type of demeaning:
            - 'entity': Remove entity-specific means (within transformation)
            - 'time': Remove time-specific means
            - 'both': Remove entity means, then time means of the result
              (a single pass; this is the exact two-way within transformation
              for balanced panels only)

        Returns
        -------
        pd.DataFrame
            Copy of the data with the selected variables demeaned

        Raises
        ------
        ValueError
            If a variable is not a column of the data or ``method`` is not
            one of 'entity', 'time', 'both'.

        Examples
        --------
        >>> demeaned = panel.demeaning(['invest', 'value'], method='entity')
        """
        variables = self._resolve_variables(variables)

        # Each method is an ordered sequence of grouping columns whose group
        # means are subtracted one after the other.
        passes = {
            'entity': (self.entity_col,),
            'time': (self.time_col,),
            'both': (self.entity_col, self.time_col),
        }

        result = self.data.copy()

        if method not in passes:
            raise ValueError("method must be 'entity', 'time', or 'both'")

        for group_col in passes[method]:
            group_means = result.groupby(group_col)[variables].transform('mean')
            result[variables] = result[variables] - group_means

        return result

    def first_difference(
        self,
        variables: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """
        Compute first differences (Δy_it = y_it - y_i,t-1).

        This transformation eliminates time-invariant fixed effects.
        Differences are taken between consecutive rows of each entity in
        time order; gaps between time periods are not detected.

        Parameters
        ----------
        variables : str or list of str, optional
            Variables to difference. If None, differences all numeric columns
            except entity and time identifiers.

        Returns
        -------
        pd.DataFrame
            First-differenced data. Rows in which any differenced variable is
            missing are dropped, which removes the first observation of each
            entity.

        Raises
        ------
        ValueError
            If a variable is not a column of the data.

        Examples
        --------
        >>> diff_data = panel.first_difference(['invest', 'value'])
        """
        variables = self._resolve_variables(variables)

        result = self.data.copy()

        # Compute differences within each entity
        for var in variables:
            result[var] = result.groupby(self.entity_col)[var].diff()

        # Drop rows whose difference is undefined (NaN from diff)
        result = result.dropna(subset=variables)

        return result

    def lag(
        self,
        variable: str,
        lags: Union[int, List[int]] = 1
    ) -> pd.DataFrame:
        """
        Create lagged variables.

        Parameters
        ----------
        variable : str
            Variable to lag
        lags : int or list of int, default=1
            Lag order(s). Can be a single integer or list of integers.

        Returns
        -------
        pd.DataFrame
            Data with lagged variable(s) added.
            Column names will be 'L{lag}.{variable}'

        Raises
        ------
        ValueError
            If ``variable`` is not a column of the data or a lag order is
            smaller than 1.

        Examples
        --------
        >>> # Create single lag
        >>> data_lag1 = panel.lag('invest', lags=1)
        >>>
        >>> # Create multiple lags
        >>> data_lags = panel.lag('invest', lags=[1, 2, 3])
        """
        return self._shifted(variable, lags, prefix='L', label='Lag', direction=1)

    def lead(
        self,
        variable: str,
        leads: Union[int, List[int]] = 1
    ) -> pd.DataFrame:
        """
        Create lead variables (forward lags).

        Parameters
        ----------
        variable : str
            Variable to lead
        leads : int or list of int, default=1
            Lead order(s). Can be a single integer or list of integers.

        Returns
        -------
        pd.DataFrame
            Data with lead variable(s) added.
            Column names will be 'F{lead}.{variable}'

        Raises
        ------
        ValueError
            If ``variable`` is not a column of the data or a lead order is
            smaller than 1.

        Examples
        --------
        >>> data_lead = panel.lead('invest', leads=1)
        """
        return self._shifted(variable, leads, prefix='F', label='Lead', direction=-1)

    def balance(self) -> 'PanelData':
        """
        Balance the panel by keeping only entities with complete time series.

        Entities with fewer observations than the best-covered entity are
        removed.

        Returns
        -------
        PanelData
            ``self`` when the panel is already balanced, otherwise a new
            PanelData object restricted to the complete entities

        Examples
        --------
        >>> balanced_panel = panel.balance()
        >>> print(f"Original: {panel.n_entities} entities")
        >>> print(f"Balanced: {balanced_panel.n_entities} entities")
        """
        if self.is_balanced:
            return self

        # Entities whose observation count reaches the maximum are complete.
        obs_counts = self.data.groupby(self.entity_col).size()
        complete_entities = obs_counts[obs_counts == self.n_periods].index

        keep = self.data[self.entity_col].isin(complete_entities)
        return PanelData(self.data[keep], self.entity_col, self.time_col)

    def summary(self) -> str:
        """
        Generate a summary of the panel structure.

        Returns
        -------
        str
            Formatted summary of panel characteristics

        Examples
        --------
        >>> print(panel.summary())
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 60

        lines = [
            heavy_rule,
            "PANEL DATA SUMMARY",
            heavy_rule,
            f"Entity identifier: {self.entity_col}",
            f"Time identifier: {self.time_col}",
            light_rule,
            f"Number of entities: {self.n_entities:,}",
            f"Number of time periods: {len(self.time_periods):,}",
            f"Total observations: {self.n_obs:,}",
            light_rule,
            f"Balanced: {'Yes' if self.is_balanced else 'No'}",
        ]

        if self.is_balanced:
            lines.append(f"Periods per entity: {self.n_periods}")
        else:
            lines.append(f"Min periods per entity: {self.min_periods}")
            lines.append(f"Max periods per entity: {self.n_periods}")
            lines.append(f"Avg periods per entity: {self.avg_periods:.1f}")

        lines.append(light_rule)
        lines.append(f"Time period range: {self.time_periods.min()} to {self.time_periods.max()}")
        lines.append(heavy_rule)

        return "\n".join(lines)

    def __repr__(self) -> str:
        """String representation of PanelData."""
        balanced_str = "Balanced" if self.is_balanced else "Unbalanced"
        return (f"PanelData({balanced_str}, "
                f"n_entities={self.n_entities}, "
                f"n_periods={self.n_periods}, "
                f"n_obs={self.n_obs})")
|