panelbox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. panelbox/__init__.py +67 -0
  2. panelbox/__version__.py +14 -0
  3. panelbox/cli/__init__.py +0 -0
  4. panelbox/cli/{commands}/__init__.py +0 -0
  5. panelbox/core/__init__.py +0 -0
  6. panelbox/core/base_model.py +164 -0
  7. panelbox/core/formula_parser.py +318 -0
  8. panelbox/core/panel_data.py +387 -0
  9. panelbox/core/results.py +366 -0
  10. panelbox/datasets/__init__.py +0 -0
  11. panelbox/datasets/{data}/__init__.py +0 -0
  12. panelbox/gmm/__init__.py +65 -0
  13. panelbox/gmm/difference_gmm.py +645 -0
  14. panelbox/gmm/estimator.py +562 -0
  15. panelbox/gmm/instruments.py +580 -0
  16. panelbox/gmm/results.py +550 -0
  17. panelbox/gmm/system_gmm.py +621 -0
  18. panelbox/gmm/tests.py +535 -0
  19. panelbox/models/__init__.py +11 -0
  20. panelbox/models/dynamic/__init__.py +0 -0
  21. panelbox/models/iv/__init__.py +0 -0
  22. panelbox/models/static/__init__.py +13 -0
  23. panelbox/models/static/fixed_effects.py +516 -0
  24. panelbox/models/static/pooled_ols.py +298 -0
  25. panelbox/models/static/random_effects.py +512 -0
  26. panelbox/report/__init__.py +61 -0
  27. panelbox/report/asset_manager.py +410 -0
  28. panelbox/report/css_manager.py +472 -0
  29. panelbox/report/exporters/__init__.py +15 -0
  30. panelbox/report/exporters/html_exporter.py +440 -0
  31. panelbox/report/exporters/latex_exporter.py +510 -0
  32. panelbox/report/exporters/markdown_exporter.py +446 -0
  33. panelbox/report/renderers/__init__.py +11 -0
  34. panelbox/report/renderers/static/__init__.py +0 -0
  35. panelbox/report/renderers/static_validation_renderer.py +341 -0
  36. panelbox/report/report_manager.py +502 -0
  37. panelbox/report/template_manager.py +337 -0
  38. panelbox/report/transformers/__init__.py +0 -0
  39. panelbox/report/transformers/static/__init__.py +0 -0
  40. panelbox/report/validation_transformer.py +449 -0
  41. panelbox/standard_errors/__init__.py +0 -0
  42. panelbox/templates/__init__.py +0 -0
  43. panelbox/templates/assets/css/base_styles.css +382 -0
  44. panelbox/templates/assets/css/report_components.css +747 -0
  45. panelbox/templates/assets/js/tab-navigation.js +161 -0
  46. panelbox/templates/assets/js/utils.js +276 -0
  47. panelbox/templates/common/footer.html +24 -0
  48. panelbox/templates/common/header.html +44 -0
  49. panelbox/templates/common/meta.html +5 -0
  50. panelbox/templates/validation/interactive/index.html +272 -0
  51. panelbox/templates/validation/interactive/partials/charts.html +58 -0
  52. panelbox/templates/validation/interactive/partials/methodology.html +201 -0
  53. panelbox/templates/validation/interactive/partials/overview.html +146 -0
  54. panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
  55. panelbox/templates/validation/interactive/partials/test_results.html +231 -0
  56. panelbox/utils/__init__.py +0 -0
  57. panelbox/utils/formatting.py +172 -0
  58. panelbox/utils/matrix_ops.py +233 -0
  59. panelbox/utils/statistical.py +173 -0
  60. panelbox/validation/__init__.py +58 -0
  61. panelbox/validation/base.py +175 -0
  62. panelbox/validation/cointegration/__init__.py +0 -0
  63. panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
  64. panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
  65. panelbox/validation/cross_sectional_dependence/frees.py +297 -0
  66. panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
  67. panelbox/validation/heteroskedasticity/__init__.py +13 -0
  68. panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
  69. panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
  70. panelbox/validation/heteroskedasticity/white.py +208 -0
  71. panelbox/validation/instruments/__init__.py +0 -0
  72. panelbox/validation/robustness/__init__.py +0 -0
  73. panelbox/validation/serial_correlation/__init__.py +13 -0
  74. panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
  75. panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
  76. panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
  77. panelbox/validation/specification/__init__.py +16 -0
  78. panelbox/validation/specification/chow.py +273 -0
  79. panelbox/validation/specification/hausman.py +264 -0
  80. panelbox/validation/specification/mundlak.py +331 -0
  81. panelbox/validation/specification/reset.py +273 -0
  82. panelbox/validation/unit_root/__init__.py +0 -0
  83. panelbox/validation/validation_report.py +257 -0
  84. panelbox/validation/validation_suite.py +401 -0
  85. panelbox-0.2.0.dist-info/METADATA +337 -0
  86. panelbox-0.2.0.dist-info/RECORD +90 -0
  87. panelbox-0.2.0.dist-info/WHEEL +5 -0
  88. panelbox-0.2.0.dist-info/entry_points.txt +2 -0
  89. panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
  90. panelbox-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,387 @@
1
+ """
2
+ PanelData - Container for panel data with validation and transformations.
3
+
4
+ This module provides the core PanelData class for handling panel datasets.
5
+ """
6
+
7
+ from typing import Optional, Union, List
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
class PanelData:
    """
    Container for panel data with validation and transformations.

    This class provides a structured way to work with panel data, checking
    that the identifier columns exist and offering common transformations
    used in panel econometrics. The stored data is a sorted copy of the
    input (by entity, then time) with a fresh 0..n-1 index; the caller's
    DataFrame is never modified.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data in long format (one row per entity-time observation)
    entity_col : str
        Name of the column identifying entities (e.g., 'firm', 'country', 'id')
    time_col : str
        Name of the column identifying time periods (e.g., 'year', 'quarter', 'time')

    Attributes
    ----------
    data : pd.DataFrame
        Copy of the input data, sorted by entity and time
    entity_col : str
        Entity identifier column name
    time_col : str
        Time identifier column name
    is_balanced : bool
        Whether all entities have the same number of observations
    n_entities : int
        Number of unique entities
    n_periods : int
        Maximum number of observations held by any single entity
        (0 for an empty panel)
    min_periods : int
        Minimum number of observations held by any single entity
    avg_periods : float
        Average number of observations per entity
    n_obs : int
        Total number of observations
    entities : np.ndarray
        Array of unique entity identifiers
    time_periods : np.ndarray
        Array of unique time periods

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame.
    ValueError
        If ``entity_col`` or ``time_col`` is not a column of ``data``.

    Notes
    -----
    Balance is judged purely from the number of rows per entity. Duplicate
    entity-time rows are not detected, and two entities with the same row
    count but different time periods are still reported as balanced.

    Examples
    --------
    >>> import pandas as pd
    >>> import panelbox as pb
    >>>
    >>> # Create sample panel data
    >>> data = pd.DataFrame({
    ...     'firm': [1, 1, 1, 2, 2, 2],
    ...     'year': [2020, 2021, 2022, 2020, 2021, 2022],
    ...     'invest': [100, 110, 115, 200, 210, 220],
    ...     'value': [1000, 1100, 1200, 2000, 2100, 2200]
    ... })
    >>>
    >>> # Create PanelData object
    >>> panel = pb.PanelData(data, entity_col='firm', time_col='year')
    >>> print(panel.summary())
    """

    def __init__(
        self,
        data: pd.DataFrame,
        entity_col: str,
        time_col: str
    ):
        # Validate inputs
        if not isinstance(data, pd.DataFrame):
            raise TypeError("data must be a pandas DataFrame")

        if entity_col not in data.columns:
            raise ValueError(f"entity_col '{entity_col}' not found in data columns")

        if time_col not in data.columns:
            raise ValueError(f"time_col '{time_col}' not found in data columns")

        self.entity_col = entity_col
        self.time_col = time_col

        # Work on a private copy, sorted by entity and time so that the
        # position-based transformations (diff/shift) follow time order.
        self.data = (
            data.copy()
            .sort_values([entity_col, time_col])
            .reset_index(drop=True)
        )

        # Compute panel structure
        self.entities = self.data[entity_col].unique()
        self.time_periods = self.data[time_col].unique()
        self.n_entities = len(self.entities)
        self.n_obs = len(self.data)

        obs_per_entity = self.data.groupby(entity_col).size()
        if obs_per_entity.empty:
            # max()/min() of an empty Series is NaN and int(NaN) raises, so
            # an empty panel is described explicitly instead of crashing.
            self.n_periods = 0
            self.min_periods = 0
            self.avg_periods = 0.0
            self.is_balanced = True
        else:
            self.n_periods = int(obs_per_entity.max())
            # Always defined, so callers need not check is_balanced first.
            self.min_periods = int(obs_per_entity.min())
            self.avg_periods = float(obs_per_entity.mean())
            # Cast numpy.bool_ to a plain bool, as documented.
            self.is_balanced = bool((obs_per_entity == self.n_periods).all())

    def _resolve_variables(
        self,
        variables: Optional[Union[str, List[str]]]
    ) -> List[str]:
        """Normalize a variable selection to a validated list of column names."""
        if variables is None:
            # Default: every numeric column except the two identifiers.
            numeric_cols = self.data.select_dtypes(include=[np.number]).columns
            identifiers = (self.entity_col, self.time_col)
            return [col for col in numeric_cols if col not in identifiers]

        if isinstance(variables, str):
            names = [variables]
        else:
            # list() so that tuples / Index objects select several columns
            # instead of being treated as one (tuple) key.
            names = list(variables)

        for var in names:
            if var not in self.data.columns:
                raise ValueError(f"Variable '{var}' not found in data")
        return names

    @staticmethod
    def _as_order_list(orders: Union[int, List[int]]) -> List[int]:
        """Wrap a scalar lag/lead order (Python or NumPy integer) in a list."""
        if isinstance(orders, (int, np.integer)):
            return [int(orders)]
        return list(orders)

    def demeaning(
        self,
        variables: Optional[Union[str, List[str]]] = None,
        method: str = 'entity'
    ) -> pd.DataFrame:
        """
        Remove means from variables (within transformation).

        This is the core transformation for fixed effects estimation.

        Parameters
        ----------
        variables : str or list of str, optional
            Variables to demean. If None, demeans all numeric columns
            except entity and time identifiers.
        method : str, default='entity'
            Type of demeaning:
            - 'entity': Remove entity-specific means (within transformation)
            - 'time': Remove time-specific means
            - 'both': Remove entity means, then time means of the result

        Returns
        -------
        pd.DataFrame
            Copy of the data with the selected variables demeaned; all
            other columns are unchanged.

        Raises
        ------
        ValueError
            If a variable is not a column of the data, or ``method`` is not
            one of 'entity', 'time', 'both'.

        Notes
        -----
        'both' performs a single entity pass followed by a single time
        pass. On a balanced panel this equals the usual two-way within
        transformation; on an unbalanced panel the entity means are in
        general no longer exactly zero after the time pass.

        Examples
        --------
        >>> demeaned = panel.demeaning(['invest', 'value'], method='entity')
        """
        if method not in ('entity', 'time', 'both'):
            raise ValueError("method must be 'entity', 'time', or 'both'")

        variables = self._resolve_variables(variables)
        result = self.data.copy()
        if not variables:
            # Nothing to transform (e.g. no numeric columns).
            return result

        if method in ('entity', 'both'):
            entity_means = result.groupby(self.entity_col)[variables].transform('mean')
            result[variables] = result[variables] - entity_means

        if method in ('time', 'both'):
            # For 'both', the time means are taken from the entity-demeaned data.
            time_means = result.groupby(self.time_col)[variables].transform('mean')
            result[variables] = result[variables] - time_means

        return result

    def first_difference(
        self,
        variables: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """
        Compute first differences (Δy_it = y_it - y_i,t-1).

        This transformation eliminates time-invariant fixed effects.

        Parameters
        ----------
        variables : str or list of str, optional
            Variables to difference. If None, differences all numeric columns
            except entity and time identifiers.

        Returns
        -------
        pd.DataFrame
            First-differenced data. Rows where any differenced variable is
            NaN are dropped, which removes each entity's first observation
            and any row whose difference involves a missing value. The
            index is not reset.

        Raises
        ------
        ValueError
            If a variable is not a column of the data.

        Notes
        -----
        Differences are taken between consecutive rows of each entity. If
        an entity has gaps in its time periods, the difference spans the
        gap rather than exactly one period.

        Examples
        --------
        >>> diff_data = panel.first_difference(['invest', 'value'])
        """
        variables = self._resolve_variables(variables)
        result = self.data.copy()
        if not variables:
            return result

        # Compute differences within each entity
        by_entity = result.groupby(self.entity_col)
        for var in variables:
            result[var] = by_entity[var].diff()

        # Drop first observation for each entity (NaN from diff)
        return result.dropna(subset=variables)

    def lag(
        self,
        variable: str,
        lags: Union[int, List[int]] = 1
    ) -> pd.DataFrame:
        """
        Create lagged variables.

        Parameters
        ----------
        variable : str
            Variable to lag
        lags : int or list of int, default=1
            Lag order(s). Can be a single integer (Python or NumPy) or a
            list of integers.

        Returns
        -------
        pd.DataFrame
            Copy of the data with lagged variable(s) added.
            Column names will be 'L{lag}.{variable}'

        Raises
        ------
        ValueError
            If ``variable`` is not a column of the data or a lag order is
            smaller than 1.

        Notes
        -----
        Values are shifted by row position within each entity, so with
        gaps in the time dimension a lag refers to the previous observed
        row, not necessarily the previous period.

        Examples
        --------
        >>> # Create single lag
        >>> data_lag1 = panel.lag('invest', lags=1)
        >>>
        >>> # Create multiple lags
        >>> data_lags = panel.lag('invest', lags=[1, 2, 3])
        """
        if variable not in self.data.columns:
            raise ValueError(f"Variable '{variable}' not found in data")

        result = self.data.copy()
        series_by_entity = result.groupby(self.entity_col)[variable]

        for order in self._as_order_list(lags):
            if order < 1:
                raise ValueError("Lag order must be >= 1")
            result[f'L{order}.{variable}'] = series_by_entity.shift(order)

        return result

    def lead(
        self,
        variable: str,
        leads: Union[int, List[int]] = 1
    ) -> pd.DataFrame:
        """
        Create lead variables (forward lags).

        Parameters
        ----------
        variable : str
            Variable to lead
        leads : int or list of int, default=1
            Lead order(s). Can be a single integer (Python or NumPy) or a
            list of integers.

        Returns
        -------
        pd.DataFrame
            Copy of the data with lead variable(s) added.
            Column names will be 'F{lead}.{variable}'

        Raises
        ------
        ValueError
            If ``variable`` is not a column of the data or a lead order is
            smaller than 1.

        Notes
        -----
        Values are shifted by row position within each entity, so with
        gaps in the time dimension a lead refers to the next observed row,
        not necessarily the next period.

        Examples
        --------
        >>> data_lead = panel.lead('invest', leads=1)
        """
        if variable not in self.data.columns:
            raise ValueError(f"Variable '{variable}' not found in data")

        result = self.data.copy()
        series_by_entity = result.groupby(self.entity_col)[variable]

        for order in self._as_order_list(leads):
            if order < 1:
                raise ValueError("Lead order must be >= 1")
            result[f'F{order}.{variable}'] = series_by_entity.shift(-order)

        return result

    def balance(self) -> 'PanelData':
        """
        Balance the panel by keeping only entities with complete time series.

        An entity counts as complete when it has ``n_periods`` observations,
        i.e. as many rows as the entity with the most rows.

        Returns
        -------
        PanelData
            ``self`` if the panel is already balanced, otherwise a new
            PanelData object containing only the complete entities.

        Examples
        --------
        >>> balanced_panel = panel.balance()
        >>> print(f"Original: {panel.n_entities} entities")
        >>> print(f"Balanced: {balanced_panel.n_entities} entities")
        """
        if self.is_balanced:
            return self

        # Count observations per entity
        obs_counts = self.data.groupby(self.entity_col).size()

        # Keep only entities with max number of periods
        complete_entities = obs_counts[obs_counts == self.n_periods].index

        # Filter data
        keep_rows = self.data[self.entity_col].isin(complete_entities)

        return PanelData(self.data[keep_rows], self.entity_col, self.time_col)

    def summary(self) -> str:
        """
        Generate a summary of the panel structure.

        Returns
        -------
        str
            Formatted, newline-joined summary of panel characteristics

        Examples
        --------
        >>> print(panel.summary())
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 60

        lines = [
            heavy_rule,
            "PANEL DATA SUMMARY",
            heavy_rule,
            f"Entity identifier: {self.entity_col}",
            f"Time identifier: {self.time_col}",
            light_rule,
            f"Number of entities: {self.n_entities:,}",
            f"Number of time periods: {len(self.time_periods):,}",
            f"Total observations: {self.n_obs:,}",
            light_rule,
            f"Balanced: {'Yes' if self.is_balanced else 'No'}",
        ]

        if not self.is_balanced:
            lines.append(f"Min periods per entity: {self.min_periods}")
            lines.append(f"Max periods per entity: {self.n_periods}")
            lines.append(f"Avg periods per entity: {self.avg_periods:.1f}")
        else:
            lines.append(f"Periods per entity: {self.n_periods}")

        lines.append(light_rule)
        if len(self.time_periods) > 0:
            lines.append(
                f"Time period range: {self.time_periods.min()} to {self.time_periods.max()}"
            )
        else:
            # min()/max() of a zero-size array raise, so report no range.
            lines.append("Time period range: n/a")
        lines.append(heavy_rule)

        return "\n".join(lines)

    def __repr__(self) -> str:
        """String representation of PanelData."""
        balanced_str = "Balanced" if self.is_balanced else "Unbalanced"
        return (f"PanelData({balanced_str}, "
                f"n_entities={self.n_entities}, "
                f"n_periods={self.n_periods}, "
                f"n_obs={self.n_obs})")