panelbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelbox/__init__.py +67 -0
- panelbox/__version__.py +14 -0
- panelbox/cli/__init__.py +0 -0
- panelbox/cli/{commands}/__init__.py +0 -0
- panelbox/core/__init__.py +0 -0
- panelbox/core/base_model.py +164 -0
- panelbox/core/formula_parser.py +318 -0
- panelbox/core/panel_data.py +387 -0
- panelbox/core/results.py +366 -0
- panelbox/datasets/__init__.py +0 -0
- panelbox/datasets/{data}/__init__.py +0 -0
- panelbox/gmm/__init__.py +65 -0
- panelbox/gmm/difference_gmm.py +645 -0
- panelbox/gmm/estimator.py +562 -0
- panelbox/gmm/instruments.py +580 -0
- panelbox/gmm/results.py +550 -0
- panelbox/gmm/system_gmm.py +621 -0
- panelbox/gmm/tests.py +535 -0
- panelbox/models/__init__.py +11 -0
- panelbox/models/dynamic/__init__.py +0 -0
- panelbox/models/iv/__init__.py +0 -0
- panelbox/models/static/__init__.py +13 -0
- panelbox/models/static/fixed_effects.py +516 -0
- panelbox/models/static/pooled_ols.py +298 -0
- panelbox/models/static/random_effects.py +512 -0
- panelbox/report/__init__.py +61 -0
- panelbox/report/asset_manager.py +410 -0
- panelbox/report/css_manager.py +472 -0
- panelbox/report/exporters/__init__.py +15 -0
- panelbox/report/exporters/html_exporter.py +440 -0
- panelbox/report/exporters/latex_exporter.py +510 -0
- panelbox/report/exporters/markdown_exporter.py +446 -0
- panelbox/report/renderers/__init__.py +11 -0
- panelbox/report/renderers/static/__init__.py +0 -0
- panelbox/report/renderers/static_validation_renderer.py +341 -0
- panelbox/report/report_manager.py +502 -0
- panelbox/report/template_manager.py +337 -0
- panelbox/report/transformers/__init__.py +0 -0
- panelbox/report/transformers/static/__init__.py +0 -0
- panelbox/report/validation_transformer.py +449 -0
- panelbox/standard_errors/__init__.py +0 -0
- panelbox/templates/__init__.py +0 -0
- panelbox/templates/assets/css/base_styles.css +382 -0
- panelbox/templates/assets/css/report_components.css +747 -0
- panelbox/templates/assets/js/tab-navigation.js +161 -0
- panelbox/templates/assets/js/utils.js +276 -0
- panelbox/templates/common/footer.html +24 -0
- panelbox/templates/common/header.html +44 -0
- panelbox/templates/common/meta.html +5 -0
- panelbox/templates/validation/interactive/index.html +272 -0
- panelbox/templates/validation/interactive/partials/charts.html +58 -0
- panelbox/templates/validation/interactive/partials/methodology.html +201 -0
- panelbox/templates/validation/interactive/partials/overview.html +146 -0
- panelbox/templates/validation/interactive/partials/recommendations.html +101 -0
- panelbox/templates/validation/interactive/partials/test_results.html +231 -0
- panelbox/utils/__init__.py +0 -0
- panelbox/utils/formatting.py +172 -0
- panelbox/utils/matrix_ops.py +233 -0
- panelbox/utils/statistical.py +173 -0
- panelbox/validation/__init__.py +58 -0
- panelbox/validation/base.py +175 -0
- panelbox/validation/cointegration/__init__.py +0 -0
- panelbox/validation/cross_sectional_dependence/__init__.py +13 -0
- panelbox/validation/cross_sectional_dependence/breusch_pagan_lm.py +222 -0
- panelbox/validation/cross_sectional_dependence/frees.py +297 -0
- panelbox/validation/cross_sectional_dependence/pesaran_cd.py +188 -0
- panelbox/validation/heteroskedasticity/__init__.py +13 -0
- panelbox/validation/heteroskedasticity/breusch_pagan.py +222 -0
- panelbox/validation/heteroskedasticity/modified_wald.py +172 -0
- panelbox/validation/heteroskedasticity/white.py +208 -0
- panelbox/validation/instruments/__init__.py +0 -0
- panelbox/validation/robustness/__init__.py +0 -0
- panelbox/validation/serial_correlation/__init__.py +13 -0
- panelbox/validation/serial_correlation/baltagi_wu.py +220 -0
- panelbox/validation/serial_correlation/breusch_godfrey.py +260 -0
- panelbox/validation/serial_correlation/wooldridge_ar.py +200 -0
- panelbox/validation/specification/__init__.py +16 -0
- panelbox/validation/specification/chow.py +273 -0
- panelbox/validation/specification/hausman.py +264 -0
- panelbox/validation/specification/mundlak.py +331 -0
- panelbox/validation/specification/reset.py +273 -0
- panelbox/validation/unit_root/__init__.py +0 -0
- panelbox/validation/validation_report.py +257 -0
- panelbox/validation/validation_suite.py +401 -0
- panelbox-0.2.0.dist-info/METADATA +337 -0
- panelbox-0.2.0.dist-info/RECORD +90 -0
- panelbox-0.2.0.dist-info/WHEEL +5 -0
- panelbox-0.2.0.dist-info/entry_points.txt +2 -0
- panelbox-0.2.0.dist-info/licenses/LICENSE +21 -0
- panelbox-0.2.0.dist-info/top_level.txt +1 -0
panelbox/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PanelBox - Panel Data Econometrics in Python
|
|
3
|
+
|
|
4
|
+
PanelBox provides comprehensive tools for panel data econometrics,
|
|
5
|
+
inspired by Stata (xtabond2), R (plm), and statsmodels.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Static panel models: Pooled OLS, Fixed Effects, Random Effects
|
|
9
|
+
- Dynamic panel GMM: Arellano-Bond (1991), Blundell-Bond (1998)
|
|
10
|
+
- Robust to unbalanced panels
|
|
11
|
+
- Comprehensive specification tests
|
|
12
|
+
- Publication-ready reporting
|
|
13
|
+
|
|
14
|
+
Quick Start:
|
|
15
|
+
>>> from panelbox import DifferenceGMM
|
|
16
|
+
>>> gmm = DifferenceGMM(data=df, dep_var='y', lags=1, id_var='id', time_var='year')
|
|
17
|
+
>>> results = gmm.fit()
|
|
18
|
+
>>> print(results.summary())
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from panelbox.__version__ import __version__, __author__, __email__, __license__
|
|
22
|
+
|
|
23
|
+
# Core classes
|
|
24
|
+
from panelbox.core.panel_data import PanelData
|
|
25
|
+
from panelbox.core.formula_parser import FormulaParser, parse_formula
|
|
26
|
+
from panelbox.core.results import PanelResults
|
|
27
|
+
|
|
28
|
+
# Static panel models
|
|
29
|
+
from panelbox.models.static.pooled_ols import PooledOLS
|
|
30
|
+
from panelbox.models.static.fixed_effects import FixedEffects
|
|
31
|
+
from panelbox.models.static.random_effects import RandomEffects
|
|
32
|
+
|
|
33
|
+
# Dynamic panel GMM models
|
|
34
|
+
from panelbox.gmm.difference_gmm import DifferenceGMM
|
|
35
|
+
from panelbox.gmm.system_gmm import SystemGMM
|
|
36
|
+
from panelbox.gmm.results import GMMResults
|
|
37
|
+
|
|
38
|
+
# Tests
|
|
39
|
+
from panelbox.validation.specification.hausman import HausmanTest, HausmanTestResult
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
# Version
|
|
43
|
+
'__version__',
|
|
44
|
+
'__author__',
|
|
45
|
+
'__email__',
|
|
46
|
+
'__license__',
|
|
47
|
+
|
|
48
|
+
# Core
|
|
49
|
+
'PanelData',
|
|
50
|
+
'FormulaParser',
|
|
51
|
+
'parse_formula',
|
|
52
|
+
'PanelResults',
|
|
53
|
+
|
|
54
|
+
# Static Models
|
|
55
|
+
'PooledOLS',
|
|
56
|
+
'FixedEffects',
|
|
57
|
+
'RandomEffects',
|
|
58
|
+
|
|
59
|
+
# GMM Models
|
|
60
|
+
'DifferenceGMM',
|
|
61
|
+
'SystemGMM',
|
|
62
|
+
'GMMResults',
|
|
63
|
+
|
|
64
|
+
# Tests
|
|
65
|
+
'HausmanTest',
|
|
66
|
+
'HausmanTestResult',
|
|
67
|
+
]
|
panelbox/__version__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Version information for panelbox."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.2.0"
|
|
4
|
+
__author__ = "Gustavo Haase, Paulo Dourado"
|
|
5
|
+
__email__ = "gustavo.haase@gmail.com"
|
|
6
|
+
__license__ = "MIT"
|
|
7
|
+
|
|
8
|
+
# Version history
|
|
9
|
+
# 0.2.0 (2026-01-21): GMM implementation complete (Difference & System GMM)
|
|
10
|
+
# - Arellano-Bond (1991) Difference GMM
|
|
11
|
+
# - Blundell-Bond (1998) System GMM
|
|
12
|
+
# - Robust to unbalanced panels
|
|
13
|
+
# - Comprehensive documentation
|
|
14
|
+
# 0.1.0 (Initial): Core panel data models (FE, RE, OLS)
|
panelbox/cli/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base model class for panel econometric models.
|
|
3
|
+
|
|
4
|
+
This module provides the abstract base class that all panel models inherit from.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Optional, Any
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from panelbox.core.panel_data import PanelData
|
|
13
|
+
from panelbox.core.formula_parser import FormulaParser
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PanelModel(ABC):
    """
    Common foundation shared by every panel estimator.

    Concrete estimators (PooledOLS, FixedEffects, RandomEffects, GMM, ...)
    subclass this and supply ``fit`` and ``_estimate_coefficients``.

    Parameters
    ----------
    formula : str
        R-style model formula, e.g. ``"y ~ x1 + x2"``
    data : pd.DataFrame
        Long-format panel data. An object that is already a ``PanelData``
        instance is stored as-is instead of being wrapped again.
    entity_col : str
        Column holding the entity identifier
    time_col : str
        Column holding the time identifier
    weights : np.ndarray, optional
        Observation weights; when given, its length must equal
        ``data.n_obs``

    Attributes
    ----------
    formula : str
        The formula exactly as passed in
    data : PanelData
        Panel data container
    weights : np.ndarray, optional
        Observation weights
    formula_parser : FormulaParser
        Result of ``FormulaParser(formula).parse()``
    _fitted : bool
        Flag consulted by ``validate`` and ``__repr__``; starts as False
    _results : PanelResults, optional
        Fitted model results; starts as None

    Examples
    --------
    This class cannot be instantiated directly. See concrete
    implementations such as PooledOLS or FixedEffects for usage.
    """

    def __init__(
        self,
        formula: str,
        data: pd.DataFrame,
        entity_col: str,
        time_col: str,
        weights: Optional[np.ndarray] = None
    ):
        self.formula = formula

        # Wrap raw frames; an existing PanelData container is reused untouched.
        self.data = (
            data if isinstance(data, PanelData)
            else PanelData(data, entity_col, time_col)
        )

        # Weights are optional, but when present need one entry per observation.
        self.weights = weights
        if weights is not None and len(weights) != self.data.n_obs:
            raise ValueError(
                f"weights must have length {self.data.n_obs}, got {len(weights)}"
            )

        parser = FormulaParser(formula)
        self.formula_parser = parser.parse()

        # Estimation state, to be updated by subclasses when fitting.
        self._fitted = False
        self._results: Optional[Any] = None

    @abstractmethod
    def fit(self, **kwargs) -> 'PanelResults':
        """
        Fit the model (supplied by each subclass).

        Parameters
        ----------
        **kwargs
            Estimator-specific fitting options

        Returns
        -------
        PanelResults
            Fitted model results
        """

    @abstractmethod
    def _estimate_coefficients(self) -> np.ndarray:
        """
        Run the core estimation routine (supplied by each subclass).

        Returns
        -------
        np.ndarray
            Estimated coefficients
        """

    def validate(
        self,
        tests: Optional[list] = None,
        verbose: bool = True
    ) -> 'ValidationReport':
        """
        Run the validation suite against the fitted results.

        Parameters
        ----------
        tests : list, optional
            Tests to run; forwarded unchanged to ``ValidationSuite.run``
        verbose : bool, default=True
            Forwarded unchanged to ``ValidationSuite.run``

        Returns
        -------
        ValidationReport
            Whatever ``ValidationSuite.run`` returns

        Raises
        ------
        ValueError
            If the model has not been fitted yet
        """
        if self._fitted:
            # Deferred import: avoids a circular dependency at module load.
            from panelbox.validation.validation_suite import ValidationSuite

            return ValidationSuite(self._results).run(tests=tests, verbose=verbose)

        raise ValueError("Model must be fitted before validation. Call fit() first.")

    def __repr__(self) -> str:
        """Summarise the model: class, formula, panel size and fit status."""
        state = "not fitted" if not self._fitted else "fitted"
        fields = ", ".join([
            f"formula='{self.formula}'",
            f"n_entities={self.data.n_entities}",
            f"n_obs={self.data.n_obs}",
            f"status={state}",
        ])
        return f"{type(self).__name__}({fields})"
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FormulaParser - Parser for R-style formulas for panel models.
|
|
3
|
+
|
|
4
|
+
This module provides formula parsing functionality similar to R's formula syntax,
|
|
5
|
+
adapted for panel data econometrics.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Any
|
|
9
|
+
import re
|
|
10
|
+
import patsy
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FormulaParser:
    """
    Parser for R-style formulas with panel-specific extensions.

    Supports standard R formula syntax:
    - Basic: "y ~ x1 + x2"
    - Interactions: "y ~ x1 * x2" (expands to x1 + x2 + x1:x2)
    - Transformations: "y ~ log(x1) + I(x2**2)"

    For GMM models (future implementation), supports:
    - Lags: "y ~ L(y, 1:2)" for lagged variables
    - Instruments: "y ~ x1 + x2 | gmm(y, 2:4) + iv(x1)"

    Intercept suppression (``- 1``, ``+ 0`` or a leading ``0 +``) is only
    recognised as a stand-alone term outside brackets, so expressions such
    as ``I(x - 1)`` are left untouched.

    Parameters
    ----------
    formula : str
        Formula string in R-style syntax

    Attributes
    ----------
    formula : str
        Original formula string (whitespace-stripped)
    dependent : str
        Name of dependent variable
    regressors : List[str]
        List of regressor variable names
    has_intercept : bool
        Whether model includes intercept
    has_instruments : bool
        Whether formula includes instrument specification (for GMM)

    Raises
    ------
    TypeError
        If ``formula`` is not a string
    ValueError
        If ``formula`` contains no '~'

    Examples
    --------
    >>> parser = FormulaParser("y ~ x1 + x2").parse()
    >>> print(parser.dependent)
    y
    >>> print(parser.regressors)
    ['x1', 'x2']
    """

    def __init__(self, formula: str):
        if not isinstance(formula, str):
            raise TypeError("formula must be a string")

        if '~' not in formula:
            raise ValueError("formula must contain '~' separating dependent and independent variables")

        self.formula = formula.strip()
        self.dependent: Optional[str] = None
        self.regressors: List[str] = []
        self.has_intercept: bool = True
        self.has_instruments: bool = False
        self._instrument_spec: Optional[str] = None
        # Right-hand side handed to patsy; filled in by parse().
        self._rhs: str = ""
        self._parsed: bool = False

    def parse(self) -> 'FormulaParser':
        """
        Parse the formula string.

        Populates ``dependent``, ``regressors``, ``has_intercept``,
        ``has_instruments`` and the internal right-hand side used later by
        ``build_design_matrices``.

        Returns
        -------
        FormulaParser
            Self (for method chaining)

        Raises
        ------
        ValueError
            If the formula does not contain exactly one '~'

        Examples
        --------
        >>> parser = FormulaParser("y ~ x1 + x2").parse()
        """
        # Split on ~ to get LHS and RHS
        parts = self.formula.split('~')
        if len(parts) != 2:
            raise ValueError("formula must have exactly one '~'")

        lhs, rhs = parts[0].strip(), parts[1].strip()

        # Parse dependent variable (LHS)
        self.dependent = lhs

        # Check for instruments (for GMM - future)
        if '|' in rhs:
            self.has_instruments = True
            rhs_parts = rhs.split('|')
            rhs = rhs_parts[0].strip()
            self._instrument_spec = rhs_parts[1].strip()

        # Detect intercept suppression on stand-alone top-level terms only.
        # A substring/regex scan would also hit "I(x-1)" or "x - 10" and
        # silently rewrite the user's expression.
        signed_terms = self._split_top_level(rhs, '+-')
        kept_terms = [
            (sign, term) for sign, term in signed_terms
            if not self._suppresses_intercept(sign, term)
        ]
        if len(kept_terms) != len(signed_terms):
            self.has_intercept = False
            # Rebuild the RHS without the suppression marker(s);
            # build_design_matrices re-appends "- 1" for patsy.
            rhs = self._join_signed_terms(kept_terms)

        # Store RHS for later use with patsy
        self._rhs = rhs.strip()

        # Basic variable names only; patsy expands the full design later.
        self.regressors = self._extract_variable_names(rhs)

        self._parsed = True
        return self

    @staticmethod
    def _split_top_level(expr: str, separators: str) -> List[Tuple[str, str]]:
        """Split ``expr`` on separator characters found outside brackets.

        Returns ``(separator, term)`` pairs where ``separator`` is the
        character that preceded the stripped ``term`` ('' for the first).
        """
        pieces: List[Tuple[str, str]] = []
        depth = 0
        pending_sep = ''
        buffer: List[str] = []

        for char in expr:
            if char in '([':
                depth += 1
            elif char in ')]':
                # Clamp so a stray closing bracket cannot disable splitting.
                depth = max(depth - 1, 0)

            if depth == 0 and char in separators:
                pieces.append((pending_sep, ''.join(buffer).strip()))
                pending_sep = char
                buffer = []
            else:
                buffer.append(char)

        pieces.append((pending_sep, ''.join(buffer).strip()))
        return pieces

    @staticmethod
    def _suppresses_intercept(sign: str, term: str) -> bool:
        """Return True for the terms ``- 1``, ``+ 0`` and a leading ``0``."""
        if sign == '-':
            return term == '1'
        return term == '0'

    @staticmethod
    def _join_signed_terms(signed_terms: List[Tuple[str, str]]) -> str:
        """Reassemble ``(sign, term)`` pairs into a formula fragment."""
        chunks: List[str] = []
        for sign, term in signed_terms:
            if chunks:
                chunks.append(f"{sign} {term}")
            elif term:
                # First real term: a leading '+' is implicit.
                chunks.append(f"- {term}" if sign == '-' else term)
            # else: empty leading piece produced by a unary sign; skip it.
        return " ".join(chunks)

    def _extract_variable_names(self, rhs: str) -> List[str]:
        """
        Extract basic variable names from RHS.

        This is a simple extraction that doesn't handle all transformations.
        Patsy will handle the full parsing when building design matrices.

        Terms are separated on '+' / '-' and factors on ':' / '*', in both
        cases only outside brackets, so ``I(x**2)`` and ``I(a + b)`` stay
        intact. Terms removed with '-' and the constants 0 / 1 are skipped.

        Parameters
        ----------
        rhs : str
            Right-hand side of formula

        Returns
        -------
        List[str]
            List of variable names, in order of first appearance
        """
        variables: List[str] = []

        for sign, term in self._split_top_level(rhs, '+-'):
            if sign == '-' or not term or term in ('0', '1'):
                continue

            # Handle simple cases: x, log(x), I(x**2), x:y, x*y
            for _, factor in self._split_top_level(term, ':*'):
                var = self._extract_var_from_term(factor)
                if var and var not in variables:
                    variables.append(var)

        return variables

    def _extract_var_from_term(self, term: str) -> Optional[str]:
        """
        Extract variable name from a single term.

        Parameters
        ----------
        term : str
            Single term from formula

        Returns
        -------
        Optional[str]
            Variable name, or None if not extractable
        """
        term = term.strip()

        # Function call like log(x), np.log(x), I(x**2)
        func_match = re.match(r'(?:\w+\.)*(\w+)\((.*)\)', term)
        if func_match:
            func_name = func_match.group(1)
            arg = func_match.group(2)

            if func_name == 'I':
                # Only the first identifier inside I(...) is reported; the
                # return type allows a single name.
                var_matches = re.findall(r'\b([a-zA-Z_]\w*)\b', arg)
                return var_matches[0] if var_matches else None

            # For other functions, return argument if it's a variable name
            if re.match(r'^[a-zA-Z_]\w*$', arg.strip()):
                return arg.strip()
            return None

        # Simple variable name
        if re.match(r'^[a-zA-Z_]\w*$', term):
            return term

        return None

    def build_design_matrices(
        self,
        data: pd.DataFrame,
        return_type: str = 'dataframe'
    ) -> Tuple[Any, Any]:
        """
        Build design matrices using patsy.

        Parameters
        ----------
        data : pd.DataFrame
            Data containing variables referenced in formula
        return_type : str, default='dataframe'
            Return type: 'dataframe', 'matrix', or 'array'
            - 'dataframe': returns a pandas Series (y) and DataFrame (X)
            - 'matrix': returns patsy DesignMatrix objects
            - 'array': returns numpy arrays

        Returns
        -------
        y : Series, DesignMatrix, or ndarray
            Dependent variable
        X : DataFrame, DesignMatrix, or ndarray
            Design matrix for independent variables

        Raises
        ------
        ValueError
            If ``return_type`` is not one of the supported values

        Examples
        --------
        >>> parser = FormulaParser("y ~ x1 + x2").parse()
        >>> y, X = parser.build_design_matrices(data)
        """
        if not self._parsed:
            self.parse()

        # Reject bad options before doing any matrix construction.
        if return_type not in ('dataframe', 'matrix', 'array'):
            raise ValueError("return_type must be 'dataframe', 'matrix', or 'array'")

        # Patsy will handle intercept automatically unless we specify -1
        if self.has_intercept:
            patsy_formula = f"{self.dependent} ~ {self._rhs}"
        else:
            patsy_formula = f"{self.dependent} ~ {self._rhs} - 1"

        if return_type == 'matrix':
            # Return patsy DesignMatrix objects
            y, X = patsy.dmatrices(patsy_formula, data, return_type='matrix')
            return y, X

        # Build once; 'dataframe' and 'array' share the same patsy output.
        y_mat, X_mat = patsy.dmatrices(patsy_formula, data, return_type='dataframe')

        if return_type == 'dataframe':
            # y_mat is a DataFrame with one column, extract as Series
            return y_mat.iloc[:, 0], X_mat

        return y_mat.values.ravel(), X_mat.values

    def get_variable_names(self, data: pd.DataFrame) -> List[str]:
        """
        Get the names of variables in the design matrix.

        Parameters
        ----------
        data : pd.DataFrame
            Data containing variables

        Returns
        -------
        List[str]
            List of column names in design matrix

        Examples
        --------
        >>> parser = FormulaParser("y ~ x1 + x2").parse()
        >>> var_names = parser.get_variable_names(data)
        >>> print(var_names)
        ['Intercept', 'x1', 'x2']
        """
        _, X = self.build_design_matrices(data, return_type='dataframe')
        return list(X.columns)

    def __repr__(self) -> str:
        """String representation."""
        if self._parsed:
            return f"FormulaParser('{self.formula}', dependent='{self.dependent}', k={len(self.regressors)})"
        return f"FormulaParser('{self.formula}', unparsed)"
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def parse_formula(formula: str) -> FormulaParser:
    """
    Build a ``FormulaParser`` for ``formula`` and parse it in one step.

    Parameters
    ----------
    formula : str
        Formula string

    Returns
    -------
    FormulaParser
        The value returned by ``FormulaParser(formula).parse()``

    Examples
    --------
    >>> parser = parse_formula("y ~ x1 + x2")
    >>> print(parser.dependent)
    y
    """
    parser = FormulaParser(formula)
    return parser.parse()
|