statgpu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statgpu/__init__.py +174 -0
- statgpu/_base.py +544 -0
- statgpu/_config.py +127 -0
- statgpu/anova/__init__.py +5 -0
- statgpu/anova/_oneway.py +194 -0
- statgpu/backends/__init__.py +83 -0
- statgpu/backends/_array_ops.py +529 -0
- statgpu/backends/_base.py +184 -0
- statgpu/backends/_cupy.py +453 -0
- statgpu/backends/_factory.py +65 -0
- statgpu/backends/_gpu_inference_cupy.py +214 -0
- statgpu/backends/_gpu_inference_torch.py +422 -0
- statgpu/backends/_numpy.py +324 -0
- statgpu/backends/_torch.py +685 -0
- statgpu/backends/_torch_safe.py +47 -0
- statgpu/backends/_utils.py +423 -0
- statgpu/core/__init__.py +10 -0
- statgpu/core/formula/__init__.py +33 -0
- statgpu/core/formula/_design.py +99 -0
- statgpu/core/formula/_parser.py +191 -0
- statgpu/core/formula/_terms.py +70 -0
- statgpu/core/formula/tests/__init__.py +0 -0
- statgpu/core/formula/tests/test_parser.py +194 -0
- statgpu/covariance/__init__.py +6 -0
- statgpu/covariance/_empirical.py +310 -0
- statgpu/covariance/_shrinkage.py +248 -0
- statgpu/cross_validation/__init__.py +31 -0
- statgpu/cross_validation/_base.py +410 -0
- statgpu/cross_validation/_engine.py +167 -0
- statgpu/diagnostics/__init__.py +7 -0
- statgpu/diagnostics/_regression_diagnostics.py +188 -0
- statgpu/feature_selection/__init__.py +24 -0
- statgpu/feature_selection/_knockoff.py +870 -0
- statgpu/feature_selection/_knockoff_utils.py +1003 -0
- statgpu/feature_selection/_stepwise.py +300 -0
- statgpu/glm_core/__init__.py +81 -0
- statgpu/glm_core/_base.py +202 -0
- statgpu/glm_core/_family.py +362 -0
- statgpu/glm_core/_fused.py +149 -0
- statgpu/glm_core/_gamma.py +111 -0
- statgpu/glm_core/_inverse_gaussian.py +62 -0
- statgpu/glm_core/_irls.py +561 -0
- statgpu/glm_core/_logistic.py +82 -0
- statgpu/glm_core/_negative_binomial.py +68 -0
- statgpu/glm_core/_poisson.py +60 -0
- statgpu/glm_core/_solver_legacy.py +100 -0
- statgpu/glm_core/_squared.py +53 -0
- statgpu/glm_core/_tweedie.py +74 -0
- statgpu/inference/__init__.py +239 -0
- statgpu/inference/_distributions_backend.py +2610 -0
- statgpu/inference/_multiple_testing.py +391 -0
- statgpu/inference/_resampling.py +1400 -0
- statgpu/inference/_results.py +265 -0
- statgpu/linear_model/__init__.py +75 -0
- statgpu/linear_model/_gaussian_inference.py +306 -0
- statgpu/linear_model/_glm_base.py +1261 -0
- statgpu/linear_model/_ordered_logit.py +52 -0
- statgpu/linear_model/_ordered_probit.py +50 -0
- statgpu/linear_model/_stats.py +170 -0
- statgpu/linear_model/cv/__init__.py +13 -0
- statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
- statgpu/linear_model/cv/_lasso_cv.py +253 -0
- statgpu/linear_model/cv/_logistic_cv.py +895 -0
- statgpu/linear_model/cv/_ridge_cv.py +1160 -0
- statgpu/linear_model/legacy/__init__.py +1 -0
- statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
- statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
- statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
- statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
- statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
- statgpu/linear_model/legacy/_solver_legacy.py +104 -0
- statgpu/linear_model/penalized/__init__.py +25 -0
- statgpu/linear_model/penalized/_base.py +437 -0
- statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
- statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
- statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
- statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
- statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
- statgpu/linear_model/penalized/_penalized_linear.py +236 -0
- statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
- statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
- statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
- statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
- statgpu/linear_model/penalized/_predict_mixin.py +182 -0
- statgpu/linear_model/wrappers/__init__.py +31 -0
- statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
- statgpu/linear_model/wrappers/_elasticnet.py +75 -0
- statgpu/linear_model/wrappers/_gamma.py +67 -0
- statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
- statgpu/linear_model/wrappers/_lasso.py +2124 -0
- statgpu/linear_model/wrappers/_linear.py +1127 -0
- statgpu/linear_model/wrappers/_logistic.py +1435 -0
- statgpu/linear_model/wrappers/_mcp.py +58 -0
- statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
- statgpu/linear_model/wrappers/_poisson.py +48 -0
- statgpu/linear_model/wrappers/_ridge.py +166 -0
- statgpu/linear_model/wrappers/_scad.py +58 -0
- statgpu/linear_model/wrappers/_tweedie.py +57 -0
- statgpu/metrics/__init__.py +21 -0
- statgpu/metrics/_classification.py +591 -0
- statgpu/nonparametric/__init__.py +50 -0
- statgpu/nonparametric/kernel_methods/__init__.py +25 -0
- statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
- statgpu/nonparametric/kernel_methods/_krr.py +234 -0
- statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
- statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
- statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
- statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
- statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
- statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
- statgpu/nonparametric/splines/__init__.py +5 -0
- statgpu/nonparametric/splines/_bspline_basis.py +336 -0
- statgpu/nonparametric/splines/_penalized.py +349 -0
- statgpu/panel/__init__.py +19 -0
- statgpu/panel/_covariance.py +140 -0
- statgpu/panel/_fixed_effects.py +420 -0
- statgpu/panel/_random_effects.py +385 -0
- statgpu/panel/_utils.py +482 -0
- statgpu/penalties/__init__.py +139 -0
- statgpu/penalties/_adaptive_l1.py +313 -0
- statgpu/penalties/_base.py +261 -0
- statgpu/penalties/_categories.py +39 -0
- statgpu/penalties/_elasticnet.py +98 -0
- statgpu/penalties/_group_lasso.py +678 -0
- statgpu/penalties/_group_mcp.py +553 -0
- statgpu/penalties/_group_scad.py +605 -0
- statgpu/penalties/_l1.py +107 -0
- statgpu/penalties/_l2.py +77 -0
- statgpu/penalties/_mcp.py +237 -0
- statgpu/penalties/_scad.py +260 -0
- statgpu/semiparametric/__init__.py +5 -0
- statgpu/semiparametric/_gam.py +401 -0
- statgpu/solvers/__init__.py +24 -0
- statgpu/solvers/_admm.py +241 -0
- statgpu/solvers/_constants.py +15 -0
- statgpu/solvers/_convergence.py +6 -0
- statgpu/solvers/_fista.py +436 -0
- statgpu/solvers/_fista_bb.py +513 -0
- statgpu/solvers/_fista_lla.py +541 -0
- statgpu/solvers/_lbfgs.py +206 -0
- statgpu/solvers/_newton.py +149 -0
- statgpu/solvers/_utils.py +277 -0
- statgpu/survival/__init__.py +14 -0
- statgpu/survival/_cox.py +3974 -0
- statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
- statgpu/survival/_cox_cv.py +1159 -0
- statgpu/survival/_cox_efron_cuda.py +1280 -0
- statgpu/survival/_cox_efron_triton.py +359 -0
- statgpu/unsupervised/__init__.py +29 -0
- statgpu/unsupervised/_agglomerative.py +307 -0
- statgpu/unsupervised/_dbscan.py +263 -0
- statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
- statgpu/unsupervised/_gmm.py +332 -0
- statgpu/unsupervised/_incremental_pca.py +176 -0
- statgpu/unsupervised/_kmeans.py +261 -0
- statgpu/unsupervised/_minibatch_kmeans.py +299 -0
- statgpu/unsupervised/_minibatch_nmf.py +252 -0
- statgpu/unsupervised/_nmf.py +190 -0
- statgpu/unsupervised/_pca.py +189 -0
- statgpu/unsupervised/_truncated_svd.py +132 -0
- statgpu/unsupervised/_tsne.py +192 -0
- statgpu/unsupervised/_umap.py +224 -0
- statgpu/unsupervised/_utils.py +134 -0
- statgpu-0.1.0.dist-info/METADATA +245 -0
- statgpu-0.1.0.dist-info/RECORD +168 -0
- statgpu-0.1.0.dist-info/WHEEL +5 -0
- statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
- statgpu-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FormulaParser – R-style formula parser wrapping patsy.
|
|
3
|
+
|
|
4
|
+
Provides the ``FormulaParser`` class that converts R-style formulas like
|
|
5
|
+
``"y ~ x1 + x2 + C(sex)"`` into design matrices, using `patsy` internally.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional, Tuple, List, Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FormulaParser:
    """R-style formula parser that builds design matrices via patsy.

    ``patsy`` is imported lazily: constructing a parser, calling
    :meth:`summary` or taking ``repr()`` never imports it, while
    :meth:`eval` and (once a design is stored) :meth:`transform` do.

    Parameters
    ----------
    formula : str
        R-style formula string, e.g. ``"y ~ x1 + x2 + C(sex)"``.

    Attributes
    ----------
    formula : str
        The original formula string.
    design_info : patsy.DesignInfo or None
        Design matrix metadata (column names, term definitions).
        Set after :meth:`eval` is called.
    column_names : list[str] or None
        Names of the columns of the predictor design matrix (the response
        is excluded; an ``Intercept`` column is listed when the formula
        has one). Set after :meth:`eval` is called.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({
    ...     "y": np.random.randn(100),
    ...     "x1": np.random.randn(100),
    ...     "x2": np.random.randn(100),
    ... })
    >>> parser = FormulaParser("y ~ x1 + x2")
    >>> y, X, info = parser.eval(df)
    >>> parser.column_names
    ['Intercept', 'x1', 'x2']
    """

    def __init__(self, formula: str):
        self.formula = formula
        # Both stay None until eval() has been run successfully.
        self._design_info = None
        self._y_names: Optional[List[str]] = None

    @property
    def design_info(self):
        """Design matrix metadata, available after :meth:`eval`."""
        return self._design_info

    @property
    def column_names(self) -> Optional[List[str]]:
        """Predictor column names, available after :meth:`eval`.

        Returns ``None`` before :meth:`eval`; afterwards a fresh list is
        built on every access, so callers may mutate the result freely.
        """
        if self._design_info is None:
            return None
        return list(self._design_info.column_names)

    def _require_patsy(self):
        """Return the patsy module or raise ImportError with guidance.

        Raises
        ------
        ImportError
            If patsy cannot be imported. The original import failure is
            attached as ``__cause__`` so the root cause stays visible.
        """
        try:
            import patsy
        except ImportError as err:
            raise ImportError(
                "The 'patsy' package is required for formula-based model fitting. "
                "Install it with: pip install statgpu[formula] "
                "or: pip install patsy"
            ) from err
        return patsy

    def eval(
        self,
        data: pd.DataFrame,
        eval_env: int = 0,
    ) -> Tuple[np.ndarray, np.ndarray, Any]:
        """Parse formula and build design matrices from a DataFrame.

        On success the response column names and the predictor design
        metadata are stored on the parser for later use by
        :meth:`transform` and :meth:`summary`.

        Parameters
        ----------
        data : pd.DataFrame
            DataFrame containing the columns referenced in the formula.
            A copy is handed to patsy, not the caller's object.
        eval_env : int, default=0
            Evaluation frame depth for patsy name resolution, counted
            from the caller of this method.

        Returns
        -------
        y : ndarray of shape (n_obs,) or (n_obs, n_responses)
            Response variable(s). A single-column response is flattened
            to one dimension.
        X : ndarray of shape (n_obs, n_predictors)
            Predictor design matrix.
        design_info : patsy.DesignInfo
            Metadata for the predictor design (column names, term info).

        Raises
        ------
        ImportError
            If patsy is not installed.
        """
        patsy = self._require_patsy()
        data = data.copy()

        # One extra level is added to the depth so that it is measured from
        # the caller of eval() rather than from this frame.
        y, X = patsy.dmatrices(
            self.formula,
            data,
            eval_env=eval_env + 1,
            return_type="matrix",
        )

        self._y_names = list(y.design_info.column_names)
        self._design_info = X.design_info

        y_arr = np.asarray(y)
        if y_arr.ndim == 2 and y_arr.shape[1] == 1:
            y_arr = y_arr.ravel()
        X_arr = np.asarray(X)

        return y_arr, X_arr, X.design_info

    def transform(
        self,
        new_data: pd.DataFrame,
        eval_env: int = 0,
    ) -> np.ndarray:
        """Build a design matrix for new data using the stored design_info.

        Used during :meth:`predict` to ensure new data is encoded
        with the same column structure (including categorical coding)
        as the training data.

        Parameters
        ----------
        new_data : pd.DataFrame
            DataFrame with the same columns as the training data.
        eval_env : int, default=0
            Not used by this method: the stored design is applied as-is.
            The parameter is kept so existing callers that pass it keep
            working.

        Returns
        -------
        X_new : ndarray of shape (n_new_obs, n_predictors)
            Design matrix aligned with the training design.

        Raises
        ------
        RuntimeError
            If :meth:`eval` has not been called yet (no design_info available).
        ImportError
            If patsy is not installed.

        Notes
        -----
        Errors raised by patsy while building the matrix (for example when
        ``new_data`` does not match the training structure) propagate
        unchanged.
        """
        # Checked before importing patsy so the "not evaluated" error is
        # reported even when patsy is unavailable.
        if self._design_info is None:
            raise RuntimeError(
                "Cannot transform: no design_info available. "
                "Call eval() first on training data."
            )

        patsy = self._require_patsy()

        X_new = patsy.build_design_matrices(
            [self._design_info],
            new_data,
            return_type="matrix",
        )[0]

        return np.asarray(X_new)

    def summary(self) -> str:
        """Return a human-readable summary of the formula parsing.

        Shows the formula string, response variables, predictor names,
        and term definitions (useful for debugging categorical encoding).
        Before :meth:`eval` only the formula and a "not yet evaluated"
        note are reported.
        """
        lines = [f"Formula: {self.formula}"]

        if self._design_info is None:
            lines.append("(Not yet evaluated. Call eval() to parse.)")
            return "\n".join(lines)

        # column_names builds a new list per access; fetch it once.
        names = self.column_names

        lines.append(f"Response: {self._y_names}")
        lines.append(f"Predictors ({len(names)}):")
        lines.extend(f" - {name}" for name in names)

        lines.append("\nTerms:")
        lines.extend(f" {term}" for term in self._design_info.term_name_slices)

        return "\n".join(lines)

    def __repr__(self) -> str:
        evaluated = "evaluated" if self._design_info is not None else "pending"
        return f"FormulaParser({self.formula!r}, {evaluated})"
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Formula term helpers and custom evaluation environments.
|
|
3
|
+
|
|
4
|
+
Patsy natively supports R-style formula terms:
|
|
5
|
+
|
|
6
|
+
- ``C(var)`` — treat as categorical (treatment/dummy coding by default, e.g. ``C(var)[T.B]``)
|
|
7
|
+
- ``np.func(var)`` — apply numpy function (e.g. ``np.log(x)``)
|
|
8
|
+
- ``x1:x2`` — interaction only
|
|
9
|
+
- ``x1*x2`` — main effects + interaction
|
|
10
|
+
- ``x1 + x2`` — additive
|
|
11
|
+
- ``x1 + x2 - 1`` — additive without intercept
|
|
12
|
+
- ``np.log(x)`` — transformations
|
|
13
|
+
|
|
14
|
+
This module provides helper functions for constructing custom
|
|
15
|
+
patsy evaluation environments, needed for model-specific syntax
|
|
16
|
+
like ``Surv(time, event)`` in Cox PH models.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from typing import Dict, Any, Optional
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _surv(time, event):
|
|
25
|
+
"""Survival function for patsy formula parsing.
|
|
26
|
+
|
|
27
|
+
Mimics R's survival::Surv() function for use in patsy formulas::
|
|
28
|
+
|
|
29
|
+
"Surv(time, event) ~ x1 + x2"
|
|
30
|
+
|
|
31
|
+
Parameters
|
|
32
|
+
----------
|
|
33
|
+
time : array-like
|
|
34
|
+
Survival/follow-up times.
|
|
35
|
+
event : array-like
|
|
36
|
+
Event indicator (1 = event occurred, 0 = censored).
|
|
37
|
+
|
|
38
|
+
Returns
|
|
39
|
+
-------
|
|
40
|
+
result : ndarray of shape (n, 2)
|
|
41
|
+
Column 0: time, Column 1: event.
|
|
42
|
+
"""
|
|
43
|
+
time = np.asarray(time, dtype=np.float64).ravel()
|
|
44
|
+
event = np.asarray(event, dtype=np.float64).ravel()
|
|
45
|
+
|
|
46
|
+
if len(time) != len(event):
|
|
47
|
+
raise ValueError(
|
|
48
|
+
f"time ({len(time)} elements) and event ({len(event)} elements) "
|
|
49
|
+
"must have the same length."
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
return np.column_stack([time, event])
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def make_surv_env() -> Dict[str, Any]:
    """Build the extra namespace that makes ``Surv`` usable in formulas.

    Returns
    -------
    env : dict
        A new plain ``dict`` on every call, with the single key
        ``"Surv"`` bound to the module-private ``_surv`` helper.

    Notes
    -----
    The return value is an ordinary mapping, not a patsy
    ``EvalEnvironment``. One way to expose it to a formula is presumably
    ``patsy.EvalEnvironment.capture().with_outer_namespace(env)`` —
    confirm against the installed patsy version.

    Examples
    --------
    >>> from statgpu.core.formula._terms import make_surv_env
    >>> env = make_surv_env()
    >>> sorted(env)
    ['Surv']
    """
    namespace: Dict[str, Any] = {}
    namespace["Surv"] = _surv
    return namespace
|
|
File without changes
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for statgpu.core.formula module.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pytest.fixture
def sample_df():
    """Seeded 200-row frame: response ``y``, numeric ``x1``/``x2``, categorical ``cat``."""
    np.random.seed(42)
    n_rows = 200
    # Dict literal order fixes the order of the random draws: y, x1, x2, cat.
    columns = {
        "y": np.random.randn(n_rows),
        "x1": np.random.randn(n_rows),
        "x2": np.random.randn(n_rows),
        "cat": pd.Categorical(np.random.choice(["A", "B", "C"], n_rows)),
    }
    return pd.DataFrame(columns)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestFormulaParserBasic:
    """Parsing checks for the basic formula operators."""

    def test_simple_formula(self, sample_df):
        """Additive formula: 1-D response, intercept plus two predictors."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + x2")
        response, design, _ = fp.eval(sample_df)

        assert response.shape == (200,)
        assert design.shape == (200, 3)  # Intercept, x1, x2
        assert fp.column_names == ["Intercept", "x1", "x2"]

    def test_no_intercept(self, sample_df):
        """``- 1`` removes the intercept column."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + x2 - 1")
        _, design, _ = fp.eval(sample_df)

        assert design.shape == (200, 2)
        assert fp.column_names == ["x1", "x2"]

    def test_categorical_encoding(self, sample_df):
        """``C()`` expands a categorical column into indicator columns."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + C(cat)")
        _, design, _ = fp.eval(sample_df)

        # Expected columns: Intercept, x1, cat[T.B], cat[T.C]
        assert design.shape[1] == 4
        names = fp.column_names
        assert "x1" in names
        assert any("cat" in col for col in names)

    def test_interaction(self, sample_df):
        """``x1:x2`` adds a single interaction column."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + x2 + x1:x2")
        _, design, _ = fp.eval(sample_df)

        assert design.shape[1] == 4  # Intercept, x1, x2, x1:x2

    def test_star_operator(self, sample_df):
        """``x1 * x2`` expands to main effects plus interaction."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 * x2")
        _, design, _ = fp.eval(sample_df)

        assert design.shape[1] == 4  # Intercept, x1, x2, x1:x2

    def test_transform(self, sample_df):
        """Numpy calls inside the formula are evaluated as transformations."""
        from statgpu.core.formula import FormulaParser

        # ``np`` is resolved from this module's namespace by the parser.
        fp = FormulaParser("y ~ np.log(np.abs(x1)) + x2")
        response, design, _ = fp.eval(sample_df)

        assert response.shape == (200,)
        assert design.shape[1] == 3  # Intercept, transformed x1, x2
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TestFormulaParserTransform:
    """Checks for ``FormulaParser.transform`` (predict-time encoding)."""

    def test_transform_new_data(self, sample_df):
        """New rows with the training columns get the training layout."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + x2")
        fp.eval(sample_df)

        fresh = pd.DataFrame({"x1": [0.5, -0.3], "x2": [1.2, 0.8]})
        design_new = fp.transform(fresh)

        assert design_new.shape == (2, 3)  # two rows; Intercept, x1, x2

    def test_transform_with_categorical(self, sample_df):
        """Categorical coding learned in ``eval`` is reused on new data."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + C(cat)")
        fp.eval(sample_df)

        fresh = pd.DataFrame({"x1": [0.5], "cat": pd.Categorical(["A"])})
        design_new = fp.transform(fresh)

        assert design_new.shape == (1, 4)  # Intercept, x1, cat[B], cat[C]

    def test_transform_no_design_info(self):
        """``transform`` before ``eval`` raises RuntimeError."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1")
        fresh = pd.DataFrame({"x1": [1.0]})

        with pytest.raises(RuntimeError, match="no design_info available"):
            fp.transform(fresh)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class TestParseFormulaSafe:
    """Checks for the formula / array dispatch in ``parse_formula_safe``."""

    def test_formula_path(self, sample_df):
        """A formula plus data returns a response and design metadata."""
        from statgpu.core.formula import parse_formula_safe

        response, _, design_info = parse_formula_safe("y ~ x1", data=sample_df)

        assert response.shape == (200,)
        assert design_info is not None

    def test_array_path(self, sample_df):
        """Without a formula the given arrays come back with no metadata."""
        from statgpu.core.formula import parse_formula_safe

        features = sample_df[["x1", "x2"]].values
        target = sample_df["y"].values
        target_out, features_out, design_info = parse_formula_safe(
            None, None, X=features, y=target
        )

        assert design_info is None
        np.testing.assert_array_equal(target_out, target)
        np.testing.assert_array_equal(features_out, features)

    def test_formula_without_data_raises(self):
        """A formula with no data is rejected with ValueError."""
        from statgpu.core.formula import parse_formula_safe

        with pytest.raises(ValueError, match="data"):
            parse_formula_safe("y ~ x1", None)

    def test_no_input_raises(self):
        """Supplying neither formula nor arrays is rejected with ValueError."""
        from statgpu.core.formula import parse_formula_safe

        with pytest.raises(ValueError, match="Either formula"):
            parse_formula_safe(None, None)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class TestFormulaParserSummary:
    """Checks for the text produced by ``FormulaParser.summary()``."""

    def test_summary_before_eval(self, sample_df):
        """Before ``eval`` the summary reports the formula and a pending note."""
        from statgpu.core.formula import FormulaParser

        text = FormulaParser("y ~ x1 + x2").summary()

        assert "y ~ x1 + x2" in text
        assert "pending" in text.lower() or "Not yet evaluated" in text

    def test_summary_after_eval(self, sample_df):
        """After ``eval`` the summary lists the predictors and their count."""
        from statgpu.core.formula import FormulaParser

        fp = FormulaParser("y ~ x1 + x2")
        fp.eval(sample_df)
        text = fp.summary()

        for fragment in ("y ~ x1 + x2", "x1", "x2", "Predictors (3)"):
            assert fragment in text
|