statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. statgpu/__init__.py +174 -0
  2. statgpu/_base.py +544 -0
  3. statgpu/_config.py +127 -0
  4. statgpu/anova/__init__.py +5 -0
  5. statgpu/anova/_oneway.py +194 -0
  6. statgpu/backends/__init__.py +83 -0
  7. statgpu/backends/_array_ops.py +529 -0
  8. statgpu/backends/_base.py +184 -0
  9. statgpu/backends/_cupy.py +453 -0
  10. statgpu/backends/_factory.py +65 -0
  11. statgpu/backends/_gpu_inference_cupy.py +214 -0
  12. statgpu/backends/_gpu_inference_torch.py +422 -0
  13. statgpu/backends/_numpy.py +324 -0
  14. statgpu/backends/_torch.py +685 -0
  15. statgpu/backends/_torch_safe.py +47 -0
  16. statgpu/backends/_utils.py +423 -0
  17. statgpu/core/__init__.py +10 -0
  18. statgpu/core/formula/__init__.py +33 -0
  19. statgpu/core/formula/_design.py +99 -0
  20. statgpu/core/formula/_parser.py +191 -0
  21. statgpu/core/formula/_terms.py +70 -0
  22. statgpu/core/formula/tests/__init__.py +0 -0
  23. statgpu/core/formula/tests/test_parser.py +194 -0
  24. statgpu/covariance/__init__.py +6 -0
  25. statgpu/covariance/_empirical.py +310 -0
  26. statgpu/covariance/_shrinkage.py +248 -0
  27. statgpu/cross_validation/__init__.py +31 -0
  28. statgpu/cross_validation/_base.py +410 -0
  29. statgpu/cross_validation/_engine.py +167 -0
  30. statgpu/diagnostics/__init__.py +7 -0
  31. statgpu/diagnostics/_regression_diagnostics.py +188 -0
  32. statgpu/feature_selection/__init__.py +24 -0
  33. statgpu/feature_selection/_knockoff.py +870 -0
  34. statgpu/feature_selection/_knockoff_utils.py +1003 -0
  35. statgpu/feature_selection/_stepwise.py +300 -0
  36. statgpu/glm_core/__init__.py +81 -0
  37. statgpu/glm_core/_base.py +202 -0
  38. statgpu/glm_core/_family.py +362 -0
  39. statgpu/glm_core/_fused.py +149 -0
  40. statgpu/glm_core/_gamma.py +111 -0
  41. statgpu/glm_core/_inverse_gaussian.py +62 -0
  42. statgpu/glm_core/_irls.py +561 -0
  43. statgpu/glm_core/_logistic.py +82 -0
  44. statgpu/glm_core/_negative_binomial.py +68 -0
  45. statgpu/glm_core/_poisson.py +60 -0
  46. statgpu/glm_core/_solver_legacy.py +100 -0
  47. statgpu/glm_core/_squared.py +53 -0
  48. statgpu/glm_core/_tweedie.py +74 -0
  49. statgpu/inference/__init__.py +239 -0
  50. statgpu/inference/_distributions_backend.py +2610 -0
  51. statgpu/inference/_multiple_testing.py +391 -0
  52. statgpu/inference/_resampling.py +1400 -0
  53. statgpu/inference/_results.py +265 -0
  54. statgpu/linear_model/__init__.py +75 -0
  55. statgpu/linear_model/_gaussian_inference.py +306 -0
  56. statgpu/linear_model/_glm_base.py +1261 -0
  57. statgpu/linear_model/_ordered_logit.py +52 -0
  58. statgpu/linear_model/_ordered_probit.py +50 -0
  59. statgpu/linear_model/_stats.py +170 -0
  60. statgpu/linear_model/cv/__init__.py +13 -0
  61. statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
  62. statgpu/linear_model/cv/_lasso_cv.py +253 -0
  63. statgpu/linear_model/cv/_logistic_cv.py +895 -0
  64. statgpu/linear_model/cv/_ridge_cv.py +1160 -0
  65. statgpu/linear_model/legacy/__init__.py +1 -0
  66. statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
  67. statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
  68. statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
  69. statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
  70. statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
  71. statgpu/linear_model/legacy/_solver_legacy.py +104 -0
  72. statgpu/linear_model/penalized/__init__.py +25 -0
  73. statgpu/linear_model/penalized/_base.py +437 -0
  74. statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
  75. statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
  76. statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
  77. statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
  78. statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
  79. statgpu/linear_model/penalized/_penalized_linear.py +236 -0
  80. statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
  81. statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
  82. statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
  83. statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
  84. statgpu/linear_model/penalized/_predict_mixin.py +182 -0
  85. statgpu/linear_model/wrappers/__init__.py +31 -0
  86. statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
  87. statgpu/linear_model/wrappers/_elasticnet.py +75 -0
  88. statgpu/linear_model/wrappers/_gamma.py +67 -0
  89. statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
  90. statgpu/linear_model/wrappers/_lasso.py +2124 -0
  91. statgpu/linear_model/wrappers/_linear.py +1127 -0
  92. statgpu/linear_model/wrappers/_logistic.py +1435 -0
  93. statgpu/linear_model/wrappers/_mcp.py +58 -0
  94. statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
  95. statgpu/linear_model/wrappers/_poisson.py +48 -0
  96. statgpu/linear_model/wrappers/_ridge.py +166 -0
  97. statgpu/linear_model/wrappers/_scad.py +58 -0
  98. statgpu/linear_model/wrappers/_tweedie.py +57 -0
  99. statgpu/metrics/__init__.py +21 -0
  100. statgpu/metrics/_classification.py +591 -0
  101. statgpu/nonparametric/__init__.py +50 -0
  102. statgpu/nonparametric/kernel_methods/__init__.py +25 -0
  103. statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
  104. statgpu/nonparametric/kernel_methods/_krr.py +234 -0
  105. statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
  106. statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
  107. statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
  108. statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
  109. statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
  110. statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
  111. statgpu/nonparametric/splines/__init__.py +5 -0
  112. statgpu/nonparametric/splines/_bspline_basis.py +336 -0
  113. statgpu/nonparametric/splines/_penalized.py +349 -0
  114. statgpu/panel/__init__.py +19 -0
  115. statgpu/panel/_covariance.py +140 -0
  116. statgpu/panel/_fixed_effects.py +420 -0
  117. statgpu/panel/_random_effects.py +385 -0
  118. statgpu/panel/_utils.py +482 -0
  119. statgpu/penalties/__init__.py +139 -0
  120. statgpu/penalties/_adaptive_l1.py +313 -0
  121. statgpu/penalties/_base.py +261 -0
  122. statgpu/penalties/_categories.py +39 -0
  123. statgpu/penalties/_elasticnet.py +98 -0
  124. statgpu/penalties/_group_lasso.py +678 -0
  125. statgpu/penalties/_group_mcp.py +553 -0
  126. statgpu/penalties/_group_scad.py +605 -0
  127. statgpu/penalties/_l1.py +107 -0
  128. statgpu/penalties/_l2.py +77 -0
  129. statgpu/penalties/_mcp.py +237 -0
  130. statgpu/penalties/_scad.py +260 -0
  131. statgpu/semiparametric/__init__.py +5 -0
  132. statgpu/semiparametric/_gam.py +401 -0
  133. statgpu/solvers/__init__.py +24 -0
  134. statgpu/solvers/_admm.py +241 -0
  135. statgpu/solvers/_constants.py +15 -0
  136. statgpu/solvers/_convergence.py +6 -0
  137. statgpu/solvers/_fista.py +436 -0
  138. statgpu/solvers/_fista_bb.py +513 -0
  139. statgpu/solvers/_fista_lla.py +541 -0
  140. statgpu/solvers/_lbfgs.py +206 -0
  141. statgpu/solvers/_newton.py +149 -0
  142. statgpu/solvers/_utils.py +277 -0
  143. statgpu/survival/__init__.py +14 -0
  144. statgpu/survival/_cox.py +3974 -0
  145. statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
  146. statgpu/survival/_cox_cv.py +1159 -0
  147. statgpu/survival/_cox_efron_cuda.py +1280 -0
  148. statgpu/survival/_cox_efron_triton.py +359 -0
  149. statgpu/unsupervised/__init__.py +29 -0
  150. statgpu/unsupervised/_agglomerative.py +307 -0
  151. statgpu/unsupervised/_dbscan.py +263 -0
  152. statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
  153. statgpu/unsupervised/_gmm.py +332 -0
  154. statgpu/unsupervised/_incremental_pca.py +176 -0
  155. statgpu/unsupervised/_kmeans.py +261 -0
  156. statgpu/unsupervised/_minibatch_kmeans.py +299 -0
  157. statgpu/unsupervised/_minibatch_nmf.py +252 -0
  158. statgpu/unsupervised/_nmf.py +190 -0
  159. statgpu/unsupervised/_pca.py +189 -0
  160. statgpu/unsupervised/_truncated_svd.py +132 -0
  161. statgpu/unsupervised/_tsne.py +192 -0
  162. statgpu/unsupervised/_umap.py +224 -0
  163. statgpu/unsupervised/_utils.py +134 -0
  164. statgpu-0.1.0.dist-info/METADATA +245 -0
  165. statgpu-0.1.0.dist-info/RECORD +168 -0
  166. statgpu-0.1.0.dist-info/WHEEL +5 -0
  167. statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
  168. statgpu-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1261 @@
1
+ """
2
+ Generalized Linear Model base classes.
3
+
4
+ Uses sklearn pattern: base class with subclasses overriding _get_family()
5
+ and, when needed, the family-to-GLM-loss mapping.
6
+ Supports IRLS (smooth penalty) and FISTA (any penalty) solvers.
7
+ """
8
+
9
+ from typing import Optional, Union, Dict
10
+ import numpy as np
11
+
12
+
13
+ def _parse_formula_if_provided(formula, data, X, y):
14
+ """Parse formula+data or fall back to raw arrays. Returns (y, X, info)."""
15
+ if formula is not None:
16
+ from statgpu.core.formula import parse_formula
17
+ return parse_formula(formula, data)
18
+ y = np.asarray(y)
19
+ if y.ndim == 2 and y.shape[1] == 1:
20
+ y = y.ravel()
21
+ return y, np.asarray(X), None
22
+
23
+ from statgpu._base import BaseEstimator
24
+ from statgpu._config import Device
25
+ from statgpu.backends import _to_numpy, _resolve_backend
26
+ from statgpu.glm_core._irls import IRLSSolver
27
+ from statgpu.solvers import fista_solver
28
+ from statgpu.glm_core._family import (
29
+ Gaussian,
30
+ Binomial,
31
+ Poisson,
32
+ Gamma,
33
+ InverseGaussian,
34
+ NegativeBinomial,
35
+ Tweedie,
36
+ )
37
+
38
+
39
def _np_compat_xp(arr):
    """Pick a numpy-style array module for ``arr``.

    The backend is resolved with ``_resolve_backend("auto", arr)``. Only a
    resolved ``"cupy"`` backend yields the module from ``_get_xp("cupy")``;
    every other result (torch included) falls back to ``numpy``. Intended
    for code that needs numpy-style indexing (e.g., ordered model).
    """
    from statgpu.backends._utils import _get_xp

    is_cupy = _resolve_backend("auto", arr) == "cupy"
    return _get_xp("cupy") if is_cupy else np
50
+
51
+
52
+ def _torch_promoted_float_dtype(X, y):
53
+ """Return a floating dtype that can safely combine Torch X and y."""
54
+ import torch
55
+
56
+ x_dtype = X.dtype if X.is_floating_point() else torch.float64
57
+ y_is_float = getattr(y, "is_floating_point", lambda: False)()
58
+ y_dtype = y.dtype if y_is_float else torch.float64
59
+ return torch.promote_types(x_dtype, y_dtype)
60
+
61
+
62
def _add_intercept_column(X, backend_name):
    """Return ``X`` with a leading column of ones (numpy/cupy/torch).

    The ones column has shape ``(n, 1)`` and the dtype of ``X``; it is built
    by ``xp_ones`` with ``X`` passed as the reference array, then stacked in
    front of ``X`` with the backend module's ``column_stack``.
    """
    from statgpu.backends._utils import _get_xp, xp_ones

    array_module = _get_xp(backend_name)
    intercept_col = xp_ones(
        (X.shape[0], 1), dtype=X.dtype, xp=array_module, ref_arr=X
    )
    return array_module.column_stack([intercept_col, X])
69
+
70
+
71
+ class GeneralizedLinearModel(BaseEstimator):
72
+ """GLM base class with shared IRLS + FISTA paths.
73
+
74
+ Subclasses override _get_family() and optionally the GLM loss mapping.
75
+
76
+ Parameters
77
+ ----------
78
+ family : str, default='gaussian'
79
+ Distribution family: 'gaussian', 'binomial', 'poisson', 'gamma', 'inverse_gaussian', 'negative_binomial', or 'tweedie'.
80
+ fit_intercept : bool, default=True
81
+ Whether to calculate the intercept.
82
+ max_iter : int, default=100
83
+ Maximum iterations.
84
+ tol : float, default=1e-4
85
+ Convergence tolerance.
86
+ C : float, default=1.0
87
+ Inverse regularization strength (for IRLS L2).
88
+ device : str or Device, default='auto'
89
+ solver : str, default='auto'
90
+ 'auto', 'irls', 'fista', 'newton', or 'lbfgs'.
91
+ """
92
+
93
def __init__(
    self,
    family: str = "gaussian",
    fit_intercept: bool = True,
    max_iter: int = 100,
    tol: float = 1e-4,
    C: float = 1.0,
    device: Union[str, Device] = Device.AUTO,
    n_jobs: Optional[int] = None,
    solver: str = "auto",
    gpu_memory_cleanup: bool = False,
):
    """Store the hyper-parameters and reset all fitted state to ``None``."""
    super().__init__(device=device, n_jobs=n_jobs)

    # Constructor arguments are stored verbatim.
    self.family = family
    self.fit_intercept = fit_intercept
    self.max_iter = max_iter
    self.tol = tol
    self.C = C
    self.solver = solver
    self.gpu_memory_cleanup = gpu_memory_cleanup

    # Fitted / derived state, populated later by fit().
    # ``_use_intercept`` is the formula-derived override; None means
    # "use fit_intercept".
    for unset_attr in (
        "coef_",
        "intercept_",
        "n_iter_",
        "_nobs",
        "_df_resid",
        "_params",
        "_feature_names",
        "_design_info",
        "_formula_has_intercept",
        "_use_intercept",
    ):
        setattr(self, unset_attr, None)
124
+
125
+ @property
126
+ def _effective_intercept(self):
127
+ """Return the effective intercept flag.
128
+
129
+ When formula is used, the formula's intercept semantics take priority
130
+ (stored in ``_use_intercept``). Otherwise, ``fit_intercept`` is used.
131
+ This avoids mutating ``fit_intercept`` which would break ``sklearn.clone``.
132
+ """
133
+ if self._use_intercept is not None:
134
+ return self._use_intercept
135
+ return self.fit_intercept
136
+
137
def _get_family(self):
    """Build the GLM family object for ``self.family``. Override in subclass.

    The family class is instantiated with the keyword arguments returned by
    ``_get_loss_kwargs()``.

    Raises
    ------
    ValueError
        If ``self.family`` is not one of the supported family names.
    """
    family_classes = {
        "gaussian": Gaussian,
        "binomial": Binomial,
        "poisson": Poisson,
        "gamma": Gamma,
        "inverse_gaussian": InverseGaussian,
        "negative_binomial": NegativeBinomial,
        "tweedie": Tweedie,
    }
    family_cls = family_classes.get(self.family)
    if family_cls is None:
        raise ValueError(
            f"Unknown family '{self.family}'. "
            f"Supported families: {list(family_classes.keys())}"
        )
    return family_cls(**self._get_loss_kwargs())
155
+
156
+ def _get_penalty_alpha(self):
157
+ """L2 regularization alpha for IRLS: lambda = 1/(2*C)."""
158
+ return 1.0 / (2.0 * self.C) if self.C > 0 else 0.0
159
+
160
+ def _cleanup_cuda_memory(self):
161
+ """Best-effort CuPy memory pool cleanup."""
162
+ if not bool(self.gpu_memory_cleanup):
163
+ return
164
+ try:
165
+ import cupy as cp
166
+ cp.get_default_memory_pool().free_all_blocks()
167
+ cp.get_default_pinned_memory_pool().free_all_blocks()
168
+ except Exception:
169
+ pass
170
+
171
+ def _cleanup_torch_memory(self):
172
+ """Best-effort Torch CUDA memory cleanup."""
173
+ if not bool(self.gpu_memory_cleanup):
174
+ return
175
+ try:
176
+ import torch
177
+ if torch.cuda.is_available():
178
+ torch.cuda.empty_cache()
179
+ except Exception:
180
+ pass
181
+
182
+ def _cleanup_backend_memory(self, backend_name):
183
+ if backend_name == "cupy":
184
+ self._cleanup_cuda_memory()
185
+ elif backend_name == "torch":
186
+ self._cleanup_torch_memory()
187
+
188
+ def __del__(self):
189
+ try:
190
+ self._cleanup_cuda_memory()
191
+ self._cleanup_torch_memory()
192
+ except Exception:
193
+ pass
194
+
195
def fit(self, X=None, y=None, sample_weight=None, formula=None, data=None):
    """Fit GLM model.

    Parameters
    ----------
    X : array-like or None
        Predictor matrix. Required if ``formula`` is None.
    y : array-like or None
        Response vector. Required if ``formula`` is None.
    sample_weight : array-like or None
        Sample weights. Not supported by the 'newton' and 'lbfgs' solvers.
    formula : str or None
        R-style formula string (e.g. ``"y ~ x1 + x2"``).
    data : pd.DataFrame or None
        DataFrame used with ``formula`` for column lookup.

    Returns
    -------
    self
        The fitted estimator.

    Raises
    ------
    ValueError
        If ``formula`` is given without ``data``, if neither
        ``formula``+``data`` nor ``X``+``y`` is supplied, or if ``solver``
        is not one of the supported names.
    """
    if formula is not None:
        if data is None:
            raise ValueError(
                "formula was provided but data is None. "
                "Pass data=your_dataframe when using formula."
            )
        y_arr, X_arr, design_info = _parse_formula_if_provided(
            formula, data, None, None
        )
        self._design_info = design_info
        formula_column_names = list(design_info.column_names)
        has_intercept = "Intercept" in formula_column_names
        self._formula_has_intercept = has_intercept
        self._feature_names = [
            name for name in formula_column_names if name != "Intercept"
        ]
        if has_intercept:
            # The solvers add their own intercept column, so drop the one
            # produced by the formula parser.
            intercept_idx = formula_column_names.index("Intercept")
            X_arr = np.delete(X_arr, intercept_idx, axis=1)
        # Formula syntax owns intercept semantics, matching statsmodels/R.
        # The decision is stored in an internal attribute to avoid mutating
        # self.fit_intercept (which would break sklearn clone).
        self._use_intercept = has_intercept
    else:
        if X is None or y is None:
            raise ValueError(
                "Either formula+data or X+y must be provided."
            )
        # Clear any formula state left over from a previous fit.
        self._feature_names = None
        self._design_info = None
        self._formula_has_intercept = None
        self._use_intercept = None
        # Reuse the shared helper so the raw-array coercion (including the
        # flattening of an (n, 1) response) lives in one place.
        y_arr, X_arr, _ = _parse_formula_if_provided(None, None, X, y)

    backend = self._get_backend(backend="auto")
    backend_name = backend.name

    # Convert to backend arrays using xp_asarray for proper device placement
    from statgpu.backends._utils import _get_xp, xp_asarray
    xp = _get_xp(backend_name)
    _ref = None
    if backend_name == "torch":
        import torch
        # Prefer CUDA for the reference tensor, but fall back to CPU so a
        # CPU-only torch install does not fail while building it.
        torch_device = "cuda" if torch.cuda.is_available() else "cpu"
        _ref = torch.empty(0, dtype=torch.float64, device=torch_device)
    X_arr = xp_asarray(X_arr, dtype=xp.float64, xp=xp, ref_arr=_ref)
    y_arr = xp_asarray(y_arr, dtype=xp.float64, xp=xp, ref_arr=_ref)
    self._nobs = X_arr.shape[0]

    family = self._get_family()
    _solver_lower = self.solver.lower() if isinstance(self.solver, str) else self.solver
    if _solver_lower == "auto":
        # Heuristic: IRLS for smooth/no penalties, FISTA for non-smooth
        non_smooth_penalties = (
            "l1", "scad", "mcp", "adaptive_l1", "adaptive_lasso",
            "group_lasso", "group_mcp", "group_scad",
        )
        _pen = getattr(self, "_penalty", None)
        _pname = str(getattr(_pen, "name", "none")).lower() if _pen is not None else "none"
        solver_name = "fista" if _pname in non_smooth_penalties else "irls"
    else:
        solver_name = _solver_lower

    if solver_name == "irls":
        self._fit_irls(X_arr, y_arr, sample_weight, family, backend_name)
    elif solver_name == "fista":
        self._fit_fista(X_arr, y_arr, sample_weight, family, backend_name)
    elif solver_name in ("newton", "lbfgs"):
        self._fit_smooth_solver(
            X_arr, y_arr, sample_weight, solver_name, backend_name
        )
    else:
        raise ValueError(
            "solver must be one of: 'auto', 'irls', 'fista', 'newton', 'lbfgs'"
        )

    self._fitted = True
    self._cleanup_backend_memory(backend_name)
    return self
293
+
294
def _fit_irls(self, X, y, sample_weight, family, backend_name="numpy"):
    """Fit with the IRLS solver and store the resulting parameters.

    Sets ``n_iter_``, ``_params`` (the solver's raw output), ``coef_``,
    ``intercept_`` and ``_df_resid``. When an intercept is in effect it is
    the first column of the design matrix and is excluded from the ridge
    penalty.
    """
    with_intercept = self._effective_intercept
    n_features = X.shape[1]

    # IRLSSolver works on the unnormalized normal equations
    # (X'WX + lambda I) whereas _get_penalty_alpha() is expressed for the
    # normalized objective, so multiply by n to keep C's meaning consistent.
    scaled_ridge = X.shape[0] * self._get_penalty_alpha()

    design = _add_intercept_column(X, backend_name) if with_intercept else X

    irls = IRLSSolver(family, max_iter=self.max_iter, tol=self.tol)
    params, n_iter = irls.fit(
        design,
        y,
        sample_weight=sample_weight,
        ridge_alpha=scaled_ridge,
        ridge_penalize_intercept=not with_intercept,
        backend=backend_name,
    )

    self.n_iter_ = n_iter
    self._params = params

    # The solver may hand back a cupy/torch array; split it on the host.
    host_params = _to_numpy(params)
    if with_intercept:
        self.intercept_ = float(host_params[0])
        self.coef_ = host_params[1:]
    else:
        self.intercept_ = 0.0
        self.coef_ = host_params.copy()

    n_params = n_features + (1 if with_intercept else 0)
    self._df_resid = self._nobs - n_params
329
+
330
def _fit_fista(self, X, y, sample_weight, family, backend_name="numpy"):
    """Fit using FISTA, passing ``L2Penalty(alpha=0.0)`` as the penalty.

    Three paths are taken depending on the intercept flag and the loss:

    * No intercept: the solver runs directly on ``X``. For the gamma
      family with ``link == "inverse_power"`` a data-driven starting
      vector is built first (per backend); otherwise no start is given.
    * Intercept with a loss other than ``"squared_error"``: a column of
      ones is appended as the LAST column of ``X`` so the intercept is
      optimized jointly with the coefficients, and the intercept start is
      chosen per family.
    * Intercept with ``"squared_error"``: ``X`` and ``y`` are mean-centered,
      the solver runs on the centered data, and the intercept is recovered
      afterwards from the means.

    Sets ``coef_``, ``intercept_``, ``n_iter_``, ``_params`` and
    ``_df_resid``. The ``family`` argument is not read in this method; the
    loss is derived from ``self.family_to_loss()``.
    """
    from statgpu.glm_core import get_glm_loss
    from statgpu.penalties._l2 import L2Penalty

    loss_kwargs = self._get_loss_kwargs()
    loss = get_glm_loss(self.family_to_loss(), **loss_kwargs)

    # ------------------------------------------------------------------
    # Path 1: no intercept.
    # ------------------------------------------------------------------
    if not self._effective_intercept:
        # Despite the name, X is NOT centered on this path.
        X_centered = X
        if backend_name == "torch":
            # Bring X and y to one floating dtype on X's device.
            dtype = _torch_promoted_float_dtype(X_centered, y)
            X_centered = X_centered.to(dtype=dtype)
            y = y.to(X_centered.device).to(dtype)
        init = None
        if self.family == "gamma" and loss_kwargs.get("link") == "inverse_power":
            # Build a starting vector for the inverse link. eta_lo is read
            # from the loss (fallback 1e-4).
            # NOTE(review): presumably the loss's lower clamp on the linear
            # predictor -- confirm against the loss class.
            eta_lo = float(getattr(loss, "_ETA_LO", 1e-4))
            if backend_name == "cupy":
                import cupy as cp
                if not cp.issubdtype(X_centered.dtype, cp.floating):
                    X_centered = X_centered.astype(cp.float64)
                # All initialisation arithmetic is done in float64.
                y_cp = cp.asarray(y, dtype=cp.float64)
                X_cp = cp.asarray(X_centered, dtype=cp.float64)
                # Least-squares fit of X onto the mean-centered 1 / y
                # (y clipped below at 1e-6).
                eta_raw = 1.0 / cp.clip(y_cp, 1e-6, None)
                eta_target = eta_raw - cp.mean(eta_raw)
                try:
                    init_cp, *_ = cp.linalg.lstsq(X_cp, eta_target, rcond=None)
                except cp.linalg.LinAlgError:
                    init_cp = cp.zeros(X.shape[1], dtype=cp.float64)
                eta_init = X_cp @ init_cp
                eta_abs_max = cp.max(cp.abs(eta_init))
                min_scale = eta_lo * 10.0
                # Rescale the start if its largest |eta| is below 10 * eta_lo.
                if float(eta_abs_max) < min_scale:
                    scale = min_scale / (float(eta_abs_max) + 1e-12)
                    init_cp = init_cp * scale
                    eta_init = X_cp @ init_cp
                # Fraction of observations whose |eta| is still below the
                # same threshold.
                near_zero_frac = cp.mean((cp.abs(eta_init) < (eta_lo * 10.0)).astype(cp.float64))
                if float(near_zero_frac) > 0.5:
                    # Fall back to the unit direction X^T (y - mean(y)),
                    # scaled so the median |eta| equals 20 * eta_lo.
                    g = X_cp.T @ (y_cp - cp.mean(y_cp))
                    g_norm = cp.sqrt(cp.sum(g * g))
                    if float(g_norm) > 0:
                        init_cp = g / g_norm
                        eta_g = X_cp @ init_cp
                        med_abs = float(cp.median(cp.abs(eta_g)))
                        target = eta_lo * 20.0
                        init_cp = init_cp * (target / (med_abs + 1e-12))
                # Hand the start to the solver in X's floating dtype.
                coef_dtype = (
                    X_centered.dtype
                    if cp.issubdtype(X_centered.dtype, cp.floating)
                    else cp.float64
                )
                init = init_cp.astype(coef_dtype, copy=False)
            elif backend_name == "torch":
                # Same initialisation steps as the cupy branch, in torch.
                import torch
                dtype = X_centered.dtype
                y_t = y.to(X.device).to(torch.float64)
                X_t = X_centered.to(X.device).to(torch.float64)
                eta_raw = 1.0 / torch.clamp(y_t, min=1e-6)
                eta_target = eta_raw - torch.mean(eta_raw)
                try:
                    init_t = torch.linalg.lstsq(X_t, eta_target).solution
                except RuntimeError:
                    init_t = torch.zeros(X.shape[1], dtype=torch.float64, device=X.device)
                eta_init = X_t @ init_t
                eta_abs_max = torch.max(torch.abs(eta_init))
                min_scale = eta_lo * 10.0
                if float(eta_abs_max.item()) < min_scale:
                    scale = min_scale / (float(eta_abs_max.item()) + 1e-12)
                    init_t = init_t * scale
                    eta_init = X_t @ init_t
                near_zero_frac = torch.mean((torch.abs(eta_init) < (eta_lo * 10.0)).to(torch.float64))
                if float(near_zero_frac.item()) > 0.5:
                    g = X_t.T @ (y_t - torch.mean(y_t))
                    g_norm = torch.sqrt(torch.sum(g * g))
                    if float(g_norm.item()) > 0:
                        init_t = g / g_norm
                        eta_g = X_t @ init_t
                        med_abs = float(torch.median(torch.abs(eta_g)).item())
                        target = eta_lo * 20.0
                        init_t = init_t * (target / (med_abs + 1e-12))
                # Cast the float64 start back to the working dtype of X.
                init = init_t.to(dtype)
            else:
                # Same initialisation steps in numpy. Unlike the GPU
                # branches, empty inputs are guarded explicitly here.
                if not np.issubdtype(X_centered.dtype, np.floating):
                    X_centered = X_centered.astype(np.float64)
                y_np = np.asarray(y, dtype=np.float64)
                X_np = np.asarray(X_centered, dtype=np.float64)
                eta_raw = 1.0 / np.clip(y_np, 1e-6, None)
                eta_target = eta_raw - np.mean(eta_raw)
                try:
                    init = np.linalg.lstsq(X_np, eta_target, rcond=None)[0]
                except np.linalg.LinAlgError:
                    init = np.zeros(X.shape[1], dtype=np.float64)
                eta_init = X_np @ init
                eta_abs_max = float(np.max(np.abs(eta_init))) if eta_init.size else 0.0
                min_scale = eta_lo * 10.0
                if eta_abs_max < min_scale:
                    init = init * (min_scale / (eta_abs_max + 1e-12))
                    eta_init = X_np @ init
                near_zero_frac = float(np.mean(np.abs(eta_init) < (eta_lo * 10.0))) if eta_init.size else 1.0
                if near_zero_frac > 0.5:
                    g = X_np.T @ (y_np - np.mean(y_np))
                    g_norm = float(np.sqrt(np.sum(g * g)))
                    if g_norm > 0:
                        init = g / g_norm
                        eta_g = X_np @ init
                        med_abs = float(np.median(np.abs(eta_g)))
                        target = eta_lo * 20.0
                        init = init * (target / (med_abs + 1e-12))
        coef, n_iter = fista_solver(
            loss, L2Penalty(alpha=0.0), X_centered, y,
            max_iter=self.max_iter, tol=self.tol,
            init_coef=init, sample_weight=sample_weight,
        )
        self.coef_ = _to_numpy(coef)
        self.n_iter_ = n_iter
        self.intercept_ = 0.0
        self._params = self.coef_.copy()
        self._df_resid = self._nobs - X.shape[1]
        return

    # ------------------------------------------------------------------
    # Path 2: intercept with a non-squared-error loss.
    # ------------------------------------------------------------------
    if loss.name != "squared_error":
        # All non-Gaussian GLM losses must optimize intercept jointly with
        # coefficients. Centering y is only valid for squared-error loss.
        # Augment X with intercept column (no penalty in _fit_fista).
        from statgpu.backends._utils import _get_xp
        xp = _get_xp(backend_name)
        if backend_name == "cupy":
            x_dtype = X.dtype if xp.issubdtype(X.dtype, xp.floating) else xp.float64
            X_float = X.astype(x_dtype, copy=False)
            X_aug = xp.column_stack([X_float, xp.ones(X.shape[0], dtype=x_dtype)])
        elif backend_name == "torch":
            import torch
            x_dtype = _torch_promoted_float_dtype(X, y)
            X_float = X.to(dtype=x_dtype)
            y = y.to(X.device).to(x_dtype)
            X_aug = torch.column_stack([X_float, torch.ones(X.shape[0], dtype=x_dtype, device=X.device)])
        else:
            X_aug = np.column_stack([X, np.ones(X.shape[0])])
        p = X.shape[1]
        # Compute mean on native backend to avoid GPU→CPU transfer
        _xp_mod = _get_xp(backend_name) if backend_name != "numpy" else np
        # Floor the mean at 1e-3 so the log / reciprocal below stay finite.
        y_mean = max(float(_xp_mod.mean(y)), 1e-3)
        # Coefficients start at zero; only the intercept slot (last entry)
        # gets a family-specific start derived from mean(y).
        init = np.zeros(p + 1, dtype=np.float64)
        if self.family == "binomial":
            # logit of the mean, clipped away from 0 and 1.
            p_mean = np.clip(y_mean, 1e-3, 1.0 - 1e-3)
            init[-1] = np.log(p_mean / (1.0 - p_mean))
        elif self.family == "gamma" and loss_kwargs.get("link") == "inverse_power":
            init[-1] = 1.0 / y_mean
        elif self.family in (
            "poisson", "gamma", "inverse_gaussian",
            "negative_binomial", "tweedie",
        ):
            init[-1] = np.log(y_mean)
        # Move the start to the backend / dtype of the augmented design.
        if backend_name == "cupy":
            init = _xp_mod.asarray(init, dtype=x_dtype)
        elif backend_name == "torch":
            init = torch.from_numpy(init).to(X.device).to(x_dtype)

        full_coef, n_iter = fista_solver(
            loss, L2Penalty(alpha=0.0), X_aug, y,
            max_iter=self.max_iter, tol=self.tol,
            init_coef=init, sample_weight=sample_weight,
        )

        # The intercept is the last entry because the ones column was
        # appended last; _params is stored intercept-first.
        full_np = _to_numpy(full_coef)
        self.coef_ = full_np[:p]
        self.intercept_ = float(full_np[p])
        self.n_iter_ = n_iter
        self._params = np.concatenate([[self.intercept_], self.coef_])
    # ------------------------------------------------------------------
    # Path 3: intercept with squared-error loss.
    # ------------------------------------------------------------------
    else:
        # Squared error: centering X and y preserves the objective.
        # NOTE(review): the means below are unweighted even when
        # sample_weight is passed to the solver -- confirm this is intended
        # for non-uniform weights.
        from statgpu.backends._utils import _get_xp
        xp = _get_xp(backend_name)
        if backend_name == "cupy":
            X_centered = X - xp.mean(X, axis=0)
            y_centered = y - xp.mean(y)
        elif backend_name == "torch":
            import torch
            x_dtype = _torch_promoted_float_dtype(X, y)
            X_float = X.to(dtype=x_dtype)
            y_float = y.to(X.device).to(x_dtype)
            X_centered = X_float - torch.mean(X_float, dim=0)
            y_centered = y_float - torch.mean(y_float)
        else:
            X_centered = X - X.mean(axis=0)
            y_centered = y - y.mean()

        coef, n_iter = fista_solver(
            loss, L2Penalty(alpha=0.0), X_centered, y_centered,
            max_iter=self.max_iter, tol=self.tol,
            init_coef=None, sample_weight=sample_weight,
        )

        # Recover the intercept from the original (uncentered) means.
        _xp_mod = _get_xp(backend_name) if backend_name != "numpy" else np
        X_mean = _to_numpy(_xp_mod.mean(X, axis=0))
        y_mean = float(_xp_mod.mean(y))
        self.coef_ = _to_numpy(coef)
        self.intercept_ = float(y_mean - X_mean @ self.coef_)
        self.n_iter_ = n_iter
        self._params = np.concatenate([[self.intercept_], self.coef_])

    # Both intercept paths spend one extra degree of freedom on the intercept.
    self._df_resid = self._nobs - (X.shape[1] + 1)
536
+
537
def _fit_smooth_solver(self, X, y, sample_weight, solver_name, backend_name):
    """Fit an unpenalized GLM with the Newton or L-BFGS solver.

    With an intercept in effect, a ones column is appended as the last
    column of the design matrix and the final parameter is stored as
    ``intercept_``. Sets ``coef_``, ``intercept_``, ``n_iter_``,
    ``_params`` (intercept first when present) and ``_df_resid``.

    Raises
    ------
    ValueError
        If ``sample_weight`` is given, or if the loss does not advertise
        ``has_hessian``.
    """
    from statgpu.glm_core import get_glm_loss
    from statgpu.solvers import lbfgs_solver, newton_solver

    if sample_weight is not None:
        raise ValueError(
            f"solver='{solver_name}' does not support sample_weight yet; "
            "use solver='irls' or solver='fista'."
        )

    loss_kwargs = self._get_loss_kwargs()
    loss = get_glm_loss(self.family_to_loss(), **loss_kwargs)
    if not getattr(loss, "has_hessian", False):
        raise ValueError(f"solver='{solver_name}' requires a Hessian.")

    with_intercept = self._effective_intercept
    n_features = X.shape[1]
    n_rows = X.shape[0]

    if with_intercept:
        from statgpu.backends._utils import _get_xp
        xp = _get_xp(backend_name)
        if backend_name == "cupy":
            is_float = getattr(X.dtype, "kind", "") == "f"
            work_dtype = X.dtype if is_float else xp.float64
            features = X.astype(work_dtype, copy=False)
            ones_col = xp.ones(n_rows, dtype=work_dtype)
            X_work = xp.column_stack([features, ones_col])
        elif backend_name == "torch":
            import torch
            work_dtype = _torch_promoted_float_dtype(X, y)
            features = X.to(dtype=work_dtype)
            y = y.to(X.device).to(work_dtype)
            ones_col = torch.ones(n_rows, dtype=work_dtype, device=X.device)
            X_work = torch.column_stack([features, ones_col])
        else:
            is_float = np.issubdtype(X.dtype, np.floating)
            work_dtype = X.dtype if is_float else np.float64
            features = X.astype(work_dtype, copy=False)
            ones_col = np.ones(n_rows, dtype=work_dtype)
            X_work = np.column_stack([features, ones_col])
    elif backend_name == "torch":
        # No intercept: only harmonize dtype/device between X and y.
        work_dtype = _torch_promoted_float_dtype(X, y)
        X_work = X.to(dtype=work_dtype)
        y = y.to(X.device).to(work_dtype)
    else:
        X_work = X

    solve = newton_solver if solver_name == "newton" else lbfgs_solver
    params, n_iter = solve(
        loss, None, X_work, y, max_iter=self.max_iter, tol=self.tol
    )

    host_params = _to_numpy(params)
    self.n_iter_ = n_iter
    if with_intercept:
        # The intercept sits last because the ones column was appended last.
        self.coef_ = host_params[:n_features]
        self.intercept_ = float(host_params[n_features])
        self._params = np.concatenate([[self.intercept_], self.coef_])
    else:
        self.coef_ = host_params.copy()
        self.intercept_ = 0.0
        self._params = self.coef_.copy()

    n_params = n_features + (1 if with_intercept else 0)
    self._df_resid = self._nobs - n_params
608
+
609
+ def _get_loss_kwargs(self):
610
+ """Override in subclass to pass extra kwargs to family/loss."""
611
+ return {}
612
+
613
def family_to_loss(self):
    """Translate ``self.family`` into the matching loss name.

    Raises
    ------
    ValueError
        If ``self.family`` is not one of the supported family names.
    """
    mapping = {
        "gaussian": "squared_error",
        "binomial": "logistic",
        "poisson": "poisson",
        "gamma": "gamma",
        "inverse_gaussian": "inverse_gaussian",
        "negative_binomial": "negative_binomial",
        "tweedie": "tweedie",
    }
    loss_name = mapping.get(self.family)
    if loss_name is None:
        raise ValueError(
            f"Unknown family '{self.family}'. "
            f"Supported families: {list(mapping.keys())}"
        )
    return loss_name
630
+
631
def predict(self, X):
    """Predict the mean response for ``X`` with the fitted coefficients.

    The linear predictor ``X @ coef_`` (plus ``intercept_`` when an
    intercept is in effect) is passed through the family's inverse link.

    When the model carries formula design info and ``X`` is a pandas
    DataFrame, ``X`` is first turned into a design matrix, and the
    ``"Intercept"`` column is dropped if the formula had an intercept.

    On the CUDA/TORCH devices the result is whatever the inverse link
    returns for the backend array; it is not converted to NumPy here.

    Raises
    ------
    RuntimeError
        If ``coef_`` is ``None`` (model not fitted).
    """
    if self.coef_ is None:
        raise RuntimeError("Model has not been fitted yet.")

    if self._design_info is not None:
        try:
            import pandas as pd
        except ImportError:
            pd = None
        if pd is not None and isinstance(X, pd.DataFrame):
            from statgpu.core.formula import FormulaParser

            # Create a parser without running its __init__ and hand it the
            # design info stored on the model.
            shell = FormulaParser.__new__(FormulaParser)
            shell._design_info = self._design_info
            shell.formula = None
            X = shell.transform(X)
            columns = list(self._design_info.column_names)
            if self._formula_has_intercept and "Intercept" in columns:
                X = np.delete(X, columns.index("Intercept"), axis=1)

    device = self._get_compute_device()
    family = self._get_family()
    from statgpu.backends._utils import _get_xp, xp_asarray

    if device not in (Device.CUDA, Device.TORCH):
        # CPU path: plain NumPy arithmetic.
        features = np.asarray(X)
        linear = features @ self.coef_
        if self._effective_intercept:
            linear += self.intercept_
        return family.link.inverse(linear)

    on_cuda = device == Device.CUDA
    xp = _get_xp("cupy" if on_cuda else "torch")
    design = xp_asarray(self._to_array(X, device), xp=xp)
    weights = xp_asarray(self.coef_, xp=xp, ref_arr=design)

    # Ensure float dtype for matmul (CUDA doesn't support Long matmul)
    if hasattr(design, 'is_floating_point'):
        if not design.is_floating_point():
            design = design.float()
    elif hasattr(design, 'dtype') and 'int' in str(design.dtype):
        design = xp_asarray(design, dtype=xp.float64, xp=xp)

    # Align dtypes for torch matmul compatibility
    if (
        hasattr(design, 'dtype')
        and hasattr(weights, 'dtype')
        and design.dtype != weights.dtype
    ):
        if hasattr(weights, 'to'):
            weights = weights.to(design.dtype)
        else:
            weights = xp_asarray(weights, dtype=design.dtype, xp=xp)

    linear = design @ weights
    if self._effective_intercept:
        linear = linear + xp_asarray(self.intercept_, xp=xp, ref_arr=design)
    result = family.link.inverse(linear)

    if on_cuda:
        self._cleanup_cuda_memory()
    else:
        self._cleanup_torch_memory()
    return result
683
+
684
+
685
+ class OrderedGeneralizedLinearModel(GeneralizedLinearModel):
686
+ """Ordered GLM base class.
687
+
688
+ Jointly estimates coefficients + (K-1) thresholds.
689
+ P(y <= j | X) = F(theta_j - X * beta)
690
+
691
+ Parameters
692
+ ----------
693
+ n_categories : int, default=3
694
+ Number of ordinal categories.
695
+ family : str, default='binomial'
696
+ Distribution family (should be Binomial for ordered models).
697
+ ... : same as GeneralizedLinearModel
698
+ """
699
+
700
def __init__(
    self,
    n_categories: int = 3,
    family: str = "binomial",
    fit_intercept: bool = True,
    max_iter: int = 100,
    tol: float = 1e-4,
    C: float = 1.0,
    device: Union[str, Device] = Device.AUTO,
    n_jobs: Optional[int] = None,
    solver: str = "auto",
    gpu_memory_cleanup: bool = False,
):
    """Configure an ordered GLM.

    Every argument except ``n_categories`` is forwarded unchanged to the
    parent constructor.  ``n_categories`` is stored on the instance and
    ``thresholds_`` starts out as ``None``.
    """
    parent_options = {
        "family": family,
        "fit_intercept": fit_intercept,
        "max_iter": max_iter,
        "tol": tol,
        "C": C,
        "device": device,
        "n_jobs": n_jobs,
        "solver": solver,
        "gpu_memory_cleanup": gpu_memory_cleanup,
    }
    super().__init__(**parent_options)
    self.n_categories = n_categories
    self.thresholds_ = None
726
+
727
def fit(self, X, y, sample_weight=None):
    """Fit the ordered GLM with L-BFGS on the selected backend.

    The backend chosen by ``self._get_backend(backend="auto")`` decides
    which private fitter runs: ``_fit_cupy_ordered``, ``_fit_torch_ordered``
    or (for any other backend name) ``_fit_scipy_ordered``.

    Parameters
    ----------
    X : array-like
        Feature matrix; indexed as ``(n_samples, n_features)`` after
        conversion by ``_to_array``.
    y : array-like
        Ordinal labels.  The fitters index probabilities with ``y``, so
        labels are expected to be integers in ``[0, n_categories - 1]``.
    sample_weight : None
        Not supported; any other value raises ``ValueError``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``sample_weight`` is given, or ``n_categories`` is smaller than 2.
    """
    if sample_weight is not None:
        raise ValueError(
            "OrderedGeneralizedLinearModel does not support sample_weight yet."
        )

    K = self.n_categories
    # With fewer than two categories there is no threshold to estimate and
    # the fitters would fail later with an opaque IndexError on ``pi[K - 2]``.
    if K < 2:
        raise ValueError(f"n_categories must be at least 2, got {K}.")

    backend = self._get_backend(backend="auto")
    backend_name = backend.name
    self._selected_backend_name = backend_name

    # Convert to backend format (cupy→cupy zero-copy, numpy→cupy/torch)
    X = self._to_array(X, backend=backend_name)
    y = self._to_array(y, backend=backend_name)

    # The shape is read from the converted array, so it does not depend on
    # the raw input exposing ``.shape``.
    n = X.shape[0]
    p = X.shape[1]
    self._nobs = n

    family = self._get_family()

    if backend_name == "cupy":
        self._fit_cupy_ordered(X, y, family, K, n, p)
    elif backend_name == "torch":
        self._fit_torch_ordered(X, y, family, K, n, p)
    else:
        self._fit_scipy_ordered(X, y, family, K, n, p)

    # p slopes plus K - 1 thresholds are estimated.
    self._df_resid = self._nobs - (p + K - 1)
    self._fitted = True
    self._cleanup_backend_memory(backend_name)
    return self
763
+
764
+ def _fit_scipy_ordered(self, X, y, family, K, n, p):
765
+ """Fit ordered GLM using scipy.optimize.minimize(L-BFGS-B)."""
766
+ from scipy.optimize import minimize
767
+
768
+ X = np.asarray(X, dtype=np.float64)
769
+ y = np.asarray(y, dtype=np.int64)
770
+
771
+ X_mean = X.mean(axis=0)
772
+ X_std = X.std(axis=0)
773
+ X_std[X_std < 1e-10] = 1.0
774
+ Xs = (X - X_mean) / X_std
775
+
776
+ theta_init = np.zeros(p + K - 1)
777
+ theta_init[p:] = np.arange(0.5, K - 0.5, dtype=np.float64)
778
+
779
+ cache = {"nll": None, "grad": None, "theta": None}
780
+
781
+ def nll_and_grad(theta):
782
+ if cache["theta"] is not None and np.array_equal(cache["theta"], theta):
783
+ return cache["nll"], cache["grad"]
784
+
785
+ beta = theta[:p]
786
+ thresh = theta[p:]
787
+
788
+ prob = self._ordered_category_probs(Xs, beta, thresh, family, K)
789
+ prob_c = np.clip(prob, 1e-15, None)
790
+ nll = -np.sum(np.log(prob_c[y, np.arange(n)])) / n
791
+
792
+ grad = self._ordered_gradient(
793
+ Xs, y, beta, thresh, prob, prob_c, family, K, n
794
+ )
795
+
796
+ cache["nll"] = nll
797
+ cache["grad"] = grad
798
+ cache["theta"] = theta
799
+ return nll, grad
800
+
801
+ def nll_func(theta):
802
+ val, _ = nll_and_grad(theta)
803
+ return val
804
+
805
+ def grad_func(theta):
806
+ _, g = nll_and_grad(theta)
807
+ return g
808
+
809
+ result = minimize(
810
+ nll_func, theta_init, jac=grad_func, method="L-BFGS-B",
811
+ options={"maxiter": self.max_iter, "ftol": self.tol * 1e-3,
812
+ "gtol": self.tol, "disp": False},
813
+ )
814
+
815
+ theta = result.x
816
+ beta_scaled = theta[:p]
817
+ self.coef_ = beta_scaled / X_std
818
+ thresh_est = np.sort(theta[p:])
819
+ self.thresholds_ = np.concatenate([[-np.inf], thresh_est, [np.inf]])
820
+ self._X_mean = X_mean
821
+ self._X_std = X_std
822
+ self.n_iter_ = result.nit if hasattr(result, "nit") else result.nfev
823
+
824
def _fit_cupy_ordered(self, X, y, family, K, n, p):
    """Fit the ordered GLM with a hand-written L-BFGS loop on CuPy arrays.

    Features are standardised, then the mean negative log-likelihood over
    ``theta = [beta (p), thresholds (K - 1)]`` is minimised with L-BFGS
    (two-loop recursion, backtracking Armijo line search).  Work buffers
    (prob, prob_c, eta, diff, deriv_all, scalar, inv_prob) are allocated
    once and overwritten on every objective evaluation.

    The optimiser always starts from the same initial point (zero slopes,
    thresholds at 0.5, 1.5, ...); no previous solution is reused.

    Host synchronisation still happens where scalars are pulled out with
    ``float(...)`` (curvature, slope, gradient norm) and when the results
    are copied back with ``.get()``.

    Sets ``coef_``, ``thresholds_``, ``_X_mean``, ``_X_std`` (all NumPy)
    and ``n_iter_``.
    """
    import cupy as cp

    X = cp.asarray(X, dtype=cp.float64)
    y = cp.asarray(y, dtype=cp.int64)

    # Standardise columns; near-constant columns keep scale 1 to avoid
    # dividing by ~0.
    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0)
    X_std[X_std < 1e-10] = 1.0
    Xs = (X - X_mean) / X_std

    # Pre-allocate reusable arrays (amortize GPU alloc overhead)
    _prob_pre = cp.zeros((K, n), dtype=cp.float64)
    _prob_c_pre = cp.zeros((K, n), dtype=cp.float64)
    _eta_pre = cp.zeros(n, dtype=cp.float64)
    _diff_pre = cp.zeros((K - 1, n), dtype=cp.float64)
    _deriv_all = cp.zeros((K - 1, n), dtype=cp.float64)
    _scalar = cp.zeros(n, dtype=cp.float64)
    _inv_prob = cp.zeros(n, dtype=cp.float64)
    _y_idx = cp.arange(n)

    def nll_and_grad_prealloc(theta_cp):
        """Return (mean NLL, gradient) at ``theta_cp``.

        Overwrites the shared pre-allocated buffers; only ``grad`` is a
        fresh array on each call.
        """
        beta = theta_cp[:p]
        thresh = theta_cp[p:]

        # Inline category probs using pre-allocated arrays
        _eta_pre[:] = Xs @ beta
        _diff_pre[:] = thresh[:, None] - _eta_pre[None, :]
        pi = family.link.inverse(_diff_pre)  # (K-1, n)

        # Category probabilities are differences of adjacent cumulative
        # probabilities.
        _prob_pre[0] = pi[0]
        for j in range(1, K - 1):
            _prob_pre[j] = pi[j] - pi[j - 1]
        _prob_pre[K - 1] = 1.0 - pi[K - 2]
        # Clip away from zero so the log and the reciprocal stay finite.
        _prob_c_pre[:] = cp.clip(_prob_pre, 1e-15, None)

        nll = -cp.sum(cp.log(_prob_c_pre[y, _y_idx])) / n

        # Gradient with pre-allocated arrays
        grad = cp.zeros(p + K - 1)
        for j in range(K - 1):
            _deriv_all[j] = self._ordered_link_derivative(_diff_pre[j], family)
        # 1 / P(observed category) for every sample.
        _inv_prob[:] = 1.0 / _prob_c_pre[y, _y_idx]

        # Threshold j enters with +density for samples in category j and
        # -density for samples in category j + 1.
        for j in range(K - 1):
            mask_pos = (y == j)
            mask_neg = (y == j + 1)
            grad[p + j] = -cp.sum(
                _inv_prob * (_deriv_all[j] * mask_pos - _deriv_all[j] * mask_neg)
            ) / n

        # Per-sample factor multiplying the feature row in dP/dbeta:
        # first category, last category, and everything in between.
        _scalar[:] = 0.0
        mask0 = (y == 0)
        mask_last = (y == K - 1)
        mask_mid = ~mask0 & ~mask_last
        _scalar[mask0] = -_deriv_all[0, mask0]
        _scalar[mask_last] = _deriv_all[K - 2, mask_last]
        idx_mid = cp.where(mask_mid)[0]
        _scalar[idx_mid] = (_deriv_all[y[idx_mid] - 1, idx_mid]
                            - _deriv_all[y[idx_mid], idx_mid])
        grad[:p] -= Xs.T @ (_inv_prob * _scalar) / n

        return nll, grad

    # Initial theta (matching scipy and torch: start from scratch)
    theta = cp.zeros(p + K - 1, dtype=cp.float64)
    theta[p:] = cp.arange(0.5, K - 0.5, dtype=cp.float64)

    # L-BFGS parameters
    # NOTE(review): c2 is not referenced below; only the Armijo
    # (sufficient decrease) condition with c1 is checked.
    c1, c2 = 1e-4, 0.9
    max_ls = 25    # maximum number of step halvings per iteration
    m_hist = 15    # number of (s, y) correction pairs kept
    min_iter = 5  # small guard against premature stop

    nll, grad = nll_and_grad_prealloc(theta)
    # Use infinity norm of gradient for convergence (matching scipy's gtol).
    gtol = self.tol
    grad_inf = float(cp.max(cp.abs(grad)))
    s_hist, y_hist, rho_hist = [], [], []
    H0 = 1.0
    n_iter = 0

    while n_iter < self.max_iter:
        # Check convergence using infinity norm (after min_iter iterations)
        if n_iter >= min_iter and grad_inf <= gtol:
            break
        s_old = theta.copy()
        g_old = grad.copy()
        nll_old = nll

        # Two-loop recursion
        q = grad.copy()
        alphas = []
        # Backward pass, newest pair first; alphas end up in oldest-first
        # order because each value is inserted at the front.
        for i in range(len(s_hist) - 1, -1, -1):
            a = rho_hist[i] * cp.dot(s_hist[i], q)
            alphas.insert(0, a)
            q = q - a * y_hist[i]

        # Scale the initial inverse Hessian by s'y / y'y of the newest pair.
        if s_hist:
            sy = float(cp.dot(s_hist[-1], y_hist[-1]))
            yy = float(cp.dot(y_hist[-1], y_hist[-1]))
            H0 = sy / (yy + 1e-30)

        r = H0 * q
        for i in range(len(s_hist)):
            b = rho_hist[i] * cp.dot(y_hist[i], r)
            r = r + s_hist[i] * (alphas[i] - b)

        d = -r
        gd = float(cp.dot(grad, d))
        # Fall back to steepest descent if d is not a descent direction.
        if gd >= -1e-12:
            d = -grad
            gd = float(cp.dot(grad, d))

        slope = gd
        step = 1.0

        # Armijo line search
        for _ in range(max_ls):
            theta_new = theta + step * d
            nll_new, grad_new = nll_and_grad_prealloc(theta_new)
            if nll_new <= nll_old + c1 * step * slope:
                break
            step *= 0.5
        else:
            # No step satisfied the Armijo condition: take the last
            # (smallest) step anyway and re-evaluate there.
            theta_new = theta + step * d
            nll_new, grad_new = nll_and_grad_prealloc(theta_new)

        # Update L-BFGS history
        s_new = theta_new - s_old
        y_new_arr = grad_new - g_old
        sy_val = float(cp.dot(s_new, y_new_arr))
        # Only keep pairs with positive curvature s'y.
        if sy_val > 1e-12:
            if len(s_hist) >= m_hist:
                s_hist.pop(0)
                y_hist.pop(0)
                rho_hist.pop(0)
            s_hist.append(s_new)
            y_hist.append(y_new_arr)
            rho_hist.append(1.0 / sy_val)

        theta = theta_new
        nll = nll_new
        grad = grad_new
        grad_inf = float(cp.max(cp.abs(grad)))
        n_iter += 1

    # Extract results
    beta_scaled = theta[:p]
    # Slopes are divided by the column scale before being stored.
    self.coef_ = (beta_scaled / X_std).get()
    thresh_est = cp.sort(theta[p:])
    self.thresholds_ = np.concatenate([[-np.inf], thresh_est.get(), [np.inf]])
    self._X_mean = X_mean.get()
    self._X_std = X_std.get()
    self.n_iter_ = n_iter
991
+
992
+ def _fit_torch_ordered(self, X, y, family, K, n, p):
993
+ """Fit ordered GLM using PyTorch autograd + LBFGS on GPU.
994
+
995
+ X and y are already torch.Tensor on CUDA (converted by _to_array in fit()).
996
+ No CuPy/NumPy bridge needed here — device purity is enforced upstream.
997
+ """
998
+ import torch
999
+
1000
+ assert isinstance(X, torch.Tensor), (
1001
+ f"_fit_torch_ordered expects torch.Tensor, got {type(X)}. "
1002
+ "Input should be converted by _to_array() before entering this method."
1003
+ )
1004
+
1005
+ torch_device = X.device
1006
+ if X.dtype != torch.float64:
1007
+ X = X.to(torch.float64)
1008
+ if not isinstance(y, torch.Tensor):
1009
+ y = torch.from_numpy(np.asarray(y, dtype=np.int64)).to(torch_device)
1010
+ elif y.dtype != torch.int64:
1011
+ y = y.to(torch.int64)
1012
+
1013
+ X_mean = X.mean(dim=0)
1014
+ X_std = X.std(dim=0)
1015
+ X_std = torch.where(X_std < 1e-10, torch.ones_like(X_std), X_std)
1016
+ Xs = (X - X_mean) / X_std
1017
+
1018
+ # Parameters: [beta (p), thresholds (K-1)]
1019
+ # Initialize thresholds uniformly
1020
+ theta_init = torch.zeros(p + K - 1, dtype=torch.float64, device=torch_device)
1021
+ theta_init[p:] = torch.arange(0.5, K - 0.5, dtype=torch.float64, device=torch_device)
1022
+ theta = torch.nn.Parameter(theta_init.clone())
1023
+
1024
+ n_samples = torch.tensor(float(n), dtype=torch.float64, device=torch_device)
1025
+ y_idx = torch.arange(n, device=torch_device)
1026
+
1027
+ def closure():
1028
+ optimizer.zero_grad()
1029
+ beta = theta[:p]
1030
+ thresh = theta[p:]
1031
+
1032
+ # Compute category probabilities with autograd
1033
+ eta = Xs @ beta # (n,)
1034
+ diff = thresh[:, None] - eta[None, :] # (K-1, n)
1035
+
1036
+ # Link inverse via family
1037
+ pi = family.link.inverse(diff) # (K-1, n)
1038
+
1039
+ # Category probabilities P(y=j)
1040
+ prob = torch.zeros((K, n), dtype=torch.float64, device=torch_device)
1041
+ prob[0] = pi[0]
1042
+ for j in range(1, K - 1):
1043
+ prob[j] = pi[j] - pi[j - 1]
1044
+ prob[K - 1] = 1.0 - pi[K - 2]
1045
+
1046
+ # Negative log-likelihood
1047
+ prob_c = torch.clamp(prob[y, y_idx], 1e-15, None)
1048
+ nll = -torch.mean(torch.log(prob_c))
1049
+
1050
+ nll.backward()
1051
+ return nll
1052
+
1053
+ # Torch L-BFGS — use strong_wolfe line_search for robust convergence
1054
+ # (ordered logit NLL landscape has steep gradients that cause lr=1.0
1055
+ # without line search to diverge into degenerate local minima)
1056
+ try:
1057
+ optimizer = torch.optim.LBFGS(
1058
+ [theta],
1059
+ lr=1.0,
1060
+ max_iter=self.max_iter,
1061
+ tolerance_grad=self.tol,
1062
+ tolerance_change=self.tol * 1e-3,
1063
+ line_search_fn='strong_wolfe',
1064
+ max_eval=self.max_iter * 25,
1065
+ )
1066
+ except TypeError:
1067
+ raise RuntimeError(
1068
+ "torch.optim.LBFGS with line_search_fn='strong_wolfe' is required "
1069
+ "for ordered model fitting. Upgrade to PyTorch >= 1.13 or use "
1070
+ "a different backend (numpy or cupy)."
1071
+ )
1072
+
1073
+ loss = optimizer.step(closure)
1074
+
1075
+ # Extract results
1076
+ theta_final = theta.detach()
1077
+ beta_scaled = theta_final[:p]
1078
+ thresh_est = torch.sort(theta_final[p:])[0]
1079
+
1080
+ self.coef_ = (beta_scaled / X_std).cpu().numpy()
1081
+ self.thresholds_ = np.concatenate([[-np.inf], thresh_est.cpu().numpy(), [np.inf]])
1082
+ self._X_mean = X_mean.cpu().numpy()
1083
+ self._X_std = X_std.cpu().numpy()
1084
+ try:
1085
+ state_dict = optimizer.state_dict()
1086
+ n_iter = 0
1087
+ for group in state_dict.get('state', {}).values():
1088
+ n_iter = max(n_iter, group.get('n_iter', 0))
1089
+ self.n_iter_ = n_iter if n_iter > 0 else self.max_iter
1090
+ except Exception:
1091
+ self.n_iter_ = self.max_iter
1092
+
1093
def _ordered_category_probs(self, X, beta, thresh, family, K):
    """Return the category probabilities P(y=j|X) as a (K, n) array.

    Cumulative probabilities come from the inverse link applied to
    ``thresh[j] - X @ beta``; each category probability is the difference
    of two adjacent cumulative values.
    """
    linear = X @ beta  # (n,)
    cumulative = family.link.inverse(thresh[:, None] - linear[None, :])  # (K-1, n)

    xp = _np_compat_xp(X)
    table = xp.zeros((K, X.shape[0]), dtype=getattr(X, 'dtype', None))

    top = K - 1
    # First category: everything below the first threshold.
    table[0] = cumulative[0]
    # Interior categories: mass between two consecutive thresholds.
    for upper in range(1, top):
        lower = upper - 1
        table[upper] = cumulative[upper] - cumulative[lower]
    # Last category: everything above the last threshold.
    table[top] = 1.0 - cumulative[top - 1]
    return table
1105
+
1106
def _ordered_gradient(self, X, y, beta, thresh, prob, prob_clipped, family, K, n):
    """Analytical gradient of the mean negative log-likelihood.

    Returns a vector of length ``p + K - 1``: the first ``p`` entries are
    the derivatives with respect to ``beta``, the remaining ``K - 1`` with
    respect to the thresholds.  ``prob`` is accepted for signature
    compatibility but not read; only ``prob_clipped`` is used.
    """
    xp = _np_compat_xp(X)
    n_features = X.shape[1]
    n_cuts = K - 1
    gradient = xp.zeros(n_features + n_cuts)

    linear = X @ beta  # (n,)

    # Link density evaluated at every (threshold, sample) pair: (n_cuts, n)
    margins = thresh[:, None] - linear[None, :]
    density = xp.empty_like(margins)
    for row in range(n_cuts):
        density[row] = self._ordered_link_derivative(margins[row], family)

    # Reciprocal of the probability of each sample's observed category.
    weight = 1.0 / prob_clipped[y, xp.arange(n)]  # (n,)

    # Threshold part: cut point ``row`` contributes +density for samples
    # in category ``row`` and -density for samples in category ``row + 1``.
    cut_terms = xp.zeros(n_cuts)
    for row in range(n_cuts):
        at_cut = (y == row)
        above_cut = (y == row + 1)
        cut_terms[row] = xp.sum(
            weight * (density[row] * at_cut - density[row] * above_cut)
        )
    gradient[n_features:] -= cut_terms / n

    # Slope part: each sample contributes X[i] times a scalar that depends
    # on whether its category is the first, the last, or an interior one.
    per_sample = xp.empty(n)
    is_first = (y == 0)
    is_last = (y == K - 1)
    is_middle = ~is_first & ~is_last
    per_sample[is_first] = -density[0, is_first]
    per_sample[is_last] = density[n_cuts - 1, is_last]
    middle_idx = xp.where(is_middle)[0]
    per_sample[middle_idx] = (density[y[middle_idx] - 1, middle_idx]
                              - density[y[middle_idx], middle_idx])

    gradient[:n_features] -= X.T @ (weight * per_sample) / n

    return gradient
1156
+
1157
+ def _ordered_link_derivative(self, x, family):
1158
+ """First derivative of link inverse F'(x) = density at x.
1159
+
1160
+ For logit: sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
1161
+ For probit: normal PDF φ(x).
1162
+ Both paths are backend-agnostic (numpy/cupy/torch).
1163
+ """
1164
+ if family.link.name == "probit":
1165
+ from statgpu.backends._array_ops import _xp, _exp, _scalar_tensor
1166
+ xp = _xp(x)
1167
+ two_pi = _scalar_tensor(2.0 * np.pi, x)
1168
+ return _exp(-0.5 * x * x) / xp.sqrt(two_pi)
1169
+ # logit: F * (1 - F) — element-wise, works for any backend
1170
+ F = family.link.inverse(x)
1171
+ return F * (1.0 - F)
1172
+
1173
+ def _ordered_link_second_derivative(self, x, family):
1174
+ """Second derivative of link inverse F''(x)."""
1175
+ mod = type(x).__module__
1176
+ is_cupy = mod.startswith('cupy')
1177
+ is_torch = mod.startswith('torch')
1178
+
1179
+ if family.link.name == "logit":
1180
+ F = family.link.inverse(x)
1181
+ return F * (1.0 - F) * (1.0 - 2.0 * F)
1182
+ elif family.link.name == "probit":
1183
+ # F''(x) = -x * φ(x) for standard normal PDF φ
1184
+ from statgpu.backends._array_ops import _xp, _exp, _scalar_tensor
1185
+ xp = _xp(x)
1186
+ two_pi = _scalar_tensor(2.0 * np.pi, x)
1187
+ phi = _exp(-0.5 * x * x) / xp.sqrt(two_pi)
1188
+ return -x * phi
1189
+ F = family.link.inverse(x)
1190
+ return F * (1.0 - F) * (1.0 - 2.0 * F)
1191
+
1192
def predict_proba(self, X):
    """Predict class probabilities P(y=j|X).

    The computation runs on the backend returned by
    ``self._get_backend(backend="auto")``; the result is returned as a
    NumPy array of shape (n_samples, n_categories).

    The linear predictor is ``(X - _X_mean) @ coef_``.  The fitters store
    ``coef_`` already divided by the column scale, so the features are
    only centred here; dividing them by ``_X_std`` as well would apply the
    scale twice and disagree with the predictor used during fitting.

    Raises
    ------
    RuntimeError
        If ``coef_`` is ``None`` (model not fitted).
    """
    self._check_is_fitted()
    if self.coef_ is None:
        raise RuntimeError("Model has not been fitted yet.")

    backend = self._get_backend(backend="auto")
    backend_name = backend.name
    X_arr = self._to_array(X, backend=backend_name)

    from statgpu.backends._utils import _get_xp, xp_asarray
    xp = _get_xp(backend_name)
    coef = xp_asarray(self.coef_, xp=xp, ref_arr=X_arr)
    X_mean = xp_asarray(self._X_mean, xp=xp, ref_arr=X_arr)
    thresholds = xp_asarray(self.thresholds_, xp=xp, ref_arr=X_arr)

    # Same linear predictor as in fit():
    # ((X - mean) / std) @ beta_scaled == (X - mean) @ (beta_scaled / std).
    eta = (X_arr - X_mean) @ coef
    family = self._get_family()
    diff = thresholds[:, None] - eta[None, :]
    pi = family.link.inverse(diff)  # (K+1, n) with -inf/+inf thresholds

    # Differences of adjacent cumulative probabilities give the K
    # category probabilities.
    if hasattr(xp, '__name__') and xp.__name__ == "torch":
        proba = xp.diff(pi, dim=0).T  # (n, K)
    else:
        proba = xp.diff(pi, axis=0).T  # (n, K)
    if backend_name != "numpy":
        out = _to_numpy(proba)
        self._cleanup_backend_memory(backend_name)
        return out
    return proba
1229
+
1230
def predict(self, X):
    """Predict class labels as the most probable category.

    Takes the probabilities from ``predict_proba`` and returns, for each
    sample, the index of the largest one, computed with ``np.argmax``.
    Backend selection is left to ``predict_proba``.
    """
    self._check_is_fitted()
    proba = self.predict_proba(X)
    return np.argmax(proba, axis=1)
1241
+
1242
def score(self, X, y):
    """Return the mean accuracy of ``predict(X)`` against ``y``.

    Labels and predictions are moved to the backend returned by
    ``self._get_backend(backend="auto")``, compared element-wise, and the
    mean of the matches is returned as a Python float.
    """
    self._check_is_fitted()

    backend_name = self._get_backend(backend="auto").name
    labels = self._to_array(y, backend=backend_name)
    predicted = self._to_array(self.predict(X), backend=backend_name)

    from statgpu.backends._utils import _get_xp, _to_float_scalar
    xp = _get_xp(backend_name)
    hits = xp.asarray(predicted == labels, dtype=xp.float64)
    accuracy = _to_float_scalar(xp.mean(hits))
    if backend_name != "numpy":
        self._cleanup_backend_memory(backend_name)
    return accuracy