statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. statgpu/__init__.py +174 -0
  2. statgpu/_base.py +544 -0
  3. statgpu/_config.py +127 -0
  4. statgpu/anova/__init__.py +5 -0
  5. statgpu/anova/_oneway.py +194 -0
  6. statgpu/backends/__init__.py +83 -0
  7. statgpu/backends/_array_ops.py +529 -0
  8. statgpu/backends/_base.py +184 -0
  9. statgpu/backends/_cupy.py +453 -0
  10. statgpu/backends/_factory.py +65 -0
  11. statgpu/backends/_gpu_inference_cupy.py +214 -0
  12. statgpu/backends/_gpu_inference_torch.py +422 -0
  13. statgpu/backends/_numpy.py +324 -0
  14. statgpu/backends/_torch.py +685 -0
  15. statgpu/backends/_torch_safe.py +47 -0
  16. statgpu/backends/_utils.py +423 -0
  17. statgpu/core/__init__.py +10 -0
  18. statgpu/core/formula/__init__.py +33 -0
  19. statgpu/core/formula/_design.py +99 -0
  20. statgpu/core/formula/_parser.py +191 -0
  21. statgpu/core/formula/_terms.py +70 -0
  22. statgpu/core/formula/tests/__init__.py +0 -0
  23. statgpu/core/formula/tests/test_parser.py +194 -0
  24. statgpu/covariance/__init__.py +6 -0
  25. statgpu/covariance/_empirical.py +310 -0
  26. statgpu/covariance/_shrinkage.py +248 -0
  27. statgpu/cross_validation/__init__.py +31 -0
  28. statgpu/cross_validation/_base.py +410 -0
  29. statgpu/cross_validation/_engine.py +167 -0
  30. statgpu/diagnostics/__init__.py +7 -0
  31. statgpu/diagnostics/_regression_diagnostics.py +188 -0
  32. statgpu/feature_selection/__init__.py +24 -0
  33. statgpu/feature_selection/_knockoff.py +870 -0
  34. statgpu/feature_selection/_knockoff_utils.py +1003 -0
  35. statgpu/feature_selection/_stepwise.py +300 -0
  36. statgpu/glm_core/__init__.py +81 -0
  37. statgpu/glm_core/_base.py +202 -0
  38. statgpu/glm_core/_family.py +362 -0
  39. statgpu/glm_core/_fused.py +149 -0
  40. statgpu/glm_core/_gamma.py +111 -0
  41. statgpu/glm_core/_inverse_gaussian.py +62 -0
  42. statgpu/glm_core/_irls.py +561 -0
  43. statgpu/glm_core/_logistic.py +82 -0
  44. statgpu/glm_core/_negative_binomial.py +68 -0
  45. statgpu/glm_core/_poisson.py +60 -0
  46. statgpu/glm_core/_solver_legacy.py +100 -0
  47. statgpu/glm_core/_squared.py +53 -0
  48. statgpu/glm_core/_tweedie.py +74 -0
  49. statgpu/inference/__init__.py +239 -0
  50. statgpu/inference/_distributions_backend.py +2610 -0
  51. statgpu/inference/_multiple_testing.py +391 -0
  52. statgpu/inference/_resampling.py +1400 -0
  53. statgpu/inference/_results.py +265 -0
  54. statgpu/linear_model/__init__.py +75 -0
  55. statgpu/linear_model/_gaussian_inference.py +306 -0
  56. statgpu/linear_model/_glm_base.py +1261 -0
  57. statgpu/linear_model/_ordered_logit.py +52 -0
  58. statgpu/linear_model/_ordered_probit.py +50 -0
  59. statgpu/linear_model/_stats.py +170 -0
  60. statgpu/linear_model/cv/__init__.py +13 -0
  61. statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
  62. statgpu/linear_model/cv/_lasso_cv.py +253 -0
  63. statgpu/linear_model/cv/_logistic_cv.py +895 -0
  64. statgpu/linear_model/cv/_ridge_cv.py +1160 -0
  65. statgpu/linear_model/legacy/__init__.py +1 -0
  66. statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
  67. statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
  68. statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
  69. statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
  70. statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
  71. statgpu/linear_model/legacy/_solver_legacy.py +104 -0
  72. statgpu/linear_model/penalized/__init__.py +25 -0
  73. statgpu/linear_model/penalized/_base.py +437 -0
  74. statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
  75. statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
  76. statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
  77. statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
  78. statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
  79. statgpu/linear_model/penalized/_penalized_linear.py +236 -0
  80. statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
  81. statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
  82. statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
  83. statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
  84. statgpu/linear_model/penalized/_predict_mixin.py +182 -0
  85. statgpu/linear_model/wrappers/__init__.py +31 -0
  86. statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
  87. statgpu/linear_model/wrappers/_elasticnet.py +75 -0
  88. statgpu/linear_model/wrappers/_gamma.py +67 -0
  89. statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
  90. statgpu/linear_model/wrappers/_lasso.py +2124 -0
  91. statgpu/linear_model/wrappers/_linear.py +1127 -0
  92. statgpu/linear_model/wrappers/_logistic.py +1435 -0
  93. statgpu/linear_model/wrappers/_mcp.py +58 -0
  94. statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
  95. statgpu/linear_model/wrappers/_poisson.py +48 -0
  96. statgpu/linear_model/wrappers/_ridge.py +166 -0
  97. statgpu/linear_model/wrappers/_scad.py +58 -0
  98. statgpu/linear_model/wrappers/_tweedie.py +57 -0
  99. statgpu/metrics/__init__.py +21 -0
  100. statgpu/metrics/_classification.py +591 -0
  101. statgpu/nonparametric/__init__.py +50 -0
  102. statgpu/nonparametric/kernel_methods/__init__.py +25 -0
  103. statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
  104. statgpu/nonparametric/kernel_methods/_krr.py +234 -0
  105. statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
  106. statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
  107. statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
  108. statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
  109. statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
  110. statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
  111. statgpu/nonparametric/splines/__init__.py +5 -0
  112. statgpu/nonparametric/splines/_bspline_basis.py +336 -0
  113. statgpu/nonparametric/splines/_penalized.py +349 -0
  114. statgpu/panel/__init__.py +19 -0
  115. statgpu/panel/_covariance.py +140 -0
  116. statgpu/panel/_fixed_effects.py +420 -0
  117. statgpu/panel/_random_effects.py +385 -0
  118. statgpu/panel/_utils.py +482 -0
  119. statgpu/penalties/__init__.py +139 -0
  120. statgpu/penalties/_adaptive_l1.py +313 -0
  121. statgpu/penalties/_base.py +261 -0
  122. statgpu/penalties/_categories.py +39 -0
  123. statgpu/penalties/_elasticnet.py +98 -0
  124. statgpu/penalties/_group_lasso.py +678 -0
  125. statgpu/penalties/_group_mcp.py +553 -0
  126. statgpu/penalties/_group_scad.py +605 -0
  127. statgpu/penalties/_l1.py +107 -0
  128. statgpu/penalties/_l2.py +77 -0
  129. statgpu/penalties/_mcp.py +237 -0
  130. statgpu/penalties/_scad.py +260 -0
  131. statgpu/semiparametric/__init__.py +5 -0
  132. statgpu/semiparametric/_gam.py +401 -0
  133. statgpu/solvers/__init__.py +24 -0
  134. statgpu/solvers/_admm.py +241 -0
  135. statgpu/solvers/_constants.py +15 -0
  136. statgpu/solvers/_convergence.py +6 -0
  137. statgpu/solvers/_fista.py +436 -0
  138. statgpu/solvers/_fista_bb.py +513 -0
  139. statgpu/solvers/_fista_lla.py +541 -0
  140. statgpu/solvers/_lbfgs.py +206 -0
  141. statgpu/solvers/_newton.py +149 -0
  142. statgpu/solvers/_utils.py +277 -0
  143. statgpu/survival/__init__.py +14 -0
  144. statgpu/survival/_cox.py +3974 -0
  145. statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
  146. statgpu/survival/_cox_cv.py +1159 -0
  147. statgpu/survival/_cox_efron_cuda.py +1280 -0
  148. statgpu/survival/_cox_efron_triton.py +359 -0
  149. statgpu/unsupervised/__init__.py +29 -0
  150. statgpu/unsupervised/_agglomerative.py +307 -0
  151. statgpu/unsupervised/_dbscan.py +263 -0
  152. statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
  153. statgpu/unsupervised/_gmm.py +332 -0
  154. statgpu/unsupervised/_incremental_pca.py +176 -0
  155. statgpu/unsupervised/_kmeans.py +261 -0
  156. statgpu/unsupervised/_minibatch_kmeans.py +299 -0
  157. statgpu/unsupervised/_minibatch_nmf.py +252 -0
  158. statgpu/unsupervised/_nmf.py +190 -0
  159. statgpu/unsupervised/_pca.py +189 -0
  160. statgpu/unsupervised/_truncated_svd.py +132 -0
  161. statgpu/unsupervised/_tsne.py +192 -0
  162. statgpu/unsupervised/_umap.py +224 -0
  163. statgpu/unsupervised/_utils.py +134 -0
  164. statgpu-0.1.0.dist-info/METADATA +245 -0
  165. statgpu-0.1.0.dist-info/RECORD +168 -0
  166. statgpu-0.1.0.dist-info/WHEEL +5 -0
  167. statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
  168. statgpu-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1160 @@
1
+ """
2
+ RidgeCV: Cross-validated Ridge regression with GPU support.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ __all__ = ["RidgeCV"]
8
+
9
+ from typing import Any, Dict, Optional, Tuple, Union
10
+ from collections import OrderedDict
11
+ import hashlib
12
+ import warnings
13
+ import numpy as np
14
+
15
+ from statgpu._config import Device
16
+ from statgpu.cross_validation._base import CVEstimatorBase
17
+ from statgpu.backends import get_backend, _torch_dev
18
+ from statgpu.backends._factory import _cupy_backend, _torch_backend
19
+ from statgpu.linear_model.wrappers._ridge import Ridge
20
+
21
+
22
+ # =============================================================================
23
+ # CV Cache for Ridge
24
+ # =============================================================================
25
+
26
+ import threading
27
+
28
# Upper bound on how many CV results are kept alive at the same time.
_RIDGE_CV_ALPHA_CACHE_MAXSIZE = 64
# Ordered store used as an LRU: least recently used entries sit at the front.
_RIDGE_CV_ALPHA_CACHE: "OrderedDict[Tuple[Any, ...], Dict[str, Any]]" = OrderedDict()
# Guards every read-modify-write of the cache above.
_RIDGE_CV_CACHE_LOCK = threading.Lock()


def _ridge_cv_cache_get(key):
    """Look up a stored Ridge CV result.

    Parameters
    ----------
    key : hashable or None
        Cache key. ``None`` means "caching disabled" and always misses.

    Returns
    -------
    value or None
        The stored object (the same instance that was put, not a copy), or
        ``None`` on a miss. A hit is promoted to most-recently-used.
    """
    if key is None:
        return None
    with _RIDGE_CV_CACHE_LOCK:
        hit = _RIDGE_CV_ALPHA_CACHE.get(key)
        if hit is None:
            return None
        # Refresh recency so this entry is the last candidate for eviction.
        _RIDGE_CV_ALPHA_CACHE.move_to_end(key)
        return hit


def _ridge_cv_cache_put(key, value):
    """Store a Ridge CV result, evicting the oldest entries beyond the cap.

    Parameters
    ----------
    key : hashable or None
        Cache key. ``None`` means "caching disabled"; nothing is stored.
    value : object
        Result to remember under ``key``.
    """
    if key is None:
        return
    with _RIDGE_CV_CACHE_LOCK:
        store = _RIDGE_CV_ALPHA_CACHE
        store[key] = value
        # Overwriting an existing key keeps its old position; force it to the end.
        store.move_to_end(key)
        excess = len(store) - _RIDGE_CV_ALPHA_CACHE_MAXSIZE
        for _ in range(max(excess, 0)):
            # last=False pops from the front, i.e. the least recently used entry.
            store.popitem(last=False)
53
+
54
+
55
def _make_ridge_cv_auto_cache_key(X, y, alphas, folds, fit_intercept, use_gpu, sample_weight=None):
    """Build a content-based cache key for one Ridge CV run.

    The data fingerprint comes from the shared ``hash_cv_data`` helper; the
    Ridge-specific settings and the exact fold layout are then mixed in.

    Every field is written as ``<8-byte length><payload>``. Without that
    framing, variable-length fields are simply concatenated, so two different
    fold layouts (e.g. train=[0, 1], val=[2] vs. train=[0], val=[1, 2]) feed
    the hash the same byte stream and collide.

    Parameters
    ----------
    X, y : array-like
        Design matrix and response; ``X`` must expose ``dtype``.
    alphas : array-like
        Candidate regularisation strengths.
    folds : iterable of (train_idx, val_idx)
        CV splits. Index containers may be arrays exposing ``tobytes()`` or
        anything ``numpy.asarray`` accepts (lists, tuples).
    fit_intercept : bool
        Whether an intercept is fitted.
    use_gpu : bool
        Whether the GPU path is used.
    sample_weight : array-like or None
        Optional sample weights, forwarded to ``hash_cv_data``.

    Returns
    -------
    str
        Hex digest (BLAKE2b, 32-byte digest).
    """
    from statgpu.cross_validation._base import hash_cv_data

    digest = hashlib.blake2b(digest_size=32)

    def _feed(payload):
        # Length prefix keeps adjacent variable-length fields unambiguous.
        payload = bytes(payload)
        digest.update(len(payload).to_bytes(8, "little"))
        digest.update(payload)

    def _index_bytes(idx):
        # Prefer the array's own serialiser (works for device arrays that
        # implement it); fall back to NumPy for plain Python sequences.
        to_bytes = getattr(idx, "tobytes", None)
        if to_bytes is not None:
            return to_bytes()
        return np.asarray(idx).tobytes()

    # NOTE(review): assumes hash_cv_data returns a bytes-like digest, as the
    # original passed its result straight to hashlib's update().
    _feed(hash_cv_data(X, y, sample_weight))
    _feed(str(X.dtype).encode("utf-8"))
    _feed(np.asarray(alphas, dtype=np.float64).tobytes())
    _feed(str(fit_intercept).encode("utf-8"))
    _feed(str(use_gpu).encode("utf-8"))

    # Hash every index of every fold; train and validation parts are framed
    # separately so the split point is part of the key.
    for train_idx, val_idx in folds:
        _feed(_index_bytes(train_idx))
        _feed(_index_bytes(val_idx))

    return digest.hexdigest()
76
+
77
+
78
+ # =============================================================================
79
+ # K-fold helper
80
+ # =============================================================================
81
+
82
+ from statgpu.cross_validation._base import kfold_indices as _kfold_indices, folds_are_complete as _folds_are_complete, batch_mse as _batch_mse_cv
83
+
84
+
85
+ # =============================================================================
86
+ # Alpha grid generation
87
+ # =============================================================================
88
+
89
def _default_ridge_alpha_grid(X, y, n_alphas: int = 100, alpha_min_ratio: float = 1e-3):
    """
    Generate the default log-spaced alpha grid for Ridge CV.

    The upper end is a data-driven heuristic, ``2 * max|Xc' yc| / n_samples``
    computed on mean-centred data; the lower end is ``alpha_min_ratio`` times
    that value. Ridge has no exact "all coefficients are zero" threshold, so
    the factor 2 only widens the range; CV picks the best alpha empirically.

    NOTE(review): an earlier docstring described this as mirroring
    scikit-learn; treat the grid as a heuristic of this module.

    Parameters
    ----------
    X : ndarray
        Design matrix (n_samples, n_features).
    y : ndarray
        Response vector; flattened to 1-D.
    n_alphas : int
        Number of alpha values to generate. Values <= 1 yield a single-element
        grid containing only ``alpha_max``.
    alpha_min_ratio : float
        Minimum alpha as a ratio of max alpha.

    Returns
    -------
    alphas : ndarray
        Float64 alpha values in increasing order, from ``alpha_min`` to
        ``alpha_max``.
    """
    X_arr = np.asarray(X, dtype=np.float64)
    y_arr = np.asarray(y, dtype=np.float64).reshape(-1)

    # Centre both sides so the heuristic is not affected by an intercept.
    X_centered = X_arr - np.mean(X_arr, axis=0)
    y_centered = y_arr - np.mean(y_arr)

    # Only X'y is needed for alpha_max; the Gram matrix X'X is not formed
    # because nothing here consumes it.
    Xty = X_centered.T @ y_centered

    n_samples = X_arr.shape[0]
    alpha_max = np.max(np.abs(Xty)) * 2.0 / n_samples

    # Constant y (or X orthogonal to y) gives 0, whose log10 is undefined.
    if alpha_max == 0:
        alpha_max = 1.0

    if n_alphas <= 1:
        return np.array([alpha_max])

    alpha_min = alpha_max * alpha_min_ratio
    return np.logspace(
        np.log10(alpha_min),
        np.log10(alpha_max),
        num=n_alphas,
        dtype=np.float64,
    )
148
+
149
+
150
+ # =============================================================================
151
+ # Batch MSE computation
152
+ # =============================================================================
153
+
154
+
155
+ # =============================================================================
156
+ # GPU batch solver for Ridge
157
+ # =============================================================================
158
+
159
def _solve_ridge_path_gpu_from_gram_eig(XtX_batch, Xty_batch, alphas, backend, fit_intercept=True, n_samples_vec=None):
    """
    Solve the Ridge path for several folds at once via eigendecomposition.

    One symmetric eigendecomposition per fold is reused for every alpha, so
    no per-alpha factorisation is needed.

    Mathematical formulation:
    - Given XtX = Q @ Lambda @ Q.T (eigendecomposition)
    - Ridge solution: coef(alpha) = (XtX + alpha_eff*I)^-1 @ Xty
    - Using eigenbasis: coef(alpha) = Q @ diag(1/(lambda_i + alpha_eff)) @ Q.T @ Xty
    - alpha_eff = alpha * n_samples_vec[fold] when ``n_samples_vec`` is given,
      otherwise alpha_eff = alpha.

    Parameters
    ----------
    XtX_batch : array-like
        Batch of Gram matrices (n_folds, n_features, n_features).
    Xty_batch : array-like
        Batch of cross products (n_folds, n_features).
    alphas : array-like
        Alpha values (n_alphas,). Anything ``backend.asarray`` accepts.
    backend : BackendBase
        Backend instance providing ``xp``, ``asarray`` and ``transpose``.
    fit_intercept : bool
        Not used here; kept for API compatibility.
    n_samples_vec : array-like or None
        Per-fold scale (n_folds,) multiplied into alpha. ``None`` leaves
        alpha unscaled.

    Returns
    -------
    coefs : backend array
        Coefficients for each alpha and fold (n_alphas, n_folds, n_features),
        left on the backend's device.
    """
    xp = backend.xp

    # Step 1: one eigendecomposition per fold.
    # eigvals: (n_folds, n_features), Q: (n_folds, n_features, n_features)
    eigvals, Q = xp.linalg.eigh(XtX_batch)

    # Clamp eigenvalues so rank-deficient (or slightly negative, from
    # round-off) X'X cannot produce a zero denominator. The floor is 1e-15
    # unless the dtype's smallest normal number is larger than that.
    try:
        eig_floor = max(float(xp.finfo(eigvals.dtype).tiny), 1e-15)
    except (AttributeError, TypeError):
        eig_floor = 1e-15
    # Pass the floor as a backend array rather than a Python float:
    # torch.maximum only accepts tensors, so a bare float would be rejected
    # if backend.xp is the raw torch module (NOTE(review): confirm what the
    # Torch backend exposes as ``xp``).
    eigvals = xp.maximum(eigvals, backend.asarray(eig_floor, dtype=eigvals.dtype))

    # Step 2: project Xty into the eigenbasis -> (n_folds, n_features)
    Q_T = backend.transpose(Q, (0, 2, 1))
    QTXty = xp.matmul(Q_T, Xty_batch[:, :, None])[:, :, 0]

    # Step 3: inverse shifted spectrum for every alpha
    # inv_diag: (n_folds, n_features, n_alphas)
    alphas_arr = backend.asarray(alphas, dtype=eigvals.dtype)
    if n_samples_vec is not None:
        # Scale alpha by n_samples to match Ridge.fit() convention.
        n_arr = backend.asarray(n_samples_vec, dtype=eigvals.dtype).reshape(-1, 1, 1)
        inv_diag = 1.0 / (eigvals[:, :, None] + alphas_arr[None, None, :] * n_arr)
    else:
        inv_diag = 1.0 / (eigvals[:, :, None] + alphas_arr[None, None, :])

    # Step 4: scale the projected Xty -> (n_folds, n_features, n_alphas)
    scaled = QTXty[:, :, None] * inv_diag

    # Step 5: rotate back to the original basis -> (n_folds, n_features, n_alphas)
    coefs = xp.matmul(Q, scaled)

    # Step 6: reorder axes to (n_alphas, n_folds, n_features); the result is
    # not converted to a host array here.
    return backend.transpose(coefs, (2, 0, 1))
236
+
237
+
238
def _solve_ridge_path_gpu_from_gram(XtX_batch, Xty_batch, n_samples_vec, alphas, backend, fit_intercept=True):
    """
    Solve the Ridge path for all folds and alphas from per-fold Gram statistics.

    Thin entry point that forwards to the eigendecomposition-based solver,
    which handles every alpha in one vectorised pass.

    Parameters
    ----------
    XtX_batch : array-like
        Batch of Gram matrices (n_folds, n_features, n_features).
    Xty_batch : array-like
        Batch of cross products (n_folds, n_features).
    n_samples_vec : np.ndarray
        Per-fold sample count (or weight sum); forwarded to the solver, which
        multiplies alpha by it.
    alphas : ndarray
        Alpha values (n_alphas,).
    backend : BackendBase
        Backend instance (CuPyBackend or TorchBackend).
    fit_intercept : bool
        Whether intercept is fitted; forwarded unchanged.

    Returns
    -------
    coefs_desc : ndarray
        Coefficients for each alpha and fold (n_alphas, n_folds, n_features).
    """
    return _solve_ridge_path_gpu_from_gram_eig(
        XtX_batch,
        Xty_batch,
        alphas,
        backend,
        fit_intercept=fit_intercept,
        n_samples_vec=n_samples_vec,
    )
267
+
268
+
269
+ # =============================================================================
270
+ # Main CV selection function
271
+ # =============================================================================
272
+
273
+ def _select_ridge_alpha_cv(
274
+ X,
275
+ y,
276
+ *,
277
+ alphas=None,
278
+ n_alphas: int = 100,
279
+ alpha_min_ratio: float = 1e-3,
280
+ cv_folds: int = 5,
281
+ cv_splits=None,
282
+ random_state: Optional[int] = None,
283
+ sample_weight=None,
284
+ fit_intercept: bool = True,
285
+ device: Union[str, Device] = Device.CPU,
286
+ return_details: bool = False,
287
+ cache_key: Optional[Tuple[Any, ...]] = None,
288
+ gpu_cv_mixed_precision: bool = True,
289
+ ):
290
+ """
291
+ Select alpha for Ridge regression via K-fold cross-validation.
292
+
293
+ Parameters
294
+ ----------
295
+ X : array-like
296
+ Design matrix (n_samples, n_features).
297
+ y : array-like
298
+ Response vector.
299
+ alphas : array-like or None
300
+ Alpha values to try. If None, generates n_alphas values.
301
+ n_alphas : int
302
+ Number of alpha values (if alphas is None).
303
+ alpha_min_ratio : float
304
+ Minimum alpha ratio.
305
+ cv_folds : int
306
+ Number of CV folds.
307
+ cv_splits : list or None
308
+ Pre-computed CV splits. If None, uses K-fold.
309
+ random_state : int or None
310
+ Random seed for CV splits.
311
+ sample_weight : array-like or None
312
+ Sample weights.
313
+ fit_intercept : bool
314
+ Whether to fit intercept.
315
+ device : str or Device
316
+ Device to use ('cpu' or 'cuda').
317
+ return_details : bool
318
+ Whether to return full CV details.
319
+ cache_key : tuple or None
320
+ Cache key for CV results.
321
+ gpu_cv_mixed_precision : bool
322
+ Whether to use mixed precision on GPU.
323
+
324
+ Returns
325
+ -------
326
+ alpha : float
327
+ Best alpha value.
328
+ details : dict (if return_details=True)
329
+ Full CV results including alpha grid, MSE path, etc.
330
+ """
331
+ if isinstance(device, Device):
332
+ device = device.value
333
+ device_name = str(device).lower()
334
+ use_gpu = device_name in (Device.CUDA.value, Device.TORCH.value, "torch")
335
+ gpu_requested = use_gpu
336
+
337
+ gpu_input_cupy = False
338
+ gpu_input_torch = False
339
+ if use_gpu:
340
+ # Check if inputs are already on GPU (CuPy or Torch)
341
+ try:
342
+ import cupy as cp
343
+ gpu_input_cupy = isinstance(X, cp.ndarray) and isinstance(y, cp.ndarray)
344
+ if sample_weight is not None and not isinstance(sample_weight, cp.ndarray):
345
+ gpu_input_cupy = False
346
+ except Exception:
347
+ pass
348
+
349
+ # Also check for torch tensors
350
+ if not gpu_input_cupy:
351
+ try:
352
+ import torch
353
+ gpu_input_torch = isinstance(X, torch.Tensor) and isinstance(y, torch.Tensor)
354
+ if sample_weight is not None and not isinstance(sample_weight, torch.Tensor):
355
+ gpu_input_torch = False
356
+ except Exception:
357
+ pass
358
+
359
+ X_np = None
360
+ y_np = None
361
+ sample_weight_np = None
362
+
363
+ if gpu_input_cupy or gpu_input_torch:
364
+ # GPU inputs - get backend for validation
365
+ # Use torch backend for torch tensors, cupy for cupy arrays
366
+ if gpu_input_torch:
367
+ backend = get_backend(backend='torch', device='cuda')
368
+ else:
369
+ backend = get_backend(backend='cupy', device='cuda')
370
+ if len(tuple(X.shape)) != 2:
371
+ raise ValueError("X must be a 2D array")
372
+ n_samples = int(X.shape[0])
373
+ y_check = backend.asarray(y).reshape(-1)
374
+ if int(y_check.shape[0]) != n_samples:
375
+ raise ValueError("y must have the same number of rows as X")
376
+ if sample_weight is not None:
377
+ sw_check = backend.asarray(sample_weight).reshape(-1)
378
+ if int(sw_check.shape[0]) != n_samples:
379
+ raise ValueError("sample_weight must have the same number of rows as X")
380
+ else:
381
+ X_np = np.asarray(X, dtype=np.float64)
382
+ y_np = np.asarray(y, dtype=np.float64).reshape(-1)
383
+ if sample_weight is not None:
384
+ sample_weight_np = np.asarray(sample_weight, dtype=np.float64).reshape(-1)
385
+ if X_np.ndim != 2:
386
+ raise ValueError("X must be a 2D array")
387
+ if y_np.shape[0] != X_np.shape[0]:
388
+ raise ValueError("y must have the same number of rows as X")
389
+ if sample_weight_np is not None and sample_weight_np.shape[0] != X_np.shape[0]:
390
+ raise ValueError("sample_weight must have the same number of rows as X")
391
+ n_samples = int(X_np.shape[0])
392
+
393
+ # Generate alpha grid
394
+ if alphas is None:
395
+ if gpu_input_cupy or gpu_input_torch or use_gpu:
396
+ # GPU path for alpha grid generation
397
+ if gpu_input_torch:
398
+ backend = get_backend(backend='torch', device='cuda')
399
+ else:
400
+ backend = get_backend(backend='cupy', device='cuda')
401
+ X_temp = backend.asarray(X)
402
+ y_temp = backend.asarray(y)
403
+ X_mean = backend.mean(X_temp, axis=0)
404
+ y_mean = backend.mean(y_temp)
405
+ X_centered = X_temp - X_mean
406
+ y_centered = y_temp - y_mean
407
+ XtX = X_centered.T @ X_centered
408
+ Xty = X_centered.T @ y_centered
409
+ n = int(X.shape[0])
410
+ alpha_max = float(backend.max(backend.abs(Xty)) * 2.0 / n)
411
+ if alpha_max == 0:
412
+ alpha_max = 1.0
413
+ alpha_min = alpha_max * alpha_min_ratio
414
+ alpha_grid = np.logspace(np.log10(alpha_min), np.log10(alpha_max), num=n_alphas)
415
+ del X_temp, y_temp, X_mean, y_mean, X_centered, y_centered, XtX, Xty
416
+ else:
417
+ alpha_grid = _default_ridge_alpha_grid(X_np, y_np, n_alphas=n_alphas, alpha_min_ratio=alpha_min_ratio)
418
+ else:
419
+ alpha_grid = np.asarray(alphas, dtype=np.float64)
420
+ alpha_grid = alpha_grid[np.isfinite(alpha_grid)]
421
+ alpha_grid = alpha_grid[alpha_grid > 0.0]
422
+ if alpha_grid.size == 0:
423
+ warnings.warn("All provided alphas were filtered; using default grid.", RuntimeWarning)
424
+ if gpu_input_cupy or gpu_input_torch or use_gpu:
425
+ # GPU path for alpha grid generation
426
+ backend = get_backend(backend="auto", device="cuda")
427
+ X_temp = backend.asarray(X)
428
+ y_temp = backend.asarray(y)
429
+ X_mean = backend.mean(X_temp, axis=0)
430
+ y_mean = backend.mean(y_temp)
431
+ X_centered = X_temp - X_mean
432
+ y_centered = y_temp - y_mean
433
+ XtX = X_centered.T @ X_centered
434
+ Xty = X_centered.T @ y_centered
435
+ n = int(X.shape[0])
436
+ alpha_max = float(backend.max(backend.abs(Xty)) * 2.0 / n)
437
+ if alpha_max == 0:
438
+ alpha_max = 1.0
439
+ alpha_min = alpha_max * alpha_min_ratio
440
+ alpha_grid = np.logspace(np.log10(alpha_min), np.log10(alpha_max), num=n_alphas)
441
+ else:
442
+ alpha_grid = _default_ridge_alpha_grid(X_np, y_np, n_alphas=n_alphas, alpha_min_ratio=alpha_min_ratio)
443
+
444
+ # Handle degenerate cases
445
+ if int(n_samples) < 4 or int(alpha_grid.size) == 1 or int(cv_folds) < 2:
446
+ alpha0 = float(alpha_grid[0])
447
+ if not return_details:
448
+ return alpha0
449
+ return {
450
+ "alpha": alpha0,
451
+ "alphas": alpha_grid.astype(np.float64, copy=False),
452
+ "mse_path": np.full((int(alpha_grid.size), 1), np.nan, dtype=np.float64),
453
+ "mean_mse": np.full(int(alpha_grid.size), np.nan, dtype=np.float64),
454
+ }
455
+
456
+ # Generate CV folds
457
+ if cv_splits is not None:
458
+ folds = cv_splits
459
+ else:
460
+ folds = _kfold_indices(n_samples=int(n_samples), n_splits=int(cv_folds), random_state=random_state)
461
+
462
+ folds_are_complete = _folds_are_complete(folds, n_samples=int(n_samples))
463
+
464
+ alpha_grid = alpha_grid.astype(np.float64, copy=False)
465
+ n_alpha = int(alpha_grid.size)
466
+ n_folds = int(len(folds))
467
+
468
+ # Cache handling
469
+ # Auto-cache disabled by default to prevent stale results across datasets.
470
+ # Only use explicit cache_key if provided by the caller.
471
+ cache_key_eff = cache_key
472
+
473
+ cached_details = _ridge_cv_cache_get(cache_key_eff)
474
+ if cached_details is not None:
475
+ if return_details:
476
+ return cached_details
477
+ return float(cached_details["alpha"])
478
+
479
+ # Initialize MSE path
480
+ mse_path = np.full((n_alpha, n_folds), np.nan, dtype=np.float64)
481
+
482
+ # GPU path
483
+ if use_gpu:
484
+ try:
485
+ # Get backend based on input data type to avoid cross-backend conversion
486
+ # Torch input -> TorchBackend, CuPy input -> CuPyBackend
487
+ import torch
488
+ try:
489
+ import cupy as cp
490
+ cupy_available = True
491
+ except ImportError:
492
+ cupy_available = False
493
+
494
+ # Detect input type and select appropriate backend
495
+ if hasattr(X, '__module__') and 'torch' in str(type(X).__module__):
496
+ backend = _torch_backend
497
+ elif cupy_available and hasattr(X, '__cuda_array_interface__'):
498
+ backend = _cupy_backend
499
+ else:
500
+ # Default to auto-selection for numpy input
501
+ backend = get_backend(backend='auto', device='cuda')
502
+
503
+ xp = backend.xp
504
+
505
+ cv_dtype = backend.float32 if bool(gpu_cv_mixed_precision) else backend.float64
506
+
507
+ # Convert inputs to backend arrays
508
+ if gpu_input_cupy or gpu_input_torch:
509
+ # Already on GPU (CuPy or Torch)
510
+ X_full = backend.asarray(X, dtype=cv_dtype)
511
+ y_full = backend.asarray(y, dtype=cv_dtype).reshape(-1)
512
+ if sample_weight is not None:
513
+ sw_full = backend.asarray(sample_weight, dtype=cv_dtype).reshape(-1)
514
+ else:
515
+ sw_full = None
516
+ else:
517
+ # Convert from numpy
518
+ X_full = backend.asarray(X_np, dtype=cv_dtype)
519
+ y_full = backend.asarray(y_np, dtype=cv_dtype)
520
+ if sample_weight_np is not None:
521
+ sw_full = backend.asarray(sample_weight_np, dtype=cv_dtype)
522
+ else:
523
+ sw_full = None
524
+
525
+ # Precompute for fast fold statistics
526
+ XtX_folds = []
527
+ Xty_folds = []
528
+ n_train_folds = []
529
+ X_mean_folds = []
530
+ y_mean_folds = []
531
+
532
+ # For batched MSE evaluation (Phase 2 optimization)
533
+ X_val_folds = []
534
+ y_val_folds = []
535
+ sw_val_folds = []
536
+ n_val_folds = []
537
+
538
+ fast_fold_stats = (sw_full is None) and bool(folds_are_complete)
539
+ sw_train = None # initialized per-fold in slow path; None for fast path
540
+ if fast_fold_stats:
541
+ n_total = int(X_full.shape[0])
542
+ XtX_full = X_full.T @ X_full
543
+ Xty_full = X_full.T @ y_full
544
+ if bool(fit_intercept):
545
+ X_sum_full = backend.sum(X_full, axis=0)
546
+ y_sum_full = backend.sum(y_full)
547
+ else:
548
+ X_sum_full = None
549
+ y_sum_full = None
550
+
551
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
552
+ train_idx_gpu = backend.asarray(train_idx)
553
+ val_idx_gpu = backend.asarray(val_idx)
554
+
555
+ X_val = X_full[val_idx_gpu]
556
+ y_val = y_full[val_idx_gpu]
557
+ sw_val = None if sw_full is None else sw_full[val_idx_gpu]
558
+
559
+ # Store validation data for batched MSE
560
+ X_val_folds.append(X_val)
561
+ y_val_folds.append(y_val)
562
+ sw_val_folds.append(sw_val)
563
+ n_val_folds.append(int(val_idx_gpu.shape[0]))
564
+
565
+ if fast_fold_stats:
566
+ n_val = int(val_idx_gpu.shape[0])
567
+ n_train = int(n_total - n_val)
568
+
569
+ XtX_val = X_val.T @ X_val
570
+ Xty_val = X_val.T @ y_val
571
+ XtX_raw = XtX_full - XtX_val
572
+ Xty_raw = Xty_full - Xty_val
573
+
574
+ if bool(fit_intercept):
575
+ X_sum_val = backend.sum(X_val, axis=0)
576
+ y_sum_val = backend.sum(y_val)
577
+ X_sum_train = X_sum_full - X_sum_val
578
+ y_sum_train = y_sum_full - y_sum_val
579
+
580
+ inv_n = backend.asarray(1.0 / float(max(1, n_train)), dtype=X_full.dtype)
581
+ X_mean = X_sum_train * inv_n
582
+ y_mean = y_sum_train * inv_n
583
+ XtX = XtX_raw - backend.outer(X_sum_train, X_sum_train) * inv_n
584
+ Xty = Xty_raw - X_sum_train * y_mean
585
+ else:
586
+ X_mean = backend.zeros((X_full.shape[1],), dtype=X_full.dtype)
587
+ y_mean = backend.array(0.0, dtype=X_full.dtype)
588
+ XtX = XtX_raw
589
+ Xty = Xty_raw
590
+ else:
591
+ X_train = X_full[train_idx_gpu]
592
+ y_train = y_full[train_idx_gpu]
593
+ sw_train = None if sw_full is None else sw_full[train_idx_gpu]
594
+
595
+ if sw_train is not None:
596
+ # Weighted Ridge: use X'WX, X'Wy directly
597
+ sw_col = sw_train[:, None]
598
+ if bool(fit_intercept):
599
+ w_sum = max(float(backend.sum(sw_train)), 1e-15)
600
+ X_wmean = backend.sum(X_train * sw_col, axis=0) / w_sum
601
+ y_wmean = backend.sum(y_train * sw_train) / w_sum
602
+ XtX = (X_train * sw_col).T @ X_train - w_sum * backend.outer(X_wmean, X_wmean)
603
+ Xty = (X_train * sw_col).T @ y_train - w_sum * X_wmean * y_wmean
604
+ X_mean = X_wmean
605
+ y_mean = y_wmean
606
+ else:
607
+ XtX = (X_train * sw_col).T @ X_train
608
+ Xty = (X_train * sw_col).T @ y_train
609
+ X_mean = backend.zeros((X_train.shape[1],), dtype=X_train.dtype)
610
+ y_mean = backend.array(0.0, dtype=X_train.dtype)
611
+ n_train = float(sw_train.sum()) # Use weight sum for regularization consistency
612
+ else:
613
+ if bool(fit_intercept):
614
+ X_mean = backend.mean(X_train, axis=0)
615
+ y_mean = backend.mean(y_train)
616
+ X_centered = X_train - X_mean
617
+ y_centered = y_train - y_mean
618
+ else:
619
+ X_mean = backend.zeros((X_train.shape[1],), dtype=X_train.dtype)
620
+ y_mean = backend.array(0.0, dtype=X_train.dtype)
621
+ X_centered = X_train
622
+ y_centered = y_train
623
+
624
+ XtX = X_centered.T @ X_centered
625
+ Xty = X_centered.T @ y_centered
626
+ n_train = int(X_train.shape[0])
627
+
628
+ XtX_folds.append(XtX)
629
+ Xty_folds.append(Xty)
630
+ # For weighted Ridge, n_train is sum(sw) (float); for unweighted, it's the count (int)
631
+ n_train_folds.append(float(n_train) if sw_train is not None else int(n_train))
632
+ X_mean_folds.append(X_mean)
633
+ y_mean_folds.append(y_mean)
634
+
635
+ # Batch solve for all alphas (Phase 1 optimization)
636
+ XtX_batch = backend.stack(XtX_folds, axis=0)
637
+ Xty_batch = backend.stack(Xty_folds, axis=0)
638
+ # Use float64 to preserve fractional sum(sw) for weighted Ridge
639
+ n_samples_vec = np.asarray(n_train_folds, dtype=np.float64)
640
+
641
+ coefs_batch = _solve_ridge_path_gpu_from_gram(
642
+ XtX_batch, Xty_batch, n_samples_vec, alpha_grid, backend, fit_intercept=bool(fit_intercept)
643
+ )
644
+
645
+ # Batch compute intercepts (Phase 2 optimization)
646
+ X_mean_batch = backend.stack(X_mean_folds, axis=0) # (n_folds, n_features)
647
+ y_mean_batch = backend.stack(y_mean_folds, axis=0) # (n_folds,)
648
+
649
+ intercepts_batch = _compute_intercepts_batch(
650
+ coefs_batch, X_mean_batch, y_mean_batch, backend, fit_intercept=bool(fit_intercept)
651
+ ) # (n_alphas, n_folds)
652
+
653
+ # Batch compute MSE for all folds (Phase 2 optimization)
654
+ # Pad validation sets to same size
655
+ n_val_max = max(n_val_folds)
656
+ n_features = int(X_full.shape[1])
657
+
658
+ # Pre-allocate padded batches (Phase 3 optimization - memory pre-allocation)
659
+ X_val_batch = backend.zeros((n_folds, n_val_max, n_features), dtype=cv_dtype)
660
+ y_val_batch = backend.zeros((n_folds, n_val_max), dtype=cv_dtype)
661
+
662
+ if sw_full is not None:
663
+ sw_val_batch = backend.zeros((n_folds, n_val_max), dtype=cv_dtype)
664
+ else:
665
+ sw_val_batch = None
666
+
667
+ # Fill padded batches
668
+ for fold_idx in range(n_folds):
669
+ n_val = n_val_folds[fold_idx]
670
+ X_val_batch[fold_idx, :n_val, :] = X_val_folds[fold_idx]
671
+ y_val_batch[fold_idx, :n_val] = y_val_folds[fold_idx]
672
+ if sw_val_batch is not None:
673
+ sw_val_batch[fold_idx, :n_val] = sw_val_folds[fold_idx]
674
+
675
+ # Batched MSE computation (fully vectorized)
676
+ mse_path_gpu = _batch_mse_all_folds(
677
+ X_val_batch, y_val_batch, coefs_batch, intercepts_batch, backend, sw_val_batch,
678
+ n_val_folds=n_val_folds,
679
+ )
680
+
681
+ # Convert to numpy
682
+ mse_path = backend.to_numpy(mse_path_gpu)
683
+
684
+ except Exception as exc:
685
+ raise RuntimeError(
686
+ "GPU path failed in _select_ridge_alpha_cv with device='cuda'; "
687
+ "CPU fallback is disabled for strict CUDA execution."
688
+ ) from exc
689
+
690
+ # CPU path
691
+ if not use_gpu:
692
+ if gpu_requested:
693
+ raise RuntimeError(
694
+ "device='cuda' requested but GPU path was not executed; "
695
+ "CPU fallback is disabled for strict CUDA execution."
696
+ )
697
+
698
+ fast_fold_stats = (sample_weight_np is None) and bool(folds_are_complete)
699
+ if fast_fold_stats:
700
+ n_total = int(X_np.shape[0])
701
+ XtX_full = X_np.T @ X_np
702
+ Xty_full = X_np.T @ y_np
703
+ if bool(fit_intercept):
704
+ X_sum_full = np.sum(X_np, axis=0)
705
+ y_sum_full = float(np.sum(y_np))
706
+ else:
707
+ X_sum_full = None
708
+ y_sum_full = None
709
+
710
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
711
+ X_val = X_np[val_idx]
712
+ y_val = y_np[val_idx]
713
+ sw_val = None if sample_weight_np is None else sample_weight_np[val_idx]
714
+
715
+ if fast_fold_stats:
716
+ n_val = int(np.asarray(val_idx, dtype=np.int64).reshape(-1).size)
717
+ n_train = int(n_total - n_val)
718
+
719
+ XtX_val = X_val.T @ X_val
720
+ Xty_val = X_val.T @ y_val
721
+ XtX_raw = XtX_full - XtX_val
722
+ Xty_raw = Xty_full - Xty_val
723
+
724
+ if bool(fit_intercept):
725
+ X_sum_val = np.sum(X_val, axis=0)
726
+ y_sum_val = float(np.sum(y_val))
727
+ X_sum_train = X_sum_full - X_sum_val
728
+ y_sum_train = y_sum_full - y_sum_val
729
+
730
+ inv_n = 1.0 / float(max(1, n_train))
731
+ X_mean = X_sum_train * inv_n
732
+ y_mean = y_sum_train * inv_n
733
+ XtX = XtX_raw - np.outer(X_sum_train, X_sum_train) * inv_n
734
+ Xty = Xty_raw - X_sum_train * y_mean
735
+ else:
736
+ X_mean = np.zeros((X_np.shape[1],), dtype=np.float64)
737
+ y_mean = 0.0
738
+ XtX = XtX_raw
739
+ Xty = Xty_raw
740
+ else:
741
+ X_train = X_np[train_idx]
742
+ y_train = y_np[train_idx]
743
+ sw_train = None if sample_weight_np is None else sample_weight_np[train_idx]
744
+
745
+ if sw_train is not None:
746
+ # Weighted Ridge: use X'WX, X'Wy directly (matches GPU path)
747
+ sw_col = sw_train[:, np.newaxis]
748
+ if bool(fit_intercept):
749
+ w_sum = max(float(np.sum(sw_train)), 1e-15)
750
+ X_wmean = np.sum(X_train * sw_col, axis=0) / w_sum
751
+ y_wmean = float(np.sum(y_train * sw_train)) / w_sum
752
+ XtX = (X_train * sw_col).T @ X_train - w_sum * np.outer(X_wmean, X_wmean)
753
+ Xty = (X_train * sw_col).T @ y_train - w_sum * X_wmean * y_wmean
754
+ X_mean = X_wmean
755
+ y_mean = y_wmean
756
+ else:
757
+ XtX = (X_train * sw_col).T @ X_train
758
+ Xty = (X_train * sw_col).T @ y_train
759
+ X_mean = np.zeros((X_train.shape[1],), dtype=np.float64)
760
+ y_mean = 0.0
761
+ n_train = float(np.sum(sw_train))
762
+ else:
763
+ if bool(fit_intercept):
764
+ X_mean = np.mean(X_train, axis=0)
765
+ y_mean = float(np.mean(y_train))
766
+ X_centered = X_train - X_mean
767
+ y_centered = y_train - y_mean
768
+ else:
769
+ X_mean = np.zeros((X_train.shape[1],), dtype=np.float64)
770
+ y_mean = 0.0
771
+ X_centered = X_train
772
+ y_centered = y_train
773
+
774
+ XtX = X_centered.T @ X_centered
775
+ Xty = X_centered.T @ y_centered
776
+ n_train = int(X_train.shape[0])
777
+
778
+ # Solve for all alphas: (XtX + n_eff*alpha*I)^-1 @ Xty
779
+ # n_eff scaling matches Ridge.fit() and PGLM exact ridge.
780
+ I = np.eye(XtX.shape[0])
781
+ coefs_desc = []
782
+ for alpha in alpha_grid:
783
+ XtX_reg = XtX + alpha * float(n_train) * I
784
+ try:
785
+ coef = np.linalg.solve(XtX_reg, Xty)
786
+ except np.linalg.LinAlgError:
787
+ coef = np.linalg.lstsq(XtX_reg, Xty, rcond=None)[0]
788
+ coefs_desc.append(coef.flatten())
789
+ coefs_desc = np.stack(coefs_desc, axis=0)
790
+
791
+ # Compute intercepts
792
+ if bool(fit_intercept):
793
+ # X_mean: (p,), coefs_desc: (n_alphas, p)
794
+ # X_mean @ coefs_desc.T = coefs_desc @ X_mean = (n_alphas,)
795
+ intercepts_desc = y_mean - coefs_desc @ X_mean
796
+ else:
797
+ intercepts_desc = np.zeros((coefs_desc.shape[0],))
798
+
799
+ # Compute MSE
800
+ mse_desc = _batch_mse_cv(X_val, y_val, coefs_desc, intercepts_desc, sample_weight=sw_val)
801
+ mse_path[:, fold_idx] = mse_desc
802
+
803
+ # Compute mean MSE across folds
804
+ mean_mse = np.nanmean(mse_path, axis=1)
805
+
806
+ # Find best alpha (minimum MSE)
807
+ best_idx = int(np.nanargmin(mean_mse))
808
+ best_alpha = float(alpha_grid[best_idx])
809
+
810
+ details = {
811
+ "alpha": best_alpha,
812
+ "alphas": alpha_grid,
813
+ "mse_path": mse_path,
814
+ "mean_mse": mean_mse,
815
+ }
816
+
817
+ _ridge_cv_cache_put(cache_key_eff, details)
818
+
819
+ if return_details:
820
+ return details
821
+ return best_alpha
822
+
823
+
824
+ # =============================================================================
825
+ # GPU MSE helper — batched across folds
826
+ # =============================================================================
827
+
828
def _batch_mse_all_folds(X_val_batch, y_val_batch, coefs_batch, intercepts_batch, backend, sample_weights_batch=None, n_val_folds=None):
    """
    Compute MSE for all folds and all alphas simultaneously (fully vectorized).

    Predictions for every (fold, alpha) pair are produced with one batched
    matmul, so no Python-level loop over folds or alphas is needed.

    Parameters
    ----------
    X_val_batch : array-like
        Batched validation matrices (n_folds, n_val_max, n_features).
        Padded with zeros if fold sizes differ.
    y_val_batch : array-like
        Batched validation responses (n_folds, n_val_max).
        Padded with zeros if fold sizes differ.
    coefs_batch : array-like
        Coefficient matrix (n_alphas, n_folds, n_features). Same device as X_val_batch.
    intercepts_batch : array-like
        Intercept matrix (n_alphas, n_folds). Same device as X_val_batch.
    backend : BackendBase
        Backend instance; ``backend.xp``, ``backend.transpose`` and
        ``backend.asarray`` are used.
    sample_weights_batch : array-like or None
        Batched sample weights (n_folds, n_val_max), or None for an
        unweighted mean.
    n_val_folds : sequence of int or None
        True (unpadded) number of validation rows per fold. When given,
        rows with index >= n_val_folds[f] in fold f are treated as padding:
        their residuals are zeroed and they are left out of the
        denominator. When None, every row counts.

    Returns
    -------
    mse : array-like
        MSE for each alpha and fold (n_alphas, n_folds). Same device as input.
    """
    xp = backend.xp

    # (n_alphas, n_folds, n_features) -> (n_folds, n_features, n_alphas) so the
    # batched matmul yields predictions of shape (n_folds, n_val_max, n_alphas).
    coefs_T = backend.transpose(coefs_batch, (1, 2, 0))
    y_pred = xp.matmul(X_val_batch, coefs_T)

    is_torch = _torch_dev(coefs_batch) is not None

    def _expand(arr, dim):
        # torch tensors expose unsqueeze; the other backends use expand_dims.
        return arr.unsqueeze(dim) if is_torch else xp.expand_dims(arr, axis=dim)

    # intercepts_batch.T is (n_folds, n_alphas); inserting axis 1 gives
    # (n_folds, 1, n_alphas), which broadcasts over the validation rows.
    y_pred = y_pred + _expand(intercepts_batch.T, 1)

    # y_val_batch: (n_folds, n_val_max) -> (n_folds, n_val_max, 1)
    residuals = y_pred - _expand(y_val_batch, 2)

    # Padded rows have X == 0 and y == 0, so their residual equals the
    # intercept; zero them so they do not inflate the squared error.
    mask = None
    if n_val_folds is not None:
        n_val_max = residuals.shape[1]
        if is_torch:
            import torch
            row_ids = torch.arange(n_val_max, device=residuals.device).unsqueeze(0)
            fold_sizes = torch.tensor(n_val_folds, device=residuals.device).unsqueeze(1)
            mask = (row_ids < fold_sizes).unsqueeze(2).to(residuals.dtype)
        else:
            row_ids = xp.arange(n_val_max).reshape(1, -1)
            fold_sizes = xp.asarray(n_val_folds).reshape(-1, 1)
            mask = (row_ids < fold_sizes)[:, :, xp.newaxis].astype(residuals.dtype)
        # mask: (n_folds, n_val_max, 1), 1 for real rows and 0 for padding
        residuals = residuals * mask

    if sample_weights_batch is not None:
        sw = _expand(sample_weights_batch, 2)  # (n_folds, n_val_max, 1)
        ssr = xp.sum(sw * residuals ** 2, axis=1)  # (n_folds, n_alphas)
        # Only weights of real (unpadded) rows enter the denominator.
        sw_sum = xp.sum(sw * mask, axis=1) if mask is not None else xp.sum(sw, axis=1)
        # Guard against a zero weight sum (avoid division by zero).
        sw_sum_safe = xp.where(sw_sum > 0, sw_sum, xp.ones_like(sw_sum))
        # sw_sum_safe is (n_folds, 1) and broadcasts against ssr directly.
        mse = (ssr / sw_sum_safe).T  # (n_alphas, n_folds)
    elif n_val_folds is not None:
        ssr = xp.sum(residuals ** 2, axis=1)  # (n_folds, n_alphas)
        # Divide by the true fold size rather than the padded length.
        n_val_vec = backend.asarray(n_val_folds, dtype=ssr.dtype).reshape(-1, 1)
        mse = (ssr / n_val_vec).T  # (n_alphas, n_folds)
    else:
        mse = xp.mean(residuals ** 2, axis=1).T  # (n_alphas, n_folds)

    return mse
912
+
913
+
914
+ def _compute_intercepts_batch(coefs_batch, X_mean_batch, y_mean_batch, backend, fit_intercept=True):
915
+ """
916
+ Compute intercepts for all alphas and all folds simultaneously.
917
+
918
+ Parameters
919
+ ----------
920
+ coefs_batch : array-like
921
+ Coefficient matrix (n_alphas, n_folds, n_features). Can be GPU or CPU.
922
+ X_mean_batch : array-like
923
+ Training set means (n_folds, n_features).
924
+ y_mean_batch : array-like
925
+ Training set response means (n_folds,).
926
+ backend : BackendBase
927
+ Backend instance.
928
+ fit_intercept : bool
929
+ Whether to compute intercepts.
930
+
931
+ Returns
932
+ -------
933
+ intercepts : array-like
934
+ Intercept matrix (n_alphas, n_folds). Same device as input.
935
+ """
936
+ xp = backend.xp
937
+
938
+ if not fit_intercept:
939
+ return backend.zeros((coefs_batch.shape[0], coefs_batch.shape[1]), dtype=coefs_batch.dtype)
940
+
941
+ n_alphas = coefs_batch.shape[0]
942
+ n_folds = coefs_batch.shape[1]
943
+ n_features = coefs_batch.shape[2]
944
+
945
+ # Compute coefs @ X_mean for each fold
946
+ # Reshape coefs to (n_alphas * n_folds, n_features)
947
+ coefs_reshaped = coefs_batch.reshape((n_alphas * n_folds, n_features))
948
+
949
+ # Tile X_mean for each alpha
950
+ X_mean_tiled = xp.tile(X_mean_batch, (n_alphas, 1))
951
+
952
+ # Batched dot product: sum over features
953
+ coefs_dot_sum = xp.sum(coefs_reshaped * X_mean_tiled, axis=1) # (n_alphas * n_folds,)
954
+ coefs_dot_sum = coefs_dot_sum.reshape((n_alphas, n_folds)) # (n_alphas, n_folds)
955
+
956
+ # y_mean_batch: (n_folds,) -> (1, n_folds) broadcasts to (n_alphas, n_folds)
957
+ if _torch_dev(coefs_batch) is not None:
958
+ y_mean_expanded = y_mean_batch.unsqueeze(0)
959
+ else:
960
+ y_mean_expanded = xp.expand_dims(y_mean_batch, axis=0)
961
+ intercepts = y_mean_expanded - coefs_dot_sum
962
+
963
+ return intercepts
964
+
965
+
966
+ # =============================================================================
967
+ # RidgeCV Class
968
+ # =============================================================================
969
+
970
class RidgeCV(CVEstimatorBase):
    """
    Cross-validated Ridge regression with GPU support.

    This class implements K-fold cross-validation to select the optimal
    regularization parameter alpha for Ridge regression, then refits a
    ``Ridge`` estimator on the full data with the selected alpha.

    Parameters
    ----------
    alphas : array-like or None
        Alpha values to try. If None, generates n_alphas values.
    n_alphas : int
        Number of alpha values (if alphas is None). Default is 100.
    alpha_min_ratio : float
        Minimum alpha as a ratio of max alpha.
    cv : int
        Number of CV folds. Default is 5.
    cv_splits : object or None
        Optional user-supplied CV splits, forwarded unchanged to the
        alpha-selection routine. Presumably an iterable of
        (train_idx, val_idx) pairs -- confirm against
        ``_select_ridge_alpha_cv``.
    fit_intercept : bool
        Whether to fit intercept. Default is True.
    device : str or Device
        Computation device: 'cpu', 'cuda', or 'auto'.
    n_jobs : int or None
        Number of parallel jobs (not yet implemented).
    compute_inference : bool
        Whether to compute standard errors, t-stats, p-values and CI.
    cov_type : str
        Covariance estimator for inference. One of:
        'nonrobust', 'hc0', 'hc1', 'hc2', 'hc3', 'hac'.
    gpu_memory_cleanup : bool
        Whether to free CuPy memory pool after fitting.
    random_state : int or None
        Random seed for CV splits.
    gpu_cv_mixed_precision : bool
        Whether to use mixed precision on GPU.

    Attributes
    ----------
    alpha_ : float
        Selected alpha value.
    alphas_ : ndarray
        All alpha values tested.
    cv_results_ : dict
        CV results with keys ``"mse_path"`` (per-alpha, per-fold MSE) and
        ``"mean_mse"`` (MSE averaged over folds for each alpha).
    mean_mse_ : ndarray
        MSE averaged over folds for each alpha (same array as
        ``cv_results_["mean_mse"]``).
    best_score_ : float
        Negative of the minimum mean CV MSE (sklearn convention: higher is
        better). NaN when no mean MSE is finite.
    coef_ : ndarray
        Coefficients of the final model.
    intercept_ : float
        Intercept of the final model.
    n_iter_ : object or None
        ``n_iter_`` of the final Ridge estimator if it defines one,
        otherwise None.
    estimator_ : Ridge
        The fitted Ridge estimator with selected alpha.

    Examples
    --------
    >>> import numpy as np
    >>> from statgpu.linear_model import RidgeCV
    >>> X = np.random.randn(1000, 20)
    >>> y = X @ np.random.randn(20) + 0.1 * np.random.randn(1000)
    >>> model = RidgeCV(cv=5, device='cuda')
    >>> model.fit(X, y)
    >>> print(f"Selected alpha: {model.alpha_:.4f}")
    >>> print(f"Best CV score: {model.best_score_:.4f}")
    """

    def __init__(
        self,
        alphas=None,
        n_alphas: int = 100,
        alpha_min_ratio: float = 1e-3,
        cv: int = 5,
        cv_splits=None,
        fit_intercept: bool = True,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
        compute_inference: bool = True,
        cov_type: str = "nonrobust",
        gpu_memory_cleanup: bool = False,
        random_state: Optional[int] = None,
        gpu_cv_mixed_precision: bool = True,
    ):
        super().__init__(
            cv=cv,
            random_state=random_state,
            device=device,
            n_jobs=n_jobs,
        )
        # Hyper-parameters, coerced to plain Python types.
        self.alphas = alphas
        self.n_alphas = int(n_alphas)
        self.alpha_min_ratio = float(alpha_min_ratio)
        self.cv = int(cv)
        self.cv_splits = cv_splits
        self.fit_intercept = bool(fit_intercept)
        self.compute_inference = bool(compute_inference)
        self.cov_type = str(cov_type)
        self.gpu_memory_cleanup = bool(gpu_memory_cleanup)
        self.gpu_cv_mixed_precision = bool(gpu_cv_mixed_precision)

        # Fitted state; populated by fit().
        self.alpha_ = None
        self.alphas_ = None
        self.cv_results_ = None
        self.mean_mse_ = None
        self.best_score_ = None
        self.coef_ = None
        self.intercept_ = None
        self.n_iter_ = None
        self.estimator_ = None

    def fit(self, X, y, sample_weight=None):
        """
        Fit Ridge regression with cross-validation to select alpha.

        Parameters
        ----------
        X : array-like
            Training data (n_samples, n_features).
        y : array-like
            Target values.
        sample_weight : array-like or None
            Sample weights. Validated against the number of samples and
            used both during CV and in the final fit.

        Returns
        -------
        self : RidgeCV
            Fitted estimator.
        """
        from statgpu.cross_validation._base import validate_cv_sample_weight
        n_samples = int(X.shape[0]) if hasattr(X, 'shape') else len(X)
        sample_weight = validate_cv_sample_weight(sample_weight, n_samples)

        device_name = self._get_compute_device().value

        # Run CV to select alpha
        details = _select_ridge_alpha_cv(
            X,
            y,
            alphas=self.alphas,
            n_alphas=self.n_alphas,
            alpha_min_ratio=self.alpha_min_ratio,
            cv_folds=self.cv,
            cv_splits=self.cv_splits,
            random_state=self.random_state,
            sample_weight=sample_weight,
            fit_intercept=self.fit_intercept,
            device=device_name,
            gpu_cv_mixed_precision=self.gpu_cv_mixed_precision,
            return_details=True,
        )

        # Store CV results
        self.alpha_ = float(details["alpha"])
        self.alphas_ = np.asarray(details["alphas"], dtype=np.float64)
        mse_path = np.asarray(details["mse_path"], dtype=np.float64)
        mean_mse = np.asarray(details["mean_mse"], dtype=np.float64)

        # Expose both arrays through cv_results_, as the class docstring
        # promises; mean_mse_ is kept as a direct attribute as well.
        self.cv_results_ = {"mse_path": mse_path, "mean_mse": mean_mse}
        self.mean_mse_ = mean_mse

        if np.any(np.isfinite(mean_mse)):
            # sklearn convention: best_score_ is negative MSE (higher is better)
            self.best_score_ = -float(np.nanmin(mean_mse))
        else:
            self.best_score_ = np.nan

        # Fit final model with selected alpha.
        # Exact solve uses n*alpha on unnormalized X'X, matching the
        # per-sample convention (loss/n + alpha*||w||^2) used by all paths.
        # alpha_ stores the CV-selected value; pass it directly to Ridge.
        estimator = Ridge(
            alpha=self.alpha_,
            fit_intercept=self.fit_intercept,
            device=self.device,
            n_jobs=self.n_jobs,
            compute_inference=self.compute_inference,
            cov_type=self.cov_type,
            gpu_memory_cleanup=self.gpu_memory_cleanup,
        )

        estimator.fit(X, y, sample_weight=sample_weight)

        self.estimator_ = estimator
        self.coef_ = np.asarray(estimator.coef_)
        self.intercept_ = estimator.intercept_
        # Not every Ridge solve is iterative, so n_iter_ may be absent.
        self.n_iter_ = getattr(estimator, 'n_iter_', None)

        self._fitted = True
        return self

    def predict(self, X):
        """
        Predict using the fitted Ridge model.

        Parameters
        ----------
        X : array-like
            Samples to predict for; passed straight to the final
            estimator's ``predict``.

        Returns
        -------
        Predictions as returned by ``self.estimator_.predict(X)``.
        """
        self._check_is_fitted()
        return self.estimator_.predict(X)