statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. statgpu/__init__.py +174 -0
  2. statgpu/_base.py +544 -0
  3. statgpu/_config.py +127 -0
  4. statgpu/anova/__init__.py +5 -0
  5. statgpu/anova/_oneway.py +194 -0
  6. statgpu/backends/__init__.py +83 -0
  7. statgpu/backends/_array_ops.py +529 -0
  8. statgpu/backends/_base.py +184 -0
  9. statgpu/backends/_cupy.py +453 -0
  10. statgpu/backends/_factory.py +65 -0
  11. statgpu/backends/_gpu_inference_cupy.py +214 -0
  12. statgpu/backends/_gpu_inference_torch.py +422 -0
  13. statgpu/backends/_numpy.py +324 -0
  14. statgpu/backends/_torch.py +685 -0
  15. statgpu/backends/_torch_safe.py +47 -0
  16. statgpu/backends/_utils.py +423 -0
  17. statgpu/core/__init__.py +10 -0
  18. statgpu/core/formula/__init__.py +33 -0
  19. statgpu/core/formula/_design.py +99 -0
  20. statgpu/core/formula/_parser.py +191 -0
  21. statgpu/core/formula/_terms.py +70 -0
  22. statgpu/core/formula/tests/__init__.py +0 -0
  23. statgpu/core/formula/tests/test_parser.py +194 -0
  24. statgpu/covariance/__init__.py +6 -0
  25. statgpu/covariance/_empirical.py +310 -0
  26. statgpu/covariance/_shrinkage.py +248 -0
  27. statgpu/cross_validation/__init__.py +31 -0
  28. statgpu/cross_validation/_base.py +410 -0
  29. statgpu/cross_validation/_engine.py +167 -0
  30. statgpu/diagnostics/__init__.py +7 -0
  31. statgpu/diagnostics/_regression_diagnostics.py +188 -0
  32. statgpu/feature_selection/__init__.py +24 -0
  33. statgpu/feature_selection/_knockoff.py +870 -0
  34. statgpu/feature_selection/_knockoff_utils.py +1003 -0
  35. statgpu/feature_selection/_stepwise.py +300 -0
  36. statgpu/glm_core/__init__.py +81 -0
  37. statgpu/glm_core/_base.py +202 -0
  38. statgpu/glm_core/_family.py +362 -0
  39. statgpu/glm_core/_fused.py +149 -0
  40. statgpu/glm_core/_gamma.py +111 -0
  41. statgpu/glm_core/_inverse_gaussian.py +62 -0
  42. statgpu/glm_core/_irls.py +561 -0
  43. statgpu/glm_core/_logistic.py +82 -0
  44. statgpu/glm_core/_negative_binomial.py +68 -0
  45. statgpu/glm_core/_poisson.py +60 -0
  46. statgpu/glm_core/_solver_legacy.py +100 -0
  47. statgpu/glm_core/_squared.py +53 -0
  48. statgpu/glm_core/_tweedie.py +74 -0
  49. statgpu/inference/__init__.py +239 -0
  50. statgpu/inference/_distributions_backend.py +2610 -0
  51. statgpu/inference/_multiple_testing.py +391 -0
  52. statgpu/inference/_resampling.py +1400 -0
  53. statgpu/inference/_results.py +265 -0
  54. statgpu/linear_model/__init__.py +75 -0
  55. statgpu/linear_model/_gaussian_inference.py +306 -0
  56. statgpu/linear_model/_glm_base.py +1261 -0
  57. statgpu/linear_model/_ordered_logit.py +52 -0
  58. statgpu/linear_model/_ordered_probit.py +50 -0
  59. statgpu/linear_model/_stats.py +170 -0
  60. statgpu/linear_model/cv/__init__.py +13 -0
  61. statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
  62. statgpu/linear_model/cv/_lasso_cv.py +253 -0
  63. statgpu/linear_model/cv/_logistic_cv.py +895 -0
  64. statgpu/linear_model/cv/_ridge_cv.py +1160 -0
  65. statgpu/linear_model/legacy/__init__.py +1 -0
  66. statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
  67. statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
  68. statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
  69. statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
  70. statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
  71. statgpu/linear_model/legacy/_solver_legacy.py +104 -0
  72. statgpu/linear_model/penalized/__init__.py +25 -0
  73. statgpu/linear_model/penalized/_base.py +437 -0
  74. statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
  75. statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
  76. statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
  77. statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
  78. statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
  79. statgpu/linear_model/penalized/_penalized_linear.py +236 -0
  80. statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
  81. statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
  82. statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
  83. statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
  84. statgpu/linear_model/penalized/_predict_mixin.py +182 -0
  85. statgpu/linear_model/wrappers/__init__.py +31 -0
  86. statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
  87. statgpu/linear_model/wrappers/_elasticnet.py +75 -0
  88. statgpu/linear_model/wrappers/_gamma.py +67 -0
  89. statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
  90. statgpu/linear_model/wrappers/_lasso.py +2124 -0
  91. statgpu/linear_model/wrappers/_linear.py +1127 -0
  92. statgpu/linear_model/wrappers/_logistic.py +1435 -0
  93. statgpu/linear_model/wrappers/_mcp.py +58 -0
  94. statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
  95. statgpu/linear_model/wrappers/_poisson.py +48 -0
  96. statgpu/linear_model/wrappers/_ridge.py +166 -0
  97. statgpu/linear_model/wrappers/_scad.py +58 -0
  98. statgpu/linear_model/wrappers/_tweedie.py +57 -0
  99. statgpu/metrics/__init__.py +21 -0
  100. statgpu/metrics/_classification.py +591 -0
  101. statgpu/nonparametric/__init__.py +50 -0
  102. statgpu/nonparametric/kernel_methods/__init__.py +25 -0
  103. statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
  104. statgpu/nonparametric/kernel_methods/_krr.py +234 -0
  105. statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
  106. statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
  107. statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
  108. statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
  109. statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
  110. statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
  111. statgpu/nonparametric/splines/__init__.py +5 -0
  112. statgpu/nonparametric/splines/_bspline_basis.py +336 -0
  113. statgpu/nonparametric/splines/_penalized.py +349 -0
  114. statgpu/panel/__init__.py +19 -0
  115. statgpu/panel/_covariance.py +140 -0
  116. statgpu/panel/_fixed_effects.py +420 -0
  117. statgpu/panel/_random_effects.py +385 -0
  118. statgpu/panel/_utils.py +482 -0
  119. statgpu/penalties/__init__.py +139 -0
  120. statgpu/penalties/_adaptive_l1.py +313 -0
  121. statgpu/penalties/_base.py +261 -0
  122. statgpu/penalties/_categories.py +39 -0
  123. statgpu/penalties/_elasticnet.py +98 -0
  124. statgpu/penalties/_group_lasso.py +678 -0
  125. statgpu/penalties/_group_mcp.py +553 -0
  126. statgpu/penalties/_group_scad.py +605 -0
  127. statgpu/penalties/_l1.py +107 -0
  128. statgpu/penalties/_l2.py +77 -0
  129. statgpu/penalties/_mcp.py +237 -0
  130. statgpu/penalties/_scad.py +260 -0
  131. statgpu/semiparametric/__init__.py +5 -0
  132. statgpu/semiparametric/_gam.py +401 -0
  133. statgpu/solvers/__init__.py +24 -0
  134. statgpu/solvers/_admm.py +241 -0
  135. statgpu/solvers/_constants.py +15 -0
  136. statgpu/solvers/_convergence.py +6 -0
  137. statgpu/solvers/_fista.py +436 -0
  138. statgpu/solvers/_fista_bb.py +513 -0
  139. statgpu/solvers/_fista_lla.py +541 -0
  140. statgpu/solvers/_lbfgs.py +206 -0
  141. statgpu/solvers/_newton.py +149 -0
  142. statgpu/solvers/_utils.py +277 -0
  143. statgpu/survival/__init__.py +14 -0
  144. statgpu/survival/_cox.py +3974 -0
  145. statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
  146. statgpu/survival/_cox_cv.py +1159 -0
  147. statgpu/survival/_cox_efron_cuda.py +1280 -0
  148. statgpu/survival/_cox_efron_triton.py +359 -0
  149. statgpu/unsupervised/__init__.py +29 -0
  150. statgpu/unsupervised/_agglomerative.py +307 -0
  151. statgpu/unsupervised/_dbscan.py +263 -0
  152. statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
  153. statgpu/unsupervised/_gmm.py +332 -0
  154. statgpu/unsupervised/_incremental_pca.py +176 -0
  155. statgpu/unsupervised/_kmeans.py +261 -0
  156. statgpu/unsupervised/_minibatch_kmeans.py +299 -0
  157. statgpu/unsupervised/_minibatch_nmf.py +252 -0
  158. statgpu/unsupervised/_nmf.py +190 -0
  159. statgpu/unsupervised/_pca.py +189 -0
  160. statgpu/unsupervised/_truncated_svd.py +132 -0
  161. statgpu/unsupervised/_tsne.py +192 -0
  162. statgpu/unsupervised/_umap.py +224 -0
  163. statgpu/unsupervised/_utils.py +134 -0
  164. statgpu-0.1.0.dist-info/METADATA +245 -0
  165. statgpu-0.1.0.dist-info/RECORD +168 -0
  166. statgpu-0.1.0.dist-info/WHEEL +5 -0
  167. statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
  168. statgpu-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,252 @@
1
+ """Mini-batch non-negative matrix factorization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._utils import backend_random_normal, check_2d_array, reject_sparse, scalar_to_float
12
+
13
+
14
class MiniBatchNMF(BaseEstimator):
    """Dense mini-batch NMF with multiplicative updates and Frobenius loss.

    A non-negative matrix ``X`` of shape ``(n_samples, n_features)`` is
    approximated by ``W @ H``.  ``H`` is stored as ``components_`` with shape
    ``(n_components, n_features)``; ``W`` has shape
    ``(n_samples, n_components)`` and is re-estimated per batch.

    Parameters
    ----------
    n_components : int or None, default=None
        Number of components.  ``None`` resolves to
        ``min(n_samples, n_features)`` of the data seen by the first
        ``fit`` / ``partial_fit`` call.
    init : {'random'}, default='random'
        Initialisation scheme.  Only ``'random'`` is implemented.
    batch_size : int, default=1024
        Rows per mini-batch in :meth:`fit` (capped at ``n_samples``).
    max_iter : int, default=200
        Maximum number of passes (epochs) over the data in :meth:`fit`.  It
        also bounds the number of update steps used by :meth:`transform`.
    tol : float, default=1e-4
        :meth:`fit` stops once the relative Frobenius-norm change of
        ``components_`` between two epochs is at most ``tol``.  The check is
        only applied from the second epoch onwards.
    random_state : int or None, default=None
        Seed forwarded to ``backend_random_normal`` for initialisation.
    device, n_jobs
        Forwarded unchanged to ``BaseEstimator.__init__``.

    Attributes
    ----------
    components_ : array of shape (n_components_, n_features_in_)
    n_components_ : int
    n_features_in_ : int
    n_iter_ : int
        Epochs run by :meth:`fit`, or number of :meth:`partial_fit` calls.
    reconstruction_err_ : float
        Frobenius norm of the residual for the most recent fit / batch.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        init: str = "random",
        batch_size: int = 1024,
        max_iter: int = 200,
        tol: float = 1e-4,
        random_state: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.init = init
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

    # ------------------------------------------------------------------
    # Validation helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _is_positive_int(value) -> bool:
        """Return True when *value* is a Python/NumPy integer that is >= 1."""
        return isinstance(value, (int, np.integer)) and int(value) >= 1

    def _validate_params(self, n_samples: int, n_features: int):
        """Validate the hyper-parameters and return the resolved component count.

        Raises
        ------
        ValueError
            For a non-positive / non-integer ``n_components``, ``batch_size``
            or ``max_iter``, or a negative ``tol``.
        NotImplementedError
            When ``init`` is anything other than ``'random'``.
        """
        if self.n_components is None:
            n_components = min(n_samples, n_features)
        elif self._is_positive_int(self.n_components):
            n_components = int(self.n_components)
        else:
            raise ValueError("n_components must be None or a positive integer")
        if self.init != "random":
            raise NotImplementedError("MiniBatchNMF v1 only supports init='random'")
        if not self._is_positive_int(self.batch_size):
            raise ValueError("batch_size must be a positive integer")
        if not self._is_positive_int(self.max_iter):
            raise ValueError("max_iter must be a positive integer")
        if float(self.tol) < 0.0:
            raise ValueError("tol must be non-negative")
        return n_components

    def _check_nonnegative(self, backend, X):
        """Raise ``ValueError`` if ``X`` has a negative (or NaN) minimum.

        A plain ``min(X) < 0`` test is False when the minimum is NaN, which
        would let NaN input through and turn every factor into NaN.
        NOTE(review): relies on ``backend.min`` propagating NaN the way
        ``numpy.min`` does -- confirm for every backend.
        """
        smallest = scalar_to_float(backend.min(X))
        if np.isnan(smallest):
            raise ValueError("MiniBatchNMF input X must not contain NaN")
        if smallest < 0.0:
            raise ValueError("MiniBatchNMF input X must be non-negative")

    # ------------------------------------------------------------------
    # Initialisation helpers
    # ------------------------------------------------------------------
    def _init_matrix(self, backend, shape, scale, seed):
        """Return ``|N(0, scale)|`` draws of the given shape, shifted by 1e-8."""
        draws = backend_random_normal(backend, seed, size=shape, scale=scale)
        return backend.abs(draws) + 1e-8

    def _init_components(self, backend, X, n_components):
        """Randomly initialise ``H`` with scale ``sqrt(mean(X) / n_components)``."""
        mean = max(scalar_to_float(backend.mean(X)), np.finfo(np.float64).eps)
        scale = np.sqrt(mean / float(n_components))
        return self._init_matrix(backend, (n_components, X.shape[1]), scale, self.random_state)

    def _init_w(self, backend, X, seed=None):
        """Randomly initialise ``W`` for ``X``; ``seed`` defaults to ``random_state``."""
        mean = max(scalar_to_float(backend.mean(X)), np.finfo(np.float64).eps)
        scale = np.sqrt(mean / float(self.n_components_))
        if seed is None:
            seed = self.random_state
        return self._init_matrix(backend, (X.shape[0], self.n_components_), scale, seed)

    def _init_w_from_data(self, backend, X, H, eps):
        """Initialise ``W`` as ``X @ H.T`` scaled by each component's squared norm.

        The result is floored at ``eps`` so that the multiplicative updates
        never start from an exact zero.
        """
        projection = backend.matmul(X, H.T)
        row_norms = backend.reshape(backend.sum(H * H, axis=1) + eps, (1, H.shape[0]))
        return backend.maximum(projection / row_norms, eps)

    def _reset_state(self, backend, X, n_components):
        """Initialise every fitted attribute for a fresh model.

        The accumulators are created here as well, so the estimator is never
        flagged as fitted while ``_A_accum`` / ``_B_accum`` are missing.
        """
        self.n_components_ = int(n_components)
        self.n_features_in_ = int(X.shape[1])
        self.components_ = self._init_components(backend, X, n_components)
        self.n_iter_ = 0
        self._n_batches_seen_ = 0
        self._A_accum = backend.zeros(
            (self.n_components_, self.n_components_), dtype=backend.float64
        )
        self._B_accum = backend.zeros(
            (self.n_components_, self.n_features_in_), dtype=backend.float64
        )
        self._backend_name = backend.name
        self._fitted = True

    # ------------------------------------------------------------------
    # Multiplicative updates
    # ------------------------------------------------------------------
    def _update_h(self, backend, X, W, H, eps):
        """One multiplicative update of ``H`` computed directly from ``X`` and ``W``."""
        numerator = backend.matmul(W.T, X)
        denominator = backend.matmul(backend.matmul(W.T, W), H) + eps
        return H * numerator / denominator

    def _update_h_from_stats(self, backend, H, A, B, eps):
        """One multiplicative update of ``H`` from ``A = W.T @ W`` and ``B = W.T @ X``."""
        denominator = backend.matmul(A, H) + eps
        return H * B / denominator

    def _update_h_from_stats_steps(self, backend, H, A, B, eps, n_steps):
        """Apply :meth:`_update_h_from_stats` ``n_steps`` times in sequence."""
        for _ in range(int(n_steps)):
            H = self._update_h_from_stats(backend, H, A, B, eps)
        return H

    def _update_w(self, backend, X, W, H, eps):
        """One multiplicative update of ``W`` with ``H`` held fixed."""
        numerator = backend.matmul(X, H.T)
        denominator = backend.matmul(W, backend.matmul(H, H.T)) + eps
        return W * numerator / denominator

    def _fit_batch_w(self, backend, X, H, n_steps):
        """Estimate ``W`` for ``X``: data-driven init followed by ``n_steps`` updates."""
        eps = np.finfo(np.float64).eps
        W = self._init_w_from_data(backend, X, H, eps)
        for _ in range(int(n_steps)):
            W = self._update_w(backend, X, W, H, eps)
        return W

    def _batch_stats(self, backend, X, W):
        """Return the sufficient statistics ``(W.T @ W, W.T @ X)`` of a batch."""
        return backend.matmul(W.T, W), backend.matmul(W.T, X)

    # ------------------------------------------------------------------
    # Error measures
    # ------------------------------------------------------------------
    def _reconstruction_error(self, backend, X, W, H):
        """Return the Frobenius norm of ``X - W @ H`` as a Python float."""
        residual = X - backend.matmul(W, H)
        return scalar_to_float(backend.sqrt(backend.sum(residual * residual)))

    def _reconstruction_error_from_stats(self, backend, x_sq, A, B, H):
        """Return the Frobenius error from accumulated statistics.

        Evaluates ``sqrt(max(x_sq - 2 * sum(B * H) + sum((A @ H) * H), 0))``;
        the clamp at zero guards the square root against round-off.
        """
        cross = backend.sum(B * H)
        quadratic = backend.sum(backend.matmul(A, H) * H)
        value = backend.maximum(x_sq - 2.0 * cross + quadratic, 0.0)
        return scalar_to_float(backend.sqrt(value))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def partial_fit(self, X, y=None):
        """Update the model with a single batch ``X``.

        The first call initialises the model from ``X``; later calls require
        the same number of features.  Returns ``self``.

        Raises
        ------
        ValueError
            If ``X`` has a negative/NaN minimum or a mismatching feature count.
        """
        reject_sparse(X, "MiniBatchNMF")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        self._check_nonnegative(backend, X_arr)
        n_samples, n_features = X_arr.shape
        if not getattr(self, "_fitted", False):
            n_components = self._validate_params(n_samples, n_features)
            self._reset_state(backend, X_arr, n_components)
        elif n_features != self.n_features_in_:
            raise ValueError(f"X has {n_features} features, expected {self.n_features_in_}")

        eps = np.finfo(np.float64).eps
        W = self._fit_batch_w(backend, X_arr, self.components_, n_steps=3)
        A_batch, B_batch = self._batch_stats(backend, X_arr, W)
        # Statistics are accumulated over all batches seen so far.
        self._A_accum = self._A_accum + A_batch
        self._B_accum = self._B_accum + B_batch
        self._n_batches_seen_ = int(self._n_batches_seen_) + 1
        self.components_ = self._update_h_from_stats_steps(
            backend, self.components_, self._A_accum, self._B_accum, eps, n_steps=3
        )
        self.n_iter_ = int(self.n_iter_) + 1
        self.reconstruction_err_ = self._reconstruction_error(backend, X_arr, W, self.components_)
        return self

    def fit(self, X, y=None):
        """Fit the model from scratch by sweeping ``X`` in mini-batches.

        Each epoch estimates ``W`` per batch against the current components,
        sums the batch statistics, and then updates ``components_`` once from
        those epoch-level statistics.  Returns ``self``.
        """
        reject_sparse(X, "MiniBatchNMF")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        self._check_nonnegative(backend, X_arr)
        n_samples, n_features = X_arr.shape
        n_components = self._validate_params(n_samples, n_features)
        self._reset_state(backend, X_arr, n_components)

        batch_size = min(int(self.batch_size), n_samples)
        max_iter = int(self.max_iter)
        tol = float(self.tol)
        eps = np.finfo(np.float64).eps
        stats_a_shape = (self.n_components_, self.n_components_)
        stats_b_shape = (self.n_components_, self.n_features_in_)

        # max_iter >= 1 is guaranteed by _validate_params, so the loop body
        # runs at least once and A_epoch / B_epoch are always bound afterwards.
        for epoch in range(1, max_iter + 1):
            A_epoch = backend.zeros(stats_a_shape, dtype=backend.float64)
            B_epoch = backend.zeros(stats_b_shape, dtype=backend.float64)
            for start in range(0, n_samples, batch_size):
                X_batch = X_arr[start : start + batch_size]
                W_batch = self._fit_batch_w(backend, X_batch, self.components_, n_steps=3)
                A_batch, B_batch = self._batch_stats(backend, X_batch, W_batch)
                A_epoch = A_epoch + A_batch
                B_epoch = B_epoch + B_batch
                self._n_batches_seen_ = int(self._n_batches_seen_) + 1

            old_components = self.components_
            new_components = self._update_h_from_stats_steps(
                backend, old_components, A_epoch, B_epoch, eps, n_steps=3
            )
            change = backend.xp.linalg.norm(new_components - old_components)
            reference = backend.xp.linalg.norm(old_components) + eps
            delta = scalar_to_float(change / reference)
            self.components_ = new_components
            self.n_iter_ = epoch
            # The tolerance test is skipped on the first epoch.
            if epoch > 1 and delta <= tol:
                break

        self.reconstruction_err_ = self._reconstruction_error_from_stats(
            backend, backend.sum(X_arr * X_arr), A_epoch, B_epoch, self.components_
        )
        # Seed the accumulators with the final epoch so that a subsequent
        # partial_fit continues from this state.  The epoch arrays are fresh
        # locals, so no copy is needed.
        self._A_accum = A_epoch
        self._B_accum = B_epoch
        return self

    def transform(self, X):
        """Return ``W`` for ``X`` with ``components_`` held fixed.

        Runs between 100 and 300 multiplicative updates (``5 * max_iter``
        clamped to that range).
        """
        self._check_is_fitted()
        reject_sparse(X, "MiniBatchNMF")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        self._check_nonnegative(backend, X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        n_steps = max(100, min(300, int(self.max_iter) * 5))
        return self._fit_batch_w(backend, X_arr, self.components_, n_steps)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``transform(X)``."""
        return self.fit(X, y=y).transform(X)

    def inverse_transform(self, X):
        """Map activations ``X`` back to feature space as ``X @ components_``."""
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_components_:
            raise ValueError(f"X has {X_arr.shape[1]} components, expected {self.n_components_}")
        return backend.matmul(X_arr, self.components_)

    def predict(self, X):
        """Alias for :meth:`transform`."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the base-class parameters extended with this estimator's own."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "init": self.init,
                "batch_size": self.batch_size,
                "max_iter": self.max_iter,
                "tol": self.tol,
                "random_state": self.random_state,
            }
        )
        return params
@@ -0,0 +1,190 @@
1
+ """Non-negative matrix factorization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._utils import (
12
+ backend_random_normal,
13
+ check_2d_array,
14
+ draw_random_seed,
15
+ reject_sparse,
16
+ scalar_to_float,
17
+ )
18
+
19
+
20
class NMF(BaseEstimator):
    """NMF with multiplicative updates and Frobenius loss.

    A non-negative matrix ``X`` of shape ``(n_samples, n_features)`` is
    approximated by ``W @ H`` where ``H`` is stored as ``components_``.

    Parameters
    ----------
    n_components : int or None, default=None
        Number of components.  ``None`` resolves to
        ``min(n_samples, n_features)``.
    init : {'random'}, default='random'
        Only ``'random'`` is implemented.
    solver : {'mu'}, default='mu'
        Only multiplicative updates are implemented.
    beta_loss : {'frobenius'}, default='frobenius'
        Only the Frobenius loss is implemented.
    max_iter : int, default=200
        Maximum number of update sweeps in :meth:`fit`; also the number of
        ``W`` updates performed by :meth:`transform`.
    tol : float, default=1e-4
        Relative change of the reconstruction error below which :meth:`fit`
        stops.  The error is only evaluated periodically (see :meth:`fit`).
    random_state : int or None, default=None
        Seed used for the initialisation.
    device, n_jobs
        Forwarded unchanged to ``BaseEstimator.__init__``.

    Attributes
    ----------
    components_ : array of shape (n_components_, n_features_in_)
    reconstruction_err_ : float
    n_iter_ : int
    n_components_ : int
    n_features_in_ : int
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        init: str = "random",
        solver: str = "mu",
        beta_loss: str = "frobenius",
        max_iter: int = 200,
        tol: float = 1e-4,
        random_state: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.init = init
        self.solver = solver
        self.beta_loss = beta_loss
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

    @staticmethod
    def _is_positive_int(value) -> bool:
        """Return True when *value* is a Python/NumPy integer that is >= 1."""
        return isinstance(value, (int, np.integer)) and int(value) >= 1

    def _validate_params(self, n_samples: int, n_features: int):
        """Validate the hyper-parameters and return the resolved component count.

        Raises
        ------
        ValueError
            For an invalid ``n_components`` / ``max_iter`` or a negative ``tol``.
        NotImplementedError
            For any unsupported ``init``, ``solver`` or ``beta_loss``.
        """
        if self.n_components is None:
            n_components = min(n_samples, n_features)
        elif self._is_positive_int(self.n_components):
            n_components = int(self.n_components)
        else:
            raise ValueError("n_components must be None or a positive integer")
        if self.init != "random":
            raise NotImplementedError("NMF v1 only supports init='random'")
        if self.solver != "mu":
            raise NotImplementedError("NMF v1 only supports solver='mu'")
        if self.beta_loss != "frobenius":
            raise NotImplementedError("NMF v1 only supports beta_loss='frobenius'")
        if not self._is_positive_int(self.max_iter):
            raise ValueError("max_iter must be a positive integer")
        if float(self.tol) < 0.0:
            raise ValueError("tol must be non-negative")
        return n_components

    def _check_nonnegative(self, backend, X):
        """Raise ``ValueError`` if ``X`` has a negative (or NaN) minimum.

        A plain ``min(X) < 0`` test is False when the minimum is NaN, which
        would let NaN input through and turn every factor into NaN.
        NOTE(review): relies on ``backend.min`` propagating NaN the way
        ``numpy.min`` does -- confirm for every backend.
        """
        smallest = scalar_to_float(backend.min(X))
        if np.isnan(smallest):
            raise ValueError("NMF input X must not contain NaN")
        if smallest < 0.0:
            raise ValueError("NMF input X must be non-negative")

    def _init_factors(self, backend, X, n_components, seed):
        """Build the initial ``(W, H)`` pair.

        When ``X`` has at least ``n_components`` rows, ``H`` is seeded from
        that many distinct, randomly chosen rows of ``X`` (floored at machine
        epsilon and shifted by 1e-8).  Otherwise ``H`` is filled with absolute
        normal draws of scale ``sqrt(mean(X) / n_components)``.  ``W`` is then
        derived from ``X`` and ``H``.
        """
        eps = np.finfo(np.float64).eps
        rng = np.random.default_rng(draw_random_seed(seed))
        if X.shape[0] >= n_components:
            picked = rng.choice(int(X.shape[0]), size=int(n_components), replace=False)
            picked = backend.asarray(picked, dtype=backend.int64)
            H = backend.maximum(X[picked], eps) + 1e-8
        else:
            mean = max(scalar_to_float(backend.mean(X)), np.finfo(np.float64).eps)
            scale = np.sqrt(mean / float(n_components))
            draws = backend_random_normal(
                backend, seed, size=(n_components, X.shape[1]), scale=scale
            )
            H = backend.abs(draws) + 1e-8
        W = self._init_w_from_data(backend, X, H, eps)
        return W, H

    def _init_w_from_data(self, backend, X, H, eps):
        """Initialise ``W`` as ``X @ H.T`` scaled by each component's squared norm.

        The result is floored at ``eps``.
        """
        projection = backend.matmul(X, H.T)
        row_norms = backend.reshape(backend.sum(H * H, axis=1) + eps, (1, H.shape[0]))
        return backend.maximum(projection / row_norms, eps)

    def _reconstruction_error(self, backend, X, W, H):
        """Return the Frobenius norm of ``X - W @ H`` as a Python float."""
        residual = X - backend.matmul(W, H)
        return scalar_to_float(backend.sqrt(backend.sum(residual * residual)))

    def _update_h(self, backend, X, W, H, eps):
        """One multiplicative update of ``H``.

        ``H`` is modified **in place** and also returned.
        """
        numerator = backend.matmul(W.T, X)
        denominator = backend.matmul(backend.matmul(W.T, W), H) + eps
        H *= numerator
        H /= denominator
        return H

    def _update_w(self, backend, X, W, H, eps):
        """One multiplicative update of ``W``.

        ``W`` is modified **in place** and also returned.
        """
        numerator = backend.matmul(X, H.T)
        denominator = backend.matmul(W, backend.matmul(H, H.T)) + eps
        W *= numerator
        W /= denominator
        return W

    def fit(self, X, y=None):
        """Fit the factorisation to ``X`` and return ``self``.

        ``W`` and ``H`` are updated alternately.  The reconstruction error is
        evaluated every ``error_check_interval`` sweeps and on the final
        sweep; fitting stops early when its relative change between two
        consecutive evaluations is at most ``tol``.
        """
        reject_sparse(X, "NMF")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        self._check_nonnegative(backend, X_arr)
        n_samples, n_features = X_arr.shape
        n_components = self._validate_params(n_samples, n_features)

        W, H = self._init_factors(backend, X_arr, n_components, self.random_state)
        eps = np.finfo(np.float64).eps
        max_iter = int(self.max_iter)
        tol = float(self.tol)
        if backend.name == "numpy":
            error_check_interval = 10
        else:
            # GPU/torch backends check less frequently than CPU to reduce
            # host-sync overhead while still preserving tol-based early stopping.
            error_check_interval = max(1, min(25, max_iter // 5))

        previous_error = None
        # max_iter >= 1 is guaranteed by _validate_params, and the final sweep
        # always evaluates the error, so `error` and `n_iter` are bound below.
        for n_iter in range(1, max_iter + 1):
            W = self._update_w(backend, X_arr, W, H, eps)
            H = self._update_h(backend, X_arr, W, H, eps)
            if n_iter % error_check_interval != 0 and n_iter != max_iter:
                continue
            error = self._reconstruction_error(backend, X_arr, W, H)
            if previous_error is not None:
                relative_change = abs(previous_error - error) / max(previous_error, eps)
                if relative_change <= tol:
                    break
            previous_error = error

        self.components_ = H
        self._fit_W = W
        self.reconstruction_err_ = float(error)
        self.n_iter_ = int(n_iter)
        self.n_components_ = int(n_components)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def transform(self, X):
        """Return ``W`` for ``X`` with ``components_`` held fixed.

        Runs exactly ``max_iter`` multiplicative ``W`` updates.
        """
        self._check_is_fitted()
        reject_sparse(X, "NMF")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        self._check_nonnegative(backend, X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        eps = np.finfo(np.float64).eps
        W = self._init_w_from_data(backend, X_arr, self.components_, eps)
        for _ in range(int(self.max_iter)):
            W = self._update_w(backend, X_arr, W, self.components_, eps)
        return W

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the ``W`` obtained during fitting.

        This is the training-time ``W``, not a fresh ``transform(X)`` solve.
        """
        return self.fit(X, y=y)._fit_W

    def inverse_transform(self, X):
        """Map activations ``X`` back to feature space as ``X @ components_``."""
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_components_:
            raise ValueError(f"X has {X_arr.shape[1]} components, expected {self.n_components_}")
        return backend.matmul(X_arr, self.components_)

    def predict(self, X):
        """Alias for :meth:`transform`."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the base-class parameters extended with this estimator's own."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "init": self.init,
                "solver": self.solver,
                "beta_loss": self.beta_loss,
                "max_iter": self.max_iter,
                "tol": self.tol,
                "random_state": self.random_state,
            }
        )
        return params
@@ -0,0 +1,189 @@
1
+ """Principal component analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._utils import backend_random_normal, check_2d_array, scalar_to_float, svd_flip_components
12
+
13
+
14
class PCA(BaseEstimator):
    """
    Principal component analysis with NumPy, CuPy, or Torch backends.

    Parameters
    ----------
    n_components : int or None, default=None
        Number of components to keep. ``None`` keeps all components.
    svd_solver : {'auto', 'full', 'covariance', 'randomized'}, default='auto'
        Solver used for the decomposition. ``'auto'`` uses covariance/eigh
        when ``n_samples >= n_features`` and full SVD otherwise. ``'randomized'``
        computes an approximate truncated SVD and is useful when only a small
        number of components is needed.
    whiten : bool, default=False
        When True, scale transformed components to unit variance.
    copy : bool, default=True
        Kept for sklearn-style API compatibility. Inputs are not modified.
    random_state : int or None, default=None
        Seed for the random test matrix of the ``'randomized'`` solver.
    n_oversamples : int, default=10
        Extra random directions sampled by the ``'randomized'`` solver on top
        of ``n_components`` (the total is capped at ``n_features``).
    iterated_power : int, default=2
        Number of power iterations run by the ``'randomized'`` solver.
    device : {'auto', 'cpu', 'cuda', 'torch'}, default='auto'
        Compute device.
    n_jobs : int or None, default=None
        Forwarded unchanged to ``BaseEstimator.__init__``.

    Attributes
    ----------
    components_ : array of shape (n_components_, n_features_in_)
    mean_ : array of shape (n_features_in_,)
    explained_variance_ : array of shape (n_components_,)
    explained_variance_ratio_ : array of shape (n_components_,)
    singular_values_ : array of shape (n_components_,)
    n_components_ : int
    n_features_in_ : int
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        svd_solver: str = "auto",
        whiten: bool = False,
        copy: bool = True,
        random_state: Optional[int] = None,
        n_oversamples: int = 10,
        iterated_power: int = 2,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.svd_solver = svd_solver
        self.whiten = whiten
        self.copy = copy
        self.random_state = random_state
        self.n_oversamples = n_oversamples
        self.iterated_power = iterated_power

    @staticmethod
    def _is_nonnegative_int(value) -> bool:
        """Return True when *value* is a Python/NumPy integer that is >= 0."""
        return isinstance(value, (int, np.integer)) and int(value) >= 0

    def _validate_params(self, n_samples: int, n_features: int):
        """Validate the hyper-parameters for the given data shape.

        Returns
        -------
        (n_components, solver)
            The resolved component count and the concrete solver name
            (``'auto'`` is replaced by ``'covariance'`` or ``'full'``).

        Raises
        ------
        ValueError
            For an unknown solver, a negative / non-integer ``n_oversamples``
            or ``iterated_power``, or an ``n_components`` outside
            ``[1, min(n_samples, n_features)]``.
        """
        if self.svd_solver not in ("auto", "full", "covariance", "randomized"):
            raise ValueError("svd_solver must be one of: 'auto', 'full', 'covariance', 'randomized'")
        if not self._is_nonnegative_int(self.n_oversamples):
            raise ValueError("n_oversamples must be a non-negative integer")
        if not self._is_nonnegative_int(self.iterated_power):
            raise ValueError("iterated_power must be a non-negative integer")

        max_components = min(n_samples, n_features)
        if self.n_components is None:
            n_components = max_components
        else:
            if not isinstance(self.n_components, (int, np.integer)):
                raise ValueError("n_components must be None or a positive integer")
            n_components = int(self.n_components)
            if not 1 <= n_components <= max_components:
                raise ValueError(
                    f"n_components must be in [1, {max_components}] for the input shape"
                )

        solver = self.svd_solver
        if solver == "auto":
            solver = "covariance" if n_samples >= n_features else "full"
        return n_components, solver

    def _randomized_svd(self, backend, X_centered, n_components: int):
        """Approximate the leading singular values / right singular vectors.

        A random projection of ``X_centered`` is orthonormalised via QR,
        refined with ``iterated_power`` rounds of multiplication by
        ``X_centered.T`` and ``X_centered`` (re-orthonormalising after each
        product), and the SVD of the projected matrix ``Q.T @ X_centered`` is
        truncated to ``n_components``.

        Returns
        -------
        (singular_values, components)
            ``components`` has been passed through ``svd_flip_components``.
        """
        _n_samples, n_features = X_centered.shape
        sketch_width = min(n_features, n_components + int(self.n_oversamples))
        omega = backend_random_normal(
            backend, self.random_state, size=(n_features, sketch_width)
        )
        Q, _ = backend.qr(backend.matmul(X_centered, omega))
        for _power_step in range(int(self.iterated_power)):
            Q, _ = backend.qr(backend.matmul(X_centered.T, Q))
            Q, _ = backend.qr(backend.matmul(X_centered, Q))
        projected = backend.matmul(Q.T, X_centered)
        _, singular_values_all, vh = backend.svd(projected, full_matrices=False)
        return singular_values_all[:n_components], svd_flip_components(backend, vh[:n_components])

    def _whitening_scale(self, backend):
        """Return ``sqrt(explained_variance_)`` floored at machine epsilon.

        The covariance solver clamps eigenvalues at exactly 0.0, so without
        the floor a zero-variance component would make the whitening division
        produce inf/NaN.
        """
        floor = np.finfo(np.float64).eps
        return backend.maximum(backend.sqrt(self.explained_variance_), floor)

    def fit(self, X, y=None):
        """Fit the principal components of ``X`` and return ``self``.

        Raises
        ------
        ValueError
            If ``X`` has fewer than two samples or the parameters are invalid.
        """
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        if n_samples < 2:
            raise ValueError("PCA requires at least two samples")

        n_components, solver = self._validate_params(n_samples, n_features)
        mean = backend.mean(X_arr, axis=0, keepdims=False)
        dof = float(n_samples - 1)

        if solver == "covariance":
            # Covariance from the uncentered Gram matrix:
            # (X.T @ X - n * mean @ mean.T) / (n - 1); avoids a centered copy of X.
            gram = backend.matmul(X_arr.T, X_arr)
            mean_col = backend.reshape(mean, (n_features, 1))
            cov = (gram - float(n_samples) * backend.matmul(mean_col, mean_col.T)) / dof
            eigenvalues, eigenvectors = backend.eigh(cov)
            # Reorder to descending eigenvalue.
            order = backend.flip(backend.argsort(eigenvalues, axis=0), axis=0)
            eigenvalues = eigenvalues[order]
            eigenvectors = eigenvectors[:, order]
            # Round-off can push tiny eigenvalues below zero; clamp them.
            explained_variance = backend.maximum(eigenvalues[:n_components], 0.0)
            components = svd_flip_components(backend, eigenvectors[:, :n_components].T)
            singular_values = backend.sqrt(explained_variance * dof)
            total_var = backend.sum(backend.diag(cov))
        else:
            X_centered = X_arr - mean
            if solver == "randomized":
                singular_values, components = self._randomized_svd(
                    backend, X_centered, n_components
                )
            else:
                _, singular_values_all, vh = backend.svd(X_centered, full_matrices=False)
                components = svd_flip_components(backend, vh[:n_components])
                singular_values = singular_values_all[:n_components]
            explained_variance = (singular_values ** 2) / dof
            total_var = backend.sum(X_centered * X_centered) / dof

        if scalar_to_float(total_var) > 0.0:
            explained_variance_ratio = explained_variance / total_var
        else:
            # Constant data: report a zero ratio instead of dividing by zero.
            explained_variance_ratio = explained_variance * 0.0

        self.components_ = components
        self.mean_ = mean
        self.explained_variance_ = explained_variance
        self.explained_variance_ratio_ = explained_variance_ratio
        self.singular_values_ = singular_values
        self.n_components_ = int(n_components)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def transform(self, X):
        """Project ``X`` onto the fitted components.

        With ``whiten=True`` each projected column is divided by the square
        root of its explained variance (floored at machine epsilon).

        Raises
        ------
        ValueError
            If the feature count differs from the one seen in :meth:`fit`.
        """
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        X_transformed = backend.matmul(X_arr - self.mean_, self.components_.T)
        if self.whiten:
            X_transformed = X_transformed / self._whitening_scale(backend)
        return X_transformed

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``transform(X)``."""
        return self.fit(X, y=y).transform(X)

    def inverse_transform(self, X):
        """Map component scores ``X`` back to the original feature space.

        Undoes the whitening scale (when enabled) with the same floored scale
        used by :meth:`transform`, then applies ``X @ components_ + mean_``.

        Raises
        ------
        ValueError
            If ``X`` is not 2D or its column count differs from
            ``n_components_``.
        """
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        if getattr(X_arr, "ndim", None) != 2:
            raise ValueError("X must be a 2D array")
        if X_arr.shape[1] != self.n_components_:
            raise ValueError(f"X has {X_arr.shape[1]} components, expected {self.n_components_}")
        if self.whiten:
            X_arr = X_arr * self._whitening_scale(backend)
        return backend.matmul(X_arr, self.components_) + self.mean_

    def predict(self, X):
        """Alias for transform, provided for BaseEstimator compatibility."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the base-class parameters extended with this estimator's own."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "svd_solver": self.svd_solver,
                "whiten": self.whiten,
                "copy": self.copy,
                "random_state": self.random_state,
                "n_oversamples": self.n_oversamples,
                "iterated_power": self.iterated_power,
            }
        )
        return params