statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. statgpu/__init__.py +174 -0
  2. statgpu/_base.py +544 -0
  3. statgpu/_config.py +127 -0
  4. statgpu/anova/__init__.py +5 -0
  5. statgpu/anova/_oneway.py +194 -0
  6. statgpu/backends/__init__.py +83 -0
  7. statgpu/backends/_array_ops.py +529 -0
  8. statgpu/backends/_base.py +184 -0
  9. statgpu/backends/_cupy.py +453 -0
  10. statgpu/backends/_factory.py +65 -0
  11. statgpu/backends/_gpu_inference_cupy.py +214 -0
  12. statgpu/backends/_gpu_inference_torch.py +422 -0
  13. statgpu/backends/_numpy.py +324 -0
  14. statgpu/backends/_torch.py +685 -0
  15. statgpu/backends/_torch_safe.py +47 -0
  16. statgpu/backends/_utils.py +423 -0
  17. statgpu/core/__init__.py +10 -0
  18. statgpu/core/formula/__init__.py +33 -0
  19. statgpu/core/formula/_design.py +99 -0
  20. statgpu/core/formula/_parser.py +191 -0
  21. statgpu/core/formula/_terms.py +70 -0
  22. statgpu/core/formula/tests/__init__.py +0 -0
  23. statgpu/core/formula/tests/test_parser.py +194 -0
  24. statgpu/covariance/__init__.py +6 -0
  25. statgpu/covariance/_empirical.py +310 -0
  26. statgpu/covariance/_shrinkage.py +248 -0
  27. statgpu/cross_validation/__init__.py +31 -0
  28. statgpu/cross_validation/_base.py +410 -0
  29. statgpu/cross_validation/_engine.py +167 -0
  30. statgpu/diagnostics/__init__.py +7 -0
  31. statgpu/diagnostics/_regression_diagnostics.py +188 -0
  32. statgpu/feature_selection/__init__.py +24 -0
  33. statgpu/feature_selection/_knockoff.py +870 -0
  34. statgpu/feature_selection/_knockoff_utils.py +1003 -0
  35. statgpu/feature_selection/_stepwise.py +300 -0
  36. statgpu/glm_core/__init__.py +81 -0
  37. statgpu/glm_core/_base.py +202 -0
  38. statgpu/glm_core/_family.py +362 -0
  39. statgpu/glm_core/_fused.py +149 -0
  40. statgpu/glm_core/_gamma.py +111 -0
  41. statgpu/glm_core/_inverse_gaussian.py +62 -0
  42. statgpu/glm_core/_irls.py +561 -0
  43. statgpu/glm_core/_logistic.py +82 -0
  44. statgpu/glm_core/_negative_binomial.py +68 -0
  45. statgpu/glm_core/_poisson.py +60 -0
  46. statgpu/glm_core/_solver_legacy.py +100 -0
  47. statgpu/glm_core/_squared.py +53 -0
  48. statgpu/glm_core/_tweedie.py +74 -0
  49. statgpu/inference/__init__.py +239 -0
  50. statgpu/inference/_distributions_backend.py +2610 -0
  51. statgpu/inference/_multiple_testing.py +391 -0
  52. statgpu/inference/_resampling.py +1400 -0
  53. statgpu/inference/_results.py +265 -0
  54. statgpu/linear_model/__init__.py +75 -0
  55. statgpu/linear_model/_gaussian_inference.py +306 -0
  56. statgpu/linear_model/_glm_base.py +1261 -0
  57. statgpu/linear_model/_ordered_logit.py +52 -0
  58. statgpu/linear_model/_ordered_probit.py +50 -0
  59. statgpu/linear_model/_stats.py +170 -0
  60. statgpu/linear_model/cv/__init__.py +13 -0
  61. statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
  62. statgpu/linear_model/cv/_lasso_cv.py +253 -0
  63. statgpu/linear_model/cv/_logistic_cv.py +895 -0
  64. statgpu/linear_model/cv/_ridge_cv.py +1160 -0
  65. statgpu/linear_model/legacy/__init__.py +1 -0
  66. statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
  67. statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
  68. statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
  69. statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
  70. statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
  71. statgpu/linear_model/legacy/_solver_legacy.py +104 -0
  72. statgpu/linear_model/penalized/__init__.py +25 -0
  73. statgpu/linear_model/penalized/_base.py +437 -0
  74. statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
  75. statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
  76. statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
  77. statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
  78. statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
  79. statgpu/linear_model/penalized/_penalized_linear.py +236 -0
  80. statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
  81. statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
  82. statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
  83. statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
  84. statgpu/linear_model/penalized/_predict_mixin.py +182 -0
  85. statgpu/linear_model/wrappers/__init__.py +31 -0
  86. statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
  87. statgpu/linear_model/wrappers/_elasticnet.py +75 -0
  88. statgpu/linear_model/wrappers/_gamma.py +67 -0
  89. statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
  90. statgpu/linear_model/wrappers/_lasso.py +2124 -0
  91. statgpu/linear_model/wrappers/_linear.py +1127 -0
  92. statgpu/linear_model/wrappers/_logistic.py +1435 -0
  93. statgpu/linear_model/wrappers/_mcp.py +58 -0
  94. statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
  95. statgpu/linear_model/wrappers/_poisson.py +48 -0
  96. statgpu/linear_model/wrappers/_ridge.py +166 -0
  97. statgpu/linear_model/wrappers/_scad.py +58 -0
  98. statgpu/linear_model/wrappers/_tweedie.py +57 -0
  99. statgpu/metrics/__init__.py +21 -0
  100. statgpu/metrics/_classification.py +591 -0
  101. statgpu/nonparametric/__init__.py +50 -0
  102. statgpu/nonparametric/kernel_methods/__init__.py +25 -0
  103. statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
  104. statgpu/nonparametric/kernel_methods/_krr.py +234 -0
  105. statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
  106. statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
  107. statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
  108. statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
  109. statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
  110. statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
  111. statgpu/nonparametric/splines/__init__.py +5 -0
  112. statgpu/nonparametric/splines/_bspline_basis.py +336 -0
  113. statgpu/nonparametric/splines/_penalized.py +349 -0
  114. statgpu/panel/__init__.py +19 -0
  115. statgpu/panel/_covariance.py +140 -0
  116. statgpu/panel/_fixed_effects.py +420 -0
  117. statgpu/panel/_random_effects.py +385 -0
  118. statgpu/panel/_utils.py +482 -0
  119. statgpu/penalties/__init__.py +139 -0
  120. statgpu/penalties/_adaptive_l1.py +313 -0
  121. statgpu/penalties/_base.py +261 -0
  122. statgpu/penalties/_categories.py +39 -0
  123. statgpu/penalties/_elasticnet.py +98 -0
  124. statgpu/penalties/_group_lasso.py +678 -0
  125. statgpu/penalties/_group_mcp.py +553 -0
  126. statgpu/penalties/_group_scad.py +605 -0
  127. statgpu/penalties/_l1.py +107 -0
  128. statgpu/penalties/_l2.py +77 -0
  129. statgpu/penalties/_mcp.py +237 -0
  130. statgpu/penalties/_scad.py +260 -0
  131. statgpu/semiparametric/__init__.py +5 -0
  132. statgpu/semiparametric/_gam.py +401 -0
  133. statgpu/solvers/__init__.py +24 -0
  134. statgpu/solvers/_admm.py +241 -0
  135. statgpu/solvers/_constants.py +15 -0
  136. statgpu/solvers/_convergence.py +6 -0
  137. statgpu/solvers/_fista.py +436 -0
  138. statgpu/solvers/_fista_bb.py +513 -0
  139. statgpu/solvers/_fista_lla.py +541 -0
  140. statgpu/solvers/_lbfgs.py +206 -0
  141. statgpu/solvers/_newton.py +149 -0
  142. statgpu/solvers/_utils.py +277 -0
  143. statgpu/survival/__init__.py +14 -0
  144. statgpu/survival/_cox.py +3974 -0
  145. statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
  146. statgpu/survival/_cox_cv.py +1159 -0
  147. statgpu/survival/_cox_efron_cuda.py +1280 -0
  148. statgpu/survival/_cox_efron_triton.py +359 -0
  149. statgpu/unsupervised/__init__.py +29 -0
  150. statgpu/unsupervised/_agglomerative.py +307 -0
  151. statgpu/unsupervised/_dbscan.py +263 -0
  152. statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
  153. statgpu/unsupervised/_gmm.py +332 -0
  154. statgpu/unsupervised/_incremental_pca.py +176 -0
  155. statgpu/unsupervised/_kmeans.py +261 -0
  156. statgpu/unsupervised/_minibatch_kmeans.py +299 -0
  157. statgpu/unsupervised/_minibatch_nmf.py +252 -0
  158. statgpu/unsupervised/_nmf.py +190 -0
  159. statgpu/unsupervised/_pca.py +189 -0
  160. statgpu/unsupervised/_truncated_svd.py +132 -0
  161. statgpu/unsupervised/_tsne.py +192 -0
  162. statgpu/unsupervised/_umap.py +224 -0
  163. statgpu/unsupervised/_utils.py +134 -0
  164. statgpu-0.1.0.dist-info/METADATA +245 -0
  165. statgpu-0.1.0.dist-info/RECORD +168 -0
  166. statgpu-0.1.0.dist-info/WHEEL +5 -0
  167. statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
  168. statgpu-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,332 @@
1
+ """Gaussian mixture models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._kmeans import KMeans
12
+ from statgpu.unsupervised._utils import check_2d_array, reject_sparse, scalar_to_float
13
+
14
+
15
class GaussianMixture(BaseEstimator):
    """Gaussian mixture model fitted with log-domain EM.

    Parameters
    ----------
    n_components : int
        Number of mixture components; must be in ``[1, n_samples]``.
    covariance_type : {"diag", "spherical", "tied", "full"}
        Shape of the covariance parameters.  ``"diag"`` stores one variance
        per component and feature, ``"spherical"`` one variance per
        component, ``"tied"`` a single shared matrix and ``"full"`` one
        matrix per component.
    tol : float
        EM stops once the absolute change of the mean log-likelihood between
        two consecutive iterations drops below this value.
    reg_covar : float
        Floor ("diag"/"spherical") or diagonal ridge ("tied"/"full") applied
        to the covariance estimates.
    max_iter : int
        Maximum number of EM iterations per initialisation.
    n_init : int
        Number of restarts; the run with the highest lower bound is kept.
    init_params : {"kmeans", "random"}
        How the initial means are chosen.
    random_state : int, optional
        Seed for the restart seeds.  ``None`` leaves every restart unseeded.
    device, n_jobs
        Forwarded unchanged to ``BaseEstimator.__init__``.

    Attributes set by ``fit``
    -------------------------
    weights_, means_, covariances_, precisions_cholesky_, converged_,
    n_iter_, lower_bound_, n_features_in_
    """

    def __init__(
        self,
        n_components: int = 1,
        covariance_type: str = "diag",
        tol: float = 1e-3,
        reg_covar: float = 1e-6,
        max_iter: int = 100,
        n_init: int = 1,
        init_params: str = "kmeans",
        random_state: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.random_state = random_state

    def _validate_params(self, n_samples: int):
        """Raise ``ValueError`` if any hyper-parameter is out of range."""
        if not isinstance(self.n_components, (int, np.integer)) or int(self.n_components) < 1:
            raise ValueError("n_components must be a positive integer")
        if int(self.n_components) > n_samples:
            raise ValueError("n_components must be less than or equal to n_samples")
        if self.covariance_type not in ("diag", "spherical", "tied", "full"):
            raise ValueError("covariance_type must be one of: 'diag', 'spherical', 'tied', 'full'")
        if self.init_params not in ("kmeans", "random"):
            raise ValueError("init_params must be one of: 'kmeans', 'random'")
        if float(self.tol) < 0.0:
            raise ValueError("tol must be non-negative")
        if float(self.reg_covar) < 0.0:
            raise ValueError("reg_covar must be non-negative")
        if not isinstance(self.max_iter, (int, np.integer)) or int(self.max_iter) < 1:
            raise ValueError("max_iter must be a positive integer")
        if not isinstance(self.n_init, (int, np.integer)) or int(self.n_init) < 1:
            raise ValueError("n_init must be a positive integer")

    def _linalg_inv(self, backend, matrix):
        """Invert ``matrix`` with the backend's array module."""
        return backend.xp.linalg.inv(matrix)

    def _linalg_logdet(self, backend, matrix):
        """Return ``log|det(matrix)|``; raise if the determinant sign is not positive."""
        sign, logabsdet = backend.xp.linalg.slogdet(matrix)
        if scalar_to_float(sign) <= 0.0:
            raise ValueError("covariance matrix must be positive definite")
        return logabsdet

    def _linalg_cholesky(self, backend, matrix):
        """Cholesky-factorise ``matrix`` with the backend's array module."""
        return backend.xp.linalg.cholesky(matrix)

    def _eye(self, backend, n_features: int):
        """Return a float64 identity matrix, using ``backend.eye`` when it exists."""
        if hasattr(backend, "eye"):
            return backend.eye(n_features, dtype=backend.float64)
        return backend.asarray(np.eye(n_features), dtype=backend.float64)

    def _estimate_log_gaussian_prob(self, backend, X, means, covariances, precisions_cholesky=None):
        """Return the per-sample, per-component Gaussian log-density.

        The result has one row per sample and one column per component.
        ``precisions_cholesky`` is only consulted for "tied" and "full"
        covariances and is computed from ``covariances`` when omitted.
        """
        n_features = X.shape[1]
        log_2pi = float(n_features) * np.log(2.0 * np.pi)

        if self.covariance_type == "diag":
            precisions = 1.0 / covariances
            log_det = backend.sum(backend.log(covariances), axis=1)
            # Expand (x - mu)^2 * p into x^2 p - 2 x mu p + mu^2 p so the
            # whole quadratic form is two matrix products.
            x2 = backend.matmul(X * X, precisions.T)
            cross = backend.matmul(X, (means * precisions).T)
            mean2 = backend.sum(means * means * precisions, axis=1)
            quad = x2 - 2.0 * cross + backend.expand_dims(mean2, 0)
            return -0.5 * (log_2pi + backend.expand_dims(log_det, 0) + quad)

        if self.covariance_type == "spherical":
            precisions = 1.0 / covariances
            log_det = float(n_features) * backend.log(covariances)
            diff = backend.expand_dims(X, 1) - backend.expand_dims(means, 0)
            quad = backend.sum(diff * diff, axis=2) * backend.expand_dims(precisions, 0)
            return -0.5 * (log_2pi + backend.expand_dims(log_det, 0) + quad)

        if precisions_cholesky is None:
            precisions_cholesky = self._estimate_precisions_cholesky(backend, covariances)

        log_probs = []
        if self.covariance_type == "tied":
            # With L L^T = precision, log|covariance| = -2 * sum(log diag(L)).
            log_det = -2.0 * backend.sum(backend.log(backend.diag(precisions_cholesky)))
            for k in range(int(self.n_components)):
                diff = X - means[k]
                solved = backend.matmul(diff, precisions_cholesky)
                quad = backend.sum(solved * solved, axis=1)
                log_probs.append(-0.5 * (log_2pi + log_det + quad))
            return backend.stack(log_probs, axis=1)

        # "full": one precision factor per component.
        for k in range(int(self.n_components)):
            log_det = -2.0 * backend.sum(backend.log(backend.diag(precisions_cholesky[k])))
            diff = X - means[k]
            solved = backend.matmul(diff, precisions_cholesky[k])
            quad = backend.sum(solved * solved, axis=1)
            log_probs.append(-0.5 * (log_2pi + log_det + quad))
        return backend.stack(log_probs, axis=1)

    def _estimate_weighted_log_prob(self, backend, X, weights, means, covariances, precisions_cholesky=None):
        """Return the Gaussian log-density plus the log mixing weight of each component."""
        return self._estimate_log_gaussian_prob(
            backend,
            X,
            means,
            covariances,
            precisions_cholesky=precisions_cholesky,
        ) + backend.expand_dims(backend.log(weights), 0)

    def _e_step(self, backend, X, weights, means, covariances, precisions_cholesky=None):
        """Run the expectation step.

        Parameters
        ----------
        precisions_cholesky : optional
            Pre-computed precision factors matching ``covariances``.  When
            omitted they are derived here, but only for "tied"/"full"
            covariances, the only types whose density evaluation uses them.

        Returns
        -------
        tuple
            ``(mean log-likelihood as a float, responsibilities)``.
        """
        if precisions_cholesky is None and self.covariance_type in ("tied", "full"):
            precisions_cholesky = self._estimate_precisions_cholesky(backend, covariances)
        weighted_log_prob = self._estimate_weighted_log_prob(
            backend,
            X,
            weights,
            means,
            covariances,
            precisions_cholesky=precisions_cholesky,
        )
        log_prob_norm = backend.logsumexp(weighted_log_prob, axis=1)
        log_resp = weighted_log_prob - backend.expand_dims(log_prob_norm, 1)
        return scalar_to_float(backend.mean(log_prob_norm)), backend.exp(log_resp)

    def _m_step(self, backend, X, resp):
        """Run the maximisation step and return ``(weights, means, covariances)``."""
        n_samples = X.shape[0]
        n_features = X.shape[1]
        # The small offset keeps the divisions below finite for empty components.
        nk = backend.sum(resp, axis=0) + 10.0 * np.finfo(np.float64).eps
        weights = nk / float(n_samples)
        means = backend.matmul(resp.T, X) / backend.expand_dims(nk, 1)

        if self.covariance_type in ("diag", "spherical"):
            second_moment = backend.matmul(resp.T, X * X) / backend.expand_dims(nk, 1)
            # Variances are floored at reg_covar (not shifted by it).
            diag_covariances = backend.maximum(second_moment - means * means, float(self.reg_covar))
            if self.covariance_type == "diag":
                return weights, means, diag_covariances
            spherical_covariances = backend.maximum(backend.mean(diag_covariances, axis=1), float(self.reg_covar))
            return weights, means, spherical_covariances

        eye = self._eye(backend, n_features)
        if self.covariance_type == "tied":
            covariance = backend.zeros((n_features, n_features), dtype=backend.float64)
            for k in range(int(self.n_components)):
                diff = X - means[k]
                weighted = diff * backend.expand_dims(resp[:, k], 1)
                covariance = covariance + backend.matmul(weighted.T, diff)
            covariance = covariance / float(n_samples) + float(self.reg_covar) * eye
            return weights, means, covariance

        covariances = []
        for k in range(int(self.n_components)):
            diff = X - means[k]
            weighted = diff * backend.expand_dims(resp[:, k], 1)
            covariance = backend.matmul(weighted.T, diff) / nk[k] + float(self.reg_covar) * eye
            covariances.append(covariance)
        covariances = backend.stack(covariances, axis=0)
        return weights, means, covariances

    def _initialize(self, backend, X, seed):
        """Build starting ``(weights, means, covariances)`` for one EM run.

        Means come from a short KMeans fit or from randomly drawn rows of
        ``X``; weights start uniform and every component starts from the
        global (regularised) covariance of ``X``.
        """
        n_samples, n_features = X.shape
        n_components = int(self.n_components)

        if self.init_params == "kmeans":
            km = KMeans(
                n_clusters=n_components,
                n_init=1,
                max_iter=min(50, int(self.max_iter)),
                random_state=seed,
                device=self.device,
            ).fit(X)
            means = km.cluster_centers_
        else:
            rng = np.random.default_rng(seed)
            indices = rng.choice(n_samples, size=n_components, replace=False)
            means = X[backend.asarray(indices, dtype=backend.int64)]

        weights = backend.full((n_components,), 1.0 / float(self.n_components), dtype=backend.float64)
        centered = X - backend.mean(X, axis=0)
        global_var = backend.mean(centered * centered, axis=0) + float(self.reg_covar)

        if self.covariance_type == "diag":
            covariances = backend.ones((n_components, n_features), dtype=backend.float64) * global_var
        elif self.covariance_type == "spherical":
            covariances = backend.full(
                (n_components,),
                scalar_to_float(backend.mean(global_var)),
                dtype=backend.float64,
            )
        else:
            global_covariance = backend.matmul(centered.T, centered) / float(n_samples)
            global_covariance = global_covariance + float(self.reg_covar) * self._eye(backend, n_features)
            if self.covariance_type == "tied":
                covariances = global_covariance
            else:
                covariances = backend.stack(
                    [backend.copy(global_covariance) for _ in range(n_components)],
                    axis=0,
                )
        return weights, means, covariances

    def _estimate_precisions_cholesky(self, backend, covariances):
        """Return precision factors for ``covariances``.

        For "diag"/"spherical" this is ``1 / sqrt(covariances)``; for
        "tied"/"full" it is the Cholesky factor of the inverted covariance
        matrix (stacked per component for "full").
        """
        if self.covariance_type in ("diag", "spherical"):
            return 1.0 / backend.sqrt(covariances)
        if self.covariance_type == "tied":
            return self._linalg_cholesky(backend, self._linalg_inv(backend, covariances))
        return backend.stack(
            [
                self._linalg_cholesky(backend, self._linalg_inv(backend, covariances[k]))
                for k in range(int(self.n_components))
            ],
            axis=0,
        )

    def fit(self, X, y=None):
        """Fit the mixture with EM, keeping the best of ``n_init`` restarts.

        Parameters
        ----------
        X : array-like, two-dimensional
            Training data; converted to the backend's float64 array type.
        y : ignored

        Returns
        -------
        GaussianMixture
            ``self``, with the fitted attributes populated.

        Raises
        ------
        ValueError
            If a hyper-parameter is invalid for the given data.
        """
        reject_sparse(X, "GaussianMixture")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        self._validate_params(n_samples)

        rng = np.random.default_rng(self.random_state)
        best = None
        for _ in range(int(self.n_init)):
            seed = None if self.random_state is None else int(rng.integers(0, np.iinfo(np.int32).max))
            weights, means, covariances = self._initialize(backend, X_arr, seed)
            lower_bound = -np.inf
            converged = False
            n_iter = 0
            for n_iter in range(1, int(self.max_iter) + 1):
                prev_lower_bound = lower_bound
                lower_bound, resp = self._e_step(backend, X_arr, weights, means, covariances)
                weights, means, covariances = self._m_step(backend, X_arr, resp)
                if abs(lower_bound - prev_lower_bound) < float(self.tol):
                    converged = True
                    break
            # Every comparison against NaN is False, so a degenerate first
            # restart would otherwise block all later (finite) restarts.
            replaces_nan = best is not None and np.isnan(best[0]) and not np.isnan(lower_bound)
            if best is None or replaces_nan or lower_bound > best[0]:
                best = (lower_bound, converged, n_iter, weights, means, covariances)

        lower_bound, converged, n_iter, weights, means, covariances = best
        self.weights_ = weights
        self.means_ = means
        self.covariances_ = covariances
        self.precisions_cholesky_ = self._estimate_precisions_cholesky(backend, covariances)
        self.converged_ = bool(converged)
        self.n_iter_ = int(n_iter)
        self.lower_bound_ = float(lower_bound)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def score_samples(self, X):
        """Return the log-likelihood of each row of ``X`` under the fitted model."""
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        return backend.logsumexp(
            self._estimate_weighted_log_prob(
                backend,
                X_arr,
                self.weights_,
                self.means_,
                self.covariances_,
                precisions_cholesky=self.precisions_cholesky_,
            ),
            axis=1,
        )

    def predict_proba(self, X):
        """Return the component responsibilities for each row of ``X``."""
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        # Reuse the factors stored by fit (as score_samples does) rather than
        # inverting and factorising the covariances again on every call.
        _, resp = self._e_step(
            backend,
            X_arr,
            self.weights_,
            self.means_,
            self.covariances_,
            precisions_cholesky=self.precisions_cholesky_,
        )
        return resp

    def predict(self, X):
        """Return the index of the most responsible component for each row of ``X``."""
        backend = self._get_backend()
        return backend.argmax(self.predict_proba(X), axis=1)

    def fit_predict(self, X, y=None):
        """Fit on ``X`` and return the component index of each row."""
        return self.fit(X, y=y).predict(X)

    def score(self, X, y=None):
        """Return the mean per-sample log-likelihood of ``X`` as a float."""
        backend = self._get_backend()
        return scalar_to_float(backend.mean(self.score_samples(X)))

    def _n_parameters(self):
        """Return the number of free parameters (means, covariances, weights)."""
        n_components = int(self.n_components)
        n_features = int(self.n_features_in_)
        mean_params = n_components * n_features
        weight_params = n_components - 1
        if self.covariance_type == "diag":
            covariance_params = n_components * n_features
        elif self.covariance_type == "spherical":
            covariance_params = n_components
        elif self.covariance_type == "tied":
            covariance_params = n_features * (n_features + 1) // 2
        else:
            covariance_params = n_components * n_features * (n_features + 1) // 2
        return mean_params + covariance_params + weight_params

    def bic(self, X):
        """Return the Bayesian information criterion of the model on ``X``.

        ``X`` may be any array-like accepted by ``score`` (including nested
        lists); the sample count is read with ``np.shape`` so that inputs
        without a ``.shape`` attribute work too.
        """
        n_samples = np.shape(X)[0]
        return -2.0 * float(self.score(X)) * n_samples + self._n_parameters() * np.log(n_samples)

    def aic(self, X):
        """Return the Akaike information criterion of the model on ``X``.

        ``X`` may be any array-like accepted by ``score`` (including nested
        lists); the sample count is read with ``np.shape``.
        """
        n_samples = np.shape(X)[0]
        return -2.0 * float(self.score(X)) * n_samples + 2.0 * self._n_parameters()

    def get_params(self, deep=True):
        """Return the base-class parameters extended with this estimator's own."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "covariance_type": self.covariance_type,
                "tol": self.tol,
                "reg_covar": self.reg_covar,
                "max_iter": self.max_iter,
                "n_init": self.n_init,
                "init_params": self.init_params,
                "random_state": self.random_state,
            }
        )
        return params
@@ -0,0 +1,176 @@
1
+ """Incremental principal component analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._utils import check_2d_array, reject_sparse, scalar_to_float, svd_flip_components
12
+
13
+
14
class IncrementalPCA(BaseEstimator):
    """Dense incremental PCA with NumPy, CuPy, or Torch backends.

    The decomposition is refreshed batch by batch: each ``partial_fit``
    stacks the previously retained (scaled) components, the centred new
    batch and a mean-shift correction row, then takes an SVD of the stack.

    Parameters
    ----------
    n_components : int, optional
        Number of components to keep.  ``None`` resolves to
        ``min(n_samples, n_features)`` of the first batch.
    batch_size : int, optional
        Rows per batch used by ``fit``.  ``None`` resolves to
        ``min(n_samples, max(1, 5 * n_features))``.
    whiten : bool
        Scale transformed data by ``1 / sqrt(explained_variance_)``.
    copy : bool
        Stored and reported by ``get_params``; not otherwise read in this class.
    device, n_jobs
        Forwarded unchanged to ``BaseEstimator.__init__``.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        batch_size: Optional[int] = None,
        whiten: bool = False,
        copy: bool = True,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.batch_size = batch_size
        self.whiten = whiten
        self.copy = copy

    def _validate_params(self, n_samples: int, n_features: int, first_pass: bool):
        """Check the hyper-parameters and return the resolved component count.

        Raises ``ValueError`` when ``n_components`` or ``batch_size`` is
        invalid, when more components than features are requested, or when
        the first batch has fewer rows than components.
        """
        requested = self.n_components
        if requested is not None:
            if not (isinstance(requested, (int, np.integer)) and int(requested) >= 1):
                raise ValueError("n_components must be None or a positive integer")
            resolved = int(requested)
        elif first_pass:
            resolved = min(n_samples, n_features)
        elif hasattr(self, "n_components_"):
            # Later batches reuse whatever the first batch settled on.
            resolved = int(self.n_components_)
        else:
            raise ValueError("IncrementalPCA internal state is inconsistent; refit the estimator")

        if resolved > n_features:
            raise ValueError("n_components must be less than or equal to n_features")
        if first_pass and n_samples < resolved:
            raise ValueError("first partial_fit batch must contain at least n_components samples")

        size = self.batch_size
        if size is not None and not (isinstance(size, (int, np.integer)) and int(size) >= 1):
            raise ValueError("batch_size must be None or a positive integer")
        return resolved

    def _update_mean_var(self, backend, batch, batch_mean, batch_var):
        """Merge the batch statistics into the running mean and variance.

        Returns ``(mean, variance, sample_count)``.  Before the first fit
        the batch statistics are returned unchanged.
        """
        n_batch = int(batch.shape[0])
        if not getattr(self, "_fitted", False):
            return batch_mean, batch_var, n_batch

        n_old = int(self.n_samples_seen_)
        n_total = n_old + n_batch
        prev_mean = self.mean_
        prev_var = self.var_

        merged_mean = (float(n_old) * prev_mean + float(n_batch) * batch_mean) / float(n_total)
        # Sum of squares of each part, re-expressed around the merged mean.
        prev_ss = float(n_old) * (prev_var + (prev_mean - merged_mean) ** 2)
        current_ss = float(n_batch) * (batch_var + (batch_mean - merged_mean) ** 2)
        merged_var = (prev_ss + current_ss) / float(n_total)
        return merged_mean, merged_var, n_total

    def partial_fit(self, X, y=None):
        """Update the decomposition with one batch and return ``self``.

        Raises ``ValueError`` for invalid hyper-parameters or when the
        feature count differs from the one seen on the first batch.
        """
        reject_sparse(X, "IncrementalPCA")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape

        already_fitted = getattr(self, "_fitted", False)
        first_pass = not already_fitted
        n_components = self._validate_params(n_samples, n_features, first_pass=first_pass)
        if already_fitted and n_features != self.n_features_in_:
            raise ValueError(f"X has {n_features} features, expected {self.n_features_in_}")

        batch_mean = backend.mean(X_arr, axis=0)
        batch_var = backend.mean((X_arr - batch_mean) ** 2, axis=0)
        new_mean, new_var, new_count = self._update_mean_var(backend, X_arr, batch_mean, batch_var)
        X_centered = X_arr - batch_mean

        if already_fitted:
            old_count = int(self.n_samples_seen_)
            # Previous components scaled by their singular values stand in
            # for all earlier batches.
            old_basis = self.singular_values_[:, None] * self.components_
            # Extra row accounting for the shift between old and batch means.
            mean_correction = np.sqrt(float(old_count * n_samples) / float(new_count)) * (self.mean_ - batch_mean)
            stacked_rows = [old_basis, X_centered, backend.reshape(mean_correction, (1, n_features))]
            matrix = backend.concatenate(stacked_rows, axis=0)
        else:
            matrix = X_centered

        _, singular_values_all, vh = backend.svd(matrix, full_matrices=False)
        components = svd_flip_components(backend, vh[:n_components])
        singular_values = singular_values_all[:n_components]

        if new_count > 1:
            dof = float(new_count - 1)
            explained_variance = (singular_values ** 2) / dof
            total_var = backend.sum(new_var) * float(new_count) / dof
        else:
            explained_variance = singular_values * 0.0
            total_var = backend.sum(new_var)

        if scalar_to_float(total_var) > 0.0:
            explained_variance_ratio = explained_variance / total_var
        else:
            explained_variance_ratio = explained_variance * 0.0

        self.components_ = components
        self.mean_ = new_mean
        self.var_ = new_var
        self.explained_variance_ = explained_variance
        self.explained_variance_ratio_ = explained_variance_ratio
        self.singular_values_ = singular_values
        self.n_components_ = int(n_components)
        self.n_features_in_ = int(n_features)
        self.n_samples_seen_ = int(new_count)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def fit(self, X, y=None):
        """Fit from scratch by feeding ``X`` to ``partial_fit`` in batches.

        The first batch is enlarged to at least ``n_components`` rows so
        that it passes the first-pass validation.
        """
        reject_sparse(X, "IncrementalPCA")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        n_components = self._validate_params(n_samples, n_features, first_pass=True)

        if self.batch_size is not None:
            batch_size = int(self.batch_size)
        else:
            batch_size = min(n_samples, max(1, 5 * n_features))

        # Discard any earlier fit so the first batch is treated as a first pass.
        self._fitted = False

        if n_samples >= n_components:
            first_batch_end = max(batch_size, n_components)
        else:
            first_batch_end = batch_size
        self.partial_fit(X_arr[:first_batch_end])

        for start in range(first_batch_end, n_samples, batch_size):
            stop = start + batch_size
            self.partial_fit(X_arr[start:stop])
        return self

    def transform(self, X):
        """Project ``X`` onto the fitted components (whitened if requested)."""
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_cols = X_arr.shape[1]
        if n_cols != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")

        projected = backend.matmul(X_arr - self.mean_, self.components_.T)
        if not self.whiten:
            return projected
        # Floor the variance at machine epsilon to avoid dividing by zero.
        safe_variance = backend.maximum(self.explained_variance_, np.finfo(np.float64).eps)
        return projected / backend.sqrt(safe_variance)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its projection."""
        fitted = self.fit(X, y=y)
        return fitted.transform(X)

    def inverse_transform(self, X):
        """Map component-space data back to the original feature space."""
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_cols = X_arr.shape[1]
        if n_cols != self.n_components_:
            raise ValueError(f"X has {X_arr.shape[1]} components, expected {self.n_components_}")

        if self.whiten:
            # Undo the whitening scale applied by transform.
            safe_variance = backend.maximum(self.explained_variance_, np.finfo(np.float64).eps)
            X_arr = X_arr * backend.sqrt(safe_variance)
        reconstructed = backend.matmul(X_arr, self.components_)
        return reconstructed + self.mean_

    def predict(self, X):
        """Alias for ``transform``."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the base-class parameters extended with this estimator's own."""
        params = super().get_params(deep=deep)
        own_params = {
            "n_components": self.n_components,
            "batch_size": self.batch_size,
            "whiten": self.whiten,
            "copy": self.copy,
        }
        params.update(own_params)
        return params