statgpu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statgpu/__init__.py +174 -0
- statgpu/_base.py +544 -0
- statgpu/_config.py +127 -0
- statgpu/anova/__init__.py +5 -0
- statgpu/anova/_oneway.py +194 -0
- statgpu/backends/__init__.py +83 -0
- statgpu/backends/_array_ops.py +529 -0
- statgpu/backends/_base.py +184 -0
- statgpu/backends/_cupy.py +453 -0
- statgpu/backends/_factory.py +65 -0
- statgpu/backends/_gpu_inference_cupy.py +214 -0
- statgpu/backends/_gpu_inference_torch.py +422 -0
- statgpu/backends/_numpy.py +324 -0
- statgpu/backends/_torch.py +685 -0
- statgpu/backends/_torch_safe.py +47 -0
- statgpu/backends/_utils.py +423 -0
- statgpu/core/__init__.py +10 -0
- statgpu/core/formula/__init__.py +33 -0
- statgpu/core/formula/_design.py +99 -0
- statgpu/core/formula/_parser.py +191 -0
- statgpu/core/formula/_terms.py +70 -0
- statgpu/core/formula/tests/__init__.py +0 -0
- statgpu/core/formula/tests/test_parser.py +194 -0
- statgpu/covariance/__init__.py +6 -0
- statgpu/covariance/_empirical.py +310 -0
- statgpu/covariance/_shrinkage.py +248 -0
- statgpu/cross_validation/__init__.py +31 -0
- statgpu/cross_validation/_base.py +410 -0
- statgpu/cross_validation/_engine.py +167 -0
- statgpu/diagnostics/__init__.py +7 -0
- statgpu/diagnostics/_regression_diagnostics.py +188 -0
- statgpu/feature_selection/__init__.py +24 -0
- statgpu/feature_selection/_knockoff.py +870 -0
- statgpu/feature_selection/_knockoff_utils.py +1003 -0
- statgpu/feature_selection/_stepwise.py +300 -0
- statgpu/glm_core/__init__.py +81 -0
- statgpu/glm_core/_base.py +202 -0
- statgpu/glm_core/_family.py +362 -0
- statgpu/glm_core/_fused.py +149 -0
- statgpu/glm_core/_gamma.py +111 -0
- statgpu/glm_core/_inverse_gaussian.py +62 -0
- statgpu/glm_core/_irls.py +561 -0
- statgpu/glm_core/_logistic.py +82 -0
- statgpu/glm_core/_negative_binomial.py +68 -0
- statgpu/glm_core/_poisson.py +60 -0
- statgpu/glm_core/_solver_legacy.py +100 -0
- statgpu/glm_core/_squared.py +53 -0
- statgpu/glm_core/_tweedie.py +74 -0
- statgpu/inference/__init__.py +239 -0
- statgpu/inference/_distributions_backend.py +2610 -0
- statgpu/inference/_multiple_testing.py +391 -0
- statgpu/inference/_resampling.py +1400 -0
- statgpu/inference/_results.py +265 -0
- statgpu/linear_model/__init__.py +75 -0
- statgpu/linear_model/_gaussian_inference.py +306 -0
- statgpu/linear_model/_glm_base.py +1261 -0
- statgpu/linear_model/_ordered_logit.py +52 -0
- statgpu/linear_model/_ordered_probit.py +50 -0
- statgpu/linear_model/_stats.py +170 -0
- statgpu/linear_model/cv/__init__.py +13 -0
- statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
- statgpu/linear_model/cv/_lasso_cv.py +253 -0
- statgpu/linear_model/cv/_logistic_cv.py +895 -0
- statgpu/linear_model/cv/_ridge_cv.py +1160 -0
- statgpu/linear_model/legacy/__init__.py +1 -0
- statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
- statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
- statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
- statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
- statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
- statgpu/linear_model/legacy/_solver_legacy.py +104 -0
- statgpu/linear_model/penalized/__init__.py +25 -0
- statgpu/linear_model/penalized/_base.py +437 -0
- statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
- statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
- statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
- statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
- statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
- statgpu/linear_model/penalized/_penalized_linear.py +236 -0
- statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
- statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
- statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
- statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
- statgpu/linear_model/penalized/_predict_mixin.py +182 -0
- statgpu/linear_model/wrappers/__init__.py +31 -0
- statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
- statgpu/linear_model/wrappers/_elasticnet.py +75 -0
- statgpu/linear_model/wrappers/_gamma.py +67 -0
- statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
- statgpu/linear_model/wrappers/_lasso.py +2124 -0
- statgpu/linear_model/wrappers/_linear.py +1127 -0
- statgpu/linear_model/wrappers/_logistic.py +1435 -0
- statgpu/linear_model/wrappers/_mcp.py +58 -0
- statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
- statgpu/linear_model/wrappers/_poisson.py +48 -0
- statgpu/linear_model/wrappers/_ridge.py +166 -0
- statgpu/linear_model/wrappers/_scad.py +58 -0
- statgpu/linear_model/wrappers/_tweedie.py +57 -0
- statgpu/metrics/__init__.py +21 -0
- statgpu/metrics/_classification.py +591 -0
- statgpu/nonparametric/__init__.py +50 -0
- statgpu/nonparametric/kernel_methods/__init__.py +25 -0
- statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
- statgpu/nonparametric/kernel_methods/_krr.py +234 -0
- statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
- statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
- statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
- statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
- statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
- statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
- statgpu/nonparametric/splines/__init__.py +5 -0
- statgpu/nonparametric/splines/_bspline_basis.py +336 -0
- statgpu/nonparametric/splines/_penalized.py +349 -0
- statgpu/panel/__init__.py +19 -0
- statgpu/panel/_covariance.py +140 -0
- statgpu/panel/_fixed_effects.py +420 -0
- statgpu/panel/_random_effects.py +385 -0
- statgpu/panel/_utils.py +482 -0
- statgpu/penalties/__init__.py +139 -0
- statgpu/penalties/_adaptive_l1.py +313 -0
- statgpu/penalties/_base.py +261 -0
- statgpu/penalties/_categories.py +39 -0
- statgpu/penalties/_elasticnet.py +98 -0
- statgpu/penalties/_group_lasso.py +678 -0
- statgpu/penalties/_group_mcp.py +553 -0
- statgpu/penalties/_group_scad.py +605 -0
- statgpu/penalties/_l1.py +107 -0
- statgpu/penalties/_l2.py +77 -0
- statgpu/penalties/_mcp.py +237 -0
- statgpu/penalties/_scad.py +260 -0
- statgpu/semiparametric/__init__.py +5 -0
- statgpu/semiparametric/_gam.py +401 -0
- statgpu/solvers/__init__.py +24 -0
- statgpu/solvers/_admm.py +241 -0
- statgpu/solvers/_constants.py +15 -0
- statgpu/solvers/_convergence.py +6 -0
- statgpu/solvers/_fista.py +436 -0
- statgpu/solvers/_fista_bb.py +513 -0
- statgpu/solvers/_fista_lla.py +541 -0
- statgpu/solvers/_lbfgs.py +206 -0
- statgpu/solvers/_newton.py +149 -0
- statgpu/solvers/_utils.py +277 -0
- statgpu/survival/__init__.py +14 -0
- statgpu/survival/_cox.py +3974 -0
- statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
- statgpu/survival/_cox_cv.py +1159 -0
- statgpu/survival/_cox_efron_cuda.py +1280 -0
- statgpu/survival/_cox_efron_triton.py +359 -0
- statgpu/unsupervised/__init__.py +29 -0
- statgpu/unsupervised/_agglomerative.py +307 -0
- statgpu/unsupervised/_dbscan.py +263 -0
- statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
- statgpu/unsupervised/_gmm.py +332 -0
- statgpu/unsupervised/_incremental_pca.py +176 -0
- statgpu/unsupervised/_kmeans.py +261 -0
- statgpu/unsupervised/_minibatch_kmeans.py +299 -0
- statgpu/unsupervised/_minibatch_nmf.py +252 -0
- statgpu/unsupervised/_nmf.py +190 -0
- statgpu/unsupervised/_pca.py +189 -0
- statgpu/unsupervised/_truncated_svd.py +132 -0
- statgpu/unsupervised/_tsne.py +192 -0
- statgpu/unsupervised/_umap.py +224 -0
- statgpu/unsupervised/_utils.py +134 -0
- statgpu-0.1.0.dist-info/METADATA +245 -0
- statgpu-0.1.0.dist-info/RECORD +168 -0
- statgpu-0.1.0.dist-info/WHEEL +5 -0
- statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
- statgpu-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""Gaussian mixture models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from statgpu._base import BaseEstimator
|
|
10
|
+
from statgpu._config import Device
|
|
11
|
+
from statgpu.unsupervised._kmeans import KMeans
|
|
12
|
+
from statgpu.unsupervised._utils import check_2d_array, reject_sparse, scalar_to_float
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GaussianMixture(BaseEstimator):
    """Gaussian mixture model fitted with log-domain EM.

    Parameters
    ----------
    n_components : int, default=1
        Number of mixture components. Must be a positive integer that does
        not exceed the number of training samples.
    covariance_type : {"diag", "spherical", "tied", "full"}, default="diag"
        Shape of the covariance parameters:
        ``"diag"`` -> ``(n_components, n_features)`` variances,
        ``"spherical"`` -> ``(n_components,)`` variances,
        ``"tied"`` -> one shared ``(n_features, n_features)`` matrix,
        ``"full"`` -> ``(n_components, n_features, n_features)`` matrices.
    tol : float, default=1e-3
        EM stops when the absolute change of the mean per-sample
        log-likelihood between two iterations falls below this value.
    reg_covar : float, default=1e-6
        Regularisation of the covariances. For ``"diag"``/``"spherical"`` it
        is a lower floor on the variances in the M-step; for
        ``"tied"``/``"full"`` it is added to the diagonal.
    max_iter : int, default=100
        Maximum number of EM iterations per initialisation.
    n_init : int, default=1
        Number of initialisations; the run with the highest lower bound wins.
    init_params : {"kmeans", "random"}, default="kmeans"
        How the initial means are chosen.
    random_state : int or None, default=None
        Seed used to derive one seed per initialisation. With ``None`` the
        initialisations are unseeded.
    device, n_jobs
        Forwarded unchanged to ``BaseEstimator.__init__``.

    Attributes
    ----------
    weights_, means_, covariances_, precisions_cholesky_
        Parameters of the best run (backend arrays).
    converged_ : bool
    n_iter_ : int
    lower_bound_ : float
    n_features_in_ : int
    """

    def __init__(
        self,
        n_components: int = 1,
        covariance_type: str = "diag",
        tol: float = 1e-3,
        reg_covar: float = 1e-6,
        max_iter: int = 100,
        n_init: int = 1,
        init_params: str = "kmeans",
        random_state: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.random_state = random_state

    def _validate_params(self, n_samples: int):
        """Validate the hyper-parameters against the training set size.

        Raises
        ------
        ValueError
            If any hyper-parameter is outside its accepted domain.
        """
        if not isinstance(self.n_components, (int, np.integer)) or int(self.n_components) < 1:
            raise ValueError("n_components must be a positive integer")
        if int(self.n_components) > n_samples:
            raise ValueError("n_components must be less than or equal to n_samples")
        if self.covariance_type not in ("diag", "spherical", "tied", "full"):
            raise ValueError("covariance_type must be one of: 'diag', 'spherical', 'tied', 'full'")
        if self.init_params not in ("kmeans", "random"):
            raise ValueError("init_params must be one of: 'kmeans', 'random'")
        if float(self.tol) < 0.0:
            raise ValueError("tol must be non-negative")
        if float(self.reg_covar) < 0.0:
            raise ValueError("reg_covar must be non-negative")
        if not isinstance(self.max_iter, (int, np.integer)) or int(self.max_iter) < 1:
            raise ValueError("max_iter must be a positive integer")
        if not isinstance(self.n_init, (int, np.integer)) or int(self.n_init) < 1:
            raise ValueError("n_init must be a positive integer")

    def _linalg_inv(self, backend, matrix):
        """Invert ``matrix`` with the backend's array module."""
        return backend.xp.linalg.inv(matrix)

    def _linalg_logdet(self, backend, matrix):
        """Return ``log|det(matrix)|``; raise ``ValueError`` if the sign is not positive."""
        sign, logabsdet = backend.xp.linalg.slogdet(matrix)
        if scalar_to_float(sign) <= 0.0:
            raise ValueError("covariance matrix must be positive definite")
        return logabsdet

    def _linalg_cholesky(self, backend, matrix):
        """Cholesky-factorise ``matrix`` with the backend's array module."""
        return backend.xp.linalg.cholesky(matrix)

    def _eye(self, backend, n_features: int):
        """Return a float64 identity matrix of size ``n_features`` on the backend."""
        if hasattr(backend, "eye"):
            return backend.eye(n_features, dtype=backend.float64)
        # Backends without ``eye`` get the identity built on the host first.
        return backend.asarray(np.eye(n_features), dtype=backend.float64)

    def _estimate_log_gaussian_prob(self, backend, X, means, covariances, precisions_cholesky=None):
        """Return per-component Gaussian log-densities, shape ``(n_samples, n_components)``.

        ``precisions_cholesky`` is only read for ``"tied"`` and ``"full"``
        covariances; when omitted it is derived from ``covariances``.
        """
        n_features = X.shape[1]
        log_2pi = float(n_features) * np.log(2.0 * np.pi)

        if self.covariance_type == "diag":
            precisions = 1.0 / covariances
            log_det = backend.sum(backend.log(covariances), axis=1)
            # Expand sum_j p_j (x_j - m_j)^2 into three matmul-friendly terms so
            # no (n_samples, n_components, n_features) tensor is materialised.
            x2 = backend.matmul(X * X, precisions.T)
            cross = backend.matmul(X, (means * precisions).T)
            mean2 = backend.sum(means * means * precisions, axis=1)
            quad = x2 - 2.0 * cross + backend.expand_dims(mean2, 0)
            return -0.5 * (log_2pi + backend.expand_dims(log_det, 0) + quad)

        if self.covariance_type == "spherical":
            precisions = 1.0 / covariances
            log_det = float(n_features) * backend.log(covariances)
            diff = backend.expand_dims(X, 1) - backend.expand_dims(means, 0)
            quad = backend.sum(diff * diff, axis=2) * backend.expand_dims(precisions, 0)
            return -0.5 * (log_2pi + backend.expand_dims(log_det, 0) + quad)

        if precisions_cholesky is None:
            precisions_cholesky = self._estimate_precisions_cholesky(backend, covariances)

        log_probs = []
        if self.covariance_type == "tied":
            # One shared factor: log|Sigma| = -2 * sum(log(diag(chol(Sigma^-1)))).
            log_det = -2.0 * backend.sum(backend.log(backend.diag(precisions_cholesky)))
            for k in range(int(self.n_components)):
                diff = X - means[k]
                solved = backend.matmul(diff, precisions_cholesky)
                quad = backend.sum(solved * solved, axis=1)
                log_probs.append(-0.5 * (log_2pi + log_det + quad))
            return backend.stack(log_probs, axis=1)

        for k in range(int(self.n_components)):
            log_det = -2.0 * backend.sum(backend.log(backend.diag(precisions_cholesky[k])))
            diff = X - means[k]
            solved = backend.matmul(diff, precisions_cholesky[k])
            quad = backend.sum(solved * solved, axis=1)
            log_probs.append(-0.5 * (log_2pi + log_det + quad))
        return backend.stack(log_probs, axis=1)

    def _estimate_weighted_log_prob(self, backend, X, weights, means, covariances, precisions_cholesky=None):
        """Return ``log(weight_k) + log N(x | mean_k, cov_k)`` for every sample and component."""
        log_prob = self._estimate_log_gaussian_prob(
            backend,
            X,
            means,
            covariances,
            precisions_cholesky=precisions_cholesky,
        )
        return log_prob + backend.expand_dims(backend.log(weights), 0)

    def _e_step(self, backend, X, weights, means, covariances, precisions_cholesky=None):
        """Run the expectation step.

        Parameters
        ----------
        precisions_cholesky : optional
            Pre-computed precision factors. When ``None`` they are derived
            from ``covariances`` downstream, and only for the covariance
            types that actually use them.

        Returns
        -------
        tuple
            ``(mean log-likelihood per sample, responsibilities)`` where the
            responsibilities have shape ``(n_samples, n_components)``.
        """
        weighted_log_prob = self._estimate_weighted_log_prob(
            backend,
            X,
            weights,
            means,
            covariances,
            precisions_cholesky=precisions_cholesky,
        )
        log_prob_norm = backend.logsumexp(weighted_log_prob, axis=1)
        # Normalise in the log domain before exponentiating.
        log_resp = weighted_log_prob - backend.expand_dims(log_prob_norm, 1)
        return scalar_to_float(backend.mean(log_prob_norm)), backend.exp(log_resp)

    def _m_step(self, backend, X, resp):
        """Run the maximisation step and return ``(weights, means, covariances)``."""
        n_samples = X.shape[0]
        n_features = X.shape[1]
        # The small offset keeps empty components from dividing by zero.
        nk = backend.sum(resp, axis=0) + 10.0 * np.finfo(np.float64).eps
        weights = nk / float(n_samples)
        means = backend.matmul(resp.T, X) / backend.expand_dims(nk, 1)

        if self.covariance_type in ("diag", "spherical"):
            second_moment = backend.matmul(resp.T, X * X) / backend.expand_dims(nk, 1)
            # E[x^2] - mean^2 can dip below zero numerically; reg_covar is the floor.
            diag_covariances = backend.maximum(second_moment - means * means, float(self.reg_covar))
            if self.covariance_type == "diag":
                return weights, means, diag_covariances
            spherical_covariances = backend.maximum(backend.mean(diag_covariances, axis=1), float(self.reg_covar))
            return weights, means, spherical_covariances

        eye = self._eye(backend, n_features)
        if self.covariance_type == "tied":
            covariance = backend.zeros((n_features, n_features), dtype=backend.float64)
            for k in range(int(self.n_components)):
                diff = X - means[k]
                weighted = diff * backend.expand_dims(resp[:, k], 1)
                covariance = covariance + backend.matmul(weighted.T, diff)
            covariance = covariance / float(n_samples) + float(self.reg_covar) * eye
            return weights, means, covariance

        covariances = []
        for k in range(int(self.n_components)):
            diff = X - means[k]
            weighted = diff * backend.expand_dims(resp[:, k], 1)
            covariance = backend.matmul(weighted.T, diff) / nk[k] + float(self.reg_covar) * eye
            covariances.append(covariance)
        covariances = backend.stack(covariances, axis=0)
        return weights, means, covariances

    def _initialize(self, backend, X, seed):
        """Build starting parameters for one EM run.

        Means come from a short ``KMeans`` fit or, for
        ``init_params="random"``, from rows of ``X`` drawn without
        replacement. Weights start uniform and every component starts from
        the global (co)variance of ``X`` plus ``reg_covar``.
        """
        n_samples, n_features = X.shape
        n_components = int(self.n_components)

        if self.init_params == "kmeans":
            km = KMeans(
                n_clusters=n_components,
                n_init=1,
                max_iter=min(50, int(self.max_iter)),
                random_state=seed,
                device=self.device,
            ).fit(X)
            means = km.cluster_centers_
        else:
            # The generator is only needed on this path.
            rng = np.random.default_rng(seed)
            indices = rng.choice(n_samples, size=n_components, replace=False)
            means = X[backend.asarray(indices, dtype=backend.int64)]

        weights = backend.full((n_components,), 1.0 / float(self.n_components), dtype=backend.float64)
        centered = X - backend.mean(X, axis=0)
        global_var = backend.mean(centered * centered, axis=0) + float(self.reg_covar)

        if self.covariance_type == "diag":
            covariances = backend.ones((n_components, n_features), dtype=backend.float64) * global_var
        elif self.covariance_type == "spherical":
            covariances = backend.full(
                (n_components,),
                scalar_to_float(backend.mean(global_var)),
                dtype=backend.float64,
            )
        else:
            global_covariance = backend.matmul(centered.T, centered) / float(n_samples)
            global_covariance = global_covariance + float(self.reg_covar) * self._eye(backend, n_features)
            if self.covariance_type == "tied":
                covariances = global_covariance
            else:
                covariances = backend.stack(
                    [backend.copy(global_covariance) for _ in range(n_components)],
                    axis=0,
                )
        return weights, means, covariances

    def _estimate_precisions_cholesky(self, backend, covariances):
        """Return the precision factors that match ``covariance_type``.

        For ``"diag"``/``"spherical"`` this is ``1 / sqrt(variance)``; for
        ``"tied"``/``"full"`` it is the Cholesky factor of the inverted
        covariance matrix (one per component for ``"full"``).
        """
        if self.covariance_type in ("diag", "spherical"):
            return 1.0 / backend.sqrt(covariances)
        if self.covariance_type == "tied":
            return self._linalg_cholesky(backend, self._linalg_inv(backend, covariances))
        return backend.stack(
            [
                self._linalg_cholesky(backend, self._linalg_inv(backend, covariances[k]))
                for k in range(int(self.n_components))
            ],
            axis=0,
        )

    def fit(self, X, y=None):
        """Fit the mixture with EM and keep the best of ``n_init`` runs.

        Parameters
        ----------
        X : array-like, 2-D
            Dense training data; converted to a float64 backend array.
        y : ignored

        Returns
        -------
        GaussianMixture
            ``self``, with the fitted attributes set.
        """
        reject_sparse(X, "GaussianMixture")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        self._validate_params(n_samples)

        rng = np.random.default_rng(self.random_state)
        best = None
        for _ in range(int(self.n_init)):
            seed = None if self.random_state is None else int(rng.integers(0, np.iinfo(np.int32).max))
            weights, means, covariances = self._initialize(backend, X_arr, seed)
            lower_bound = -np.inf
            converged = False
            n_iter = 0
            for n_iter in range(1, int(self.max_iter) + 1):
                prev_lower_bound = lower_bound
                lower_bound, resp = self._e_step(backend, X_arr, weights, means, covariances)
                weights, means, covariances = self._m_step(backend, X_arr, resp)
                if abs(lower_bound - prev_lower_bound) < float(self.tol):
                    converged = True
                    break

            # A NaN bound compares False against everything, so a degenerate
            # earlier run must be replaceable explicitly; otherwise it would
            # block every later, valid initialisation.
            is_better = (
                best is None
                or lower_bound > best[0]
                or (np.isnan(best[0]) and not np.isnan(lower_bound))
            )
            if is_better:
                best = (lower_bound, converged, n_iter, weights, means, covariances)

        lower_bound, converged, n_iter, weights, means, covariances = best
        self.weights_ = weights
        self.means_ = means
        self.covariances_ = covariances
        self.precisions_cholesky_ = self._estimate_precisions_cholesky(backend, covariances)
        self.converged_ = bool(converged)
        self.n_iter_ = int(n_iter)
        self.lower_bound_ = float(lower_bound)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def score_samples(self, X):
        """Return the log-likelihood of each row of ``X`` under the fitted mixture.

        Raises
        ------
        ValueError
            If ``X`` does not have ``n_features_in_`` columns.
        """
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        weighted_log_prob = self._estimate_weighted_log_prob(
            backend,
            X_arr,
            self.weights_,
            self.means_,
            self.covariances_,
            precisions_cholesky=self.precisions_cholesky_,
        )
        return backend.logsumexp(weighted_log_prob, axis=1)

    def predict_proba(self, X):
        """Return component responsibilities, shape ``(n_samples, n_components)``.

        Raises
        ------
        ValueError
            If ``X`` does not have ``n_features_in_`` columns.
        """
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        # Reuse the factors cached by ``fit`` (as ``score_samples`` does)
        # instead of re-inverting the covariances on every call.
        _, resp = self._e_step(
            backend,
            X_arr,
            self.weights_,
            self.means_,
            self.covariances_,
            precisions_cholesky=self.precisions_cholesky_,
        )
        return resp

    def predict(self, X):
        """Return the index of the most responsible component for each row of ``X``."""
        backend = self._get_backend()
        return backend.argmax(self.predict_proba(X), axis=1)

    def fit_predict(self, X, y=None):
        """Fit on ``X`` and return the component index of each row."""
        return self.fit(X, y=y).predict(X)

    def score(self, X, y=None):
        """Return the mean per-sample log-likelihood of ``X``."""
        backend = self._get_backend()
        return scalar_to_float(backend.mean(self.score_samples(X)))

    def _n_parameters(self):
        """Return the number of free parameters (means + covariances + weights)."""
        n_components = int(self.n_components)
        n_features = int(self.n_features_in_)
        mean_params = n_components * n_features
        weight_params = n_components - 1  # weights sum to one
        if self.covariance_type == "diag":
            covariance_params = n_components * n_features
        elif self.covariance_type == "spherical":
            covariance_params = n_components
        elif self.covariance_type == "tied":
            covariance_params = n_features * (n_features + 1) // 2
        else:
            covariance_params = n_components * n_features * (n_features + 1) // 2
        return mean_params + covariance_params + weight_params

    def bic(self, X):
        """Return the Bayesian information criterion of the model on ``X``.

        ``X`` may be any array-like accepted by ``score``; the sample count
        is read with ``np.shape`` so plain nested lists work too.
        """
        mean_log_likelihood = float(self.score(X))
        n_samples = int(np.shape(X)[0])
        return -2.0 * mean_log_likelihood * n_samples + self._n_parameters() * np.log(n_samples)

    def aic(self, X):
        """Return the Akaike information criterion of the model on ``X``.

        ``X`` may be any array-like accepted by ``score``; the sample count
        is read with ``np.shape`` so plain nested lists work too.
        """
        mean_log_likelihood = float(self.score(X))
        n_samples = int(np.shape(X)[0])
        return -2.0 * mean_log_likelihood * n_samples + 2.0 * self._n_parameters()

    def get_params(self, deep=True):
        """Return the base-class parameters extended with the mixture hyper-parameters."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "covariance_type": self.covariance_type,
                "tol": self.tol,
                "reg_covar": self.reg_covar,
                "max_iter": self.max_iter,
                "n_init": self.n_init,
                "init_params": self.init_params,
                "random_state": self.random_state,
            }
        )
        return params
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Incremental principal component analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from statgpu._base import BaseEstimator
|
|
10
|
+
from statgpu._config import Device
|
|
11
|
+
from statgpu.unsupervised._utils import check_2d_array, reject_sparse, scalar_to_float, svd_flip_components
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class IncrementalPCA(BaseEstimator):
    """Dense incremental PCA with NumPy, CuPy, or Torch backends.

    The decomposition is refreshed batch by batch: each ``partial_fit`` call
    stacks the previously retained, singular-value-scaled components on top
    of the newly centred batch (plus one mean-correction row) and takes the
    SVD of that stack.

    Parameters
    ----------
    n_components : int or None, default=None
        Number of components to keep. ``None`` resolves to
        ``min(n_samples, n_features)`` of the first batch.
    batch_size : int or None, default=None
        Rows per batch used by ``fit``. ``None`` resolves to
        ``min(n_samples, max(1, 5 * n_features))``.
    whiten : bool, default=False
        When true, ``transform`` divides the projections by the square root
        of the explained variance (and ``inverse_transform`` undoes that).
    copy : bool, default=True
        Stored on the estimator; not read by any method of this class.
    device, n_jobs
        Forwarded unchanged to ``BaseEstimator.__init__``.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        batch_size: Optional[int] = None,
        whiten: bool = False,
        copy: bool = True,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.batch_size = batch_size
        self.whiten = whiten
        self.copy = copy

    def _validate_params(self, n_samples: int, n_features: int, first_pass: bool):
        """Validate the hyper-parameters and return the resolved component count.

        Raises
        ------
        ValueError
            On an invalid ``n_components`` or ``batch_size``, on a first batch
            that is too small, or when a later pass finds no stored
            ``n_components_``.
        """
        requested = self.n_components
        if requested is not None:
            is_positive_int = isinstance(requested, (int, np.integer)) and int(requested) >= 1
            if not is_positive_int:
                raise ValueError("n_components must be None or a positive integer")
            resolved = int(requested)
        elif first_pass:
            resolved = min(n_samples, n_features)
        elif hasattr(self, "n_components_"):
            # Later passes reuse whatever the first batch resolved to.
            resolved = int(self.n_components_)
        else:
            raise ValueError("IncrementalPCA internal state is inconsistent; refit the estimator")

        if resolved > n_features:
            raise ValueError("n_components must be less than or equal to n_features")
        if first_pass and n_samples < resolved:
            raise ValueError("first partial_fit batch must contain at least n_components samples")

        chunk = self.batch_size
        if chunk is not None:
            chunk_ok = isinstance(chunk, (int, np.integer)) and int(chunk) >= 1
            if not chunk_ok:
                raise ValueError("batch_size must be None or a positive integer")
        return resolved

    def _update_mean_var(self, backend, batch, batch_mean, batch_var):
        """Merge the batch statistics into the running per-feature mean and variance.

        Returns
        -------
        tuple
            ``(mean, variance, sample count)`` after including the batch. On
            the first pass these are simply the batch statistics.
        """
        n_batch = int(batch.shape[0])
        if not getattr(self, "_fitted", False):
            return batch_mean, batch_var, n_batch

        n_prev = int(self.n_samples_seen_)
        n_total = n_prev + n_batch
        prev_mean = self.mean_
        prev_var = self.var_
        merged_mean = (float(n_prev) * prev_mean + float(n_batch) * batch_mean) / float(n_total)
        # Each side contributes its own spread plus the shift of its mean
        # relative to the merged mean.
        prev_ss = float(n_prev) * (prev_var + (prev_mean - merged_mean) ** 2)
        batch_ss = float(n_batch) * (batch_var + (batch_mean - merged_mean) ** 2)
        merged_var = (prev_ss + batch_ss) / float(n_total)
        return merged_mean, merged_var, n_total

    def _whitening_scale(self, backend):
        """Return ``sqrt(explained_variance_)`` with the variance floored at machine epsilon."""
        floored = backend.maximum(self.explained_variance_, np.finfo(np.float64).eps)
        return backend.sqrt(floored)

    def partial_fit(self, X, y=None):
        """Update the decomposition with one batch of rows.

        Parameters
        ----------
        X : array-like, 2-D
            Dense batch; converted to a float64 backend array.
        y : ignored

        Returns
        -------
        IncrementalPCA
            ``self``.

        Raises
        ------
        ValueError
            If validation fails or the feature count differs from earlier batches.
        """
        reject_sparse(X, "IncrementalPCA")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        is_first = not getattr(self, "_fitted", False)
        n_keep = self._validate_params(n_samples, n_features, first_pass=is_first)
        if not is_first and n_features != self.n_features_in_:
            raise ValueError(f"X has {n_features} features, expected {self.n_features_in_}")

        batch_mean = backend.mean(X_arr, axis=0)
        centered = X_arr - batch_mean
        batch_var = backend.mean(centered ** 2, axis=0)
        merged_mean, merged_var, merged_count = self._update_mean_var(backend, X_arr, batch_mean, batch_var)

        if is_first:
            stacked = centered
        else:
            seen = int(self.n_samples_seen_)
            scaled_basis = self.singular_values_[:, None] * self.components_
            # Extra row accounting for the offset between the old mean and this batch's mean.
            shift = np.sqrt(float(seen * n_samples) / float(merged_count)) * (self.mean_ - batch_mean)
            stacked = backend.concatenate(
                [scaled_basis, centered, backend.reshape(shift, (1, n_features))],
                axis=0,
            )

        _, spectrum, vh = backend.svd(stacked, full_matrices=False)
        components = svd_flip_components(backend, vh[:n_keep])
        singular_values = spectrum[:n_keep]

        if merged_count > 1:
            dof = float(merged_count - 1)
            explained_variance = (singular_values ** 2) / dof
            total_variance = backend.sum(merged_var) * float(merged_count) / dof
        else:
            # A single sample carries no spread.
            explained_variance = singular_values * 0.0
            total_variance = backend.sum(merged_var)

        if scalar_to_float(total_variance) > 0.0:
            explained_variance_ratio = explained_variance / total_variance
        else:
            explained_variance_ratio = explained_variance * 0.0

        self.components_ = components
        self.mean_ = merged_mean
        self.var_ = merged_var
        self.explained_variance_ = explained_variance
        self.explained_variance_ratio_ = explained_variance_ratio
        self.singular_values_ = singular_values
        self.n_components_ = int(n_keep)
        self.n_features_in_ = int(n_features)
        self.n_samples_seen_ = int(merged_count)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def fit(self, X, y=None):
        """Fit from scratch by feeding ``X`` to ``partial_fit`` in batches.

        The first batch is widened to ``n_components`` rows when the batch
        size alone would be too small for the first-pass validation.
        """
        reject_sparse(X, "IncrementalPCA")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        n_components = self._validate_params(n_samples, n_features, first_pass=True)

        if self.batch_size is None:
            step = min(n_samples, max(1, 5 * n_features))
        else:
            step = int(self.batch_size)

        # Drop any earlier fit so the first batch is handled as a first pass.
        self._fitted = False

        head = step
        if head < n_components <= n_samples:
            head = n_components

        self.partial_fit(X_arr[:head])
        for offset in range(head, n_samples, step):
            self.partial_fit(X_arr[offset : offset + step])
        return self

    def transform(self, X):
        """Project ``X`` onto the fitted components (whitened when ``whiten`` is set).

        Raises
        ------
        ValueError
            If ``X`` does not have ``n_features_in_`` columns.
        """
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        projected = backend.matmul(X_arr - self.mean_, self.components_.T)
        if not self.whiten:
            return projected
        return projected / self._whitening_scale(backend)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its projection."""
        return self.fit(X, y=y).transform(X)

    def inverse_transform(self, X):
        """Map component-space rows back to feature space.

        Raises
        ------
        ValueError
            If ``X`` does not have ``n_components_`` columns.
        """
        self._check_is_fitted()
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_components_:
            raise ValueError(f"X has {X_arr.shape[1]} components, expected {self.n_components_}")
        scores = X_arr * self._whitening_scale(backend) if self.whiten else X_arr
        return backend.matmul(scores, self.components_) + self.mean_

    def predict(self, X):
        """Alias for ``transform``."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the base-class parameters extended with the PCA hyper-parameters."""
        params = super().get_params(deep=deep)
        own = {
            "n_components": self.n_components,
            "batch_size": self.batch_size,
            "whiten": self.whiten,
            "copy": self.copy,
        }
        params.update(own)
        return params
|