statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. statgpu/__init__.py +174 -0
  2. statgpu/_base.py +544 -0
  3. statgpu/_config.py +127 -0
  4. statgpu/anova/__init__.py +5 -0
  5. statgpu/anova/_oneway.py +194 -0
  6. statgpu/backends/__init__.py +83 -0
  7. statgpu/backends/_array_ops.py +529 -0
  8. statgpu/backends/_base.py +184 -0
  9. statgpu/backends/_cupy.py +453 -0
  10. statgpu/backends/_factory.py +65 -0
  11. statgpu/backends/_gpu_inference_cupy.py +214 -0
  12. statgpu/backends/_gpu_inference_torch.py +422 -0
  13. statgpu/backends/_numpy.py +324 -0
  14. statgpu/backends/_torch.py +685 -0
  15. statgpu/backends/_torch_safe.py +47 -0
  16. statgpu/backends/_utils.py +423 -0
  17. statgpu/core/__init__.py +10 -0
  18. statgpu/core/formula/__init__.py +33 -0
  19. statgpu/core/formula/_design.py +99 -0
  20. statgpu/core/formula/_parser.py +191 -0
  21. statgpu/core/formula/_terms.py +70 -0
  22. statgpu/core/formula/tests/__init__.py +0 -0
  23. statgpu/core/formula/tests/test_parser.py +194 -0
  24. statgpu/covariance/__init__.py +6 -0
  25. statgpu/covariance/_empirical.py +310 -0
  26. statgpu/covariance/_shrinkage.py +248 -0
  27. statgpu/cross_validation/__init__.py +31 -0
  28. statgpu/cross_validation/_base.py +410 -0
  29. statgpu/cross_validation/_engine.py +167 -0
  30. statgpu/diagnostics/__init__.py +7 -0
  31. statgpu/diagnostics/_regression_diagnostics.py +188 -0
  32. statgpu/feature_selection/__init__.py +24 -0
  33. statgpu/feature_selection/_knockoff.py +870 -0
  34. statgpu/feature_selection/_knockoff_utils.py +1003 -0
  35. statgpu/feature_selection/_stepwise.py +300 -0
  36. statgpu/glm_core/__init__.py +81 -0
  37. statgpu/glm_core/_base.py +202 -0
  38. statgpu/glm_core/_family.py +362 -0
  39. statgpu/glm_core/_fused.py +149 -0
  40. statgpu/glm_core/_gamma.py +111 -0
  41. statgpu/glm_core/_inverse_gaussian.py +62 -0
  42. statgpu/glm_core/_irls.py +561 -0
  43. statgpu/glm_core/_logistic.py +82 -0
  44. statgpu/glm_core/_negative_binomial.py +68 -0
  45. statgpu/glm_core/_poisson.py +60 -0
  46. statgpu/glm_core/_solver_legacy.py +100 -0
  47. statgpu/glm_core/_squared.py +53 -0
  48. statgpu/glm_core/_tweedie.py +74 -0
  49. statgpu/inference/__init__.py +239 -0
  50. statgpu/inference/_distributions_backend.py +2610 -0
  51. statgpu/inference/_multiple_testing.py +391 -0
  52. statgpu/inference/_resampling.py +1400 -0
  53. statgpu/inference/_results.py +265 -0
  54. statgpu/linear_model/__init__.py +75 -0
  55. statgpu/linear_model/_gaussian_inference.py +306 -0
  56. statgpu/linear_model/_glm_base.py +1261 -0
  57. statgpu/linear_model/_ordered_logit.py +52 -0
  58. statgpu/linear_model/_ordered_probit.py +50 -0
  59. statgpu/linear_model/_stats.py +170 -0
  60. statgpu/linear_model/cv/__init__.py +13 -0
  61. statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
  62. statgpu/linear_model/cv/_lasso_cv.py +253 -0
  63. statgpu/linear_model/cv/_logistic_cv.py +895 -0
  64. statgpu/linear_model/cv/_ridge_cv.py +1160 -0
  65. statgpu/linear_model/legacy/__init__.py +1 -0
  66. statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
  67. statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
  68. statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
  69. statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
  70. statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
  71. statgpu/linear_model/legacy/_solver_legacy.py +104 -0
  72. statgpu/linear_model/penalized/__init__.py +25 -0
  73. statgpu/linear_model/penalized/_base.py +437 -0
  74. statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
  75. statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
  76. statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
  77. statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
  78. statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
  79. statgpu/linear_model/penalized/_penalized_linear.py +236 -0
  80. statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
  81. statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
  82. statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
  83. statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
  84. statgpu/linear_model/penalized/_predict_mixin.py +182 -0
  85. statgpu/linear_model/wrappers/__init__.py +31 -0
  86. statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
  87. statgpu/linear_model/wrappers/_elasticnet.py +75 -0
  88. statgpu/linear_model/wrappers/_gamma.py +67 -0
  89. statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
  90. statgpu/linear_model/wrappers/_lasso.py +2124 -0
  91. statgpu/linear_model/wrappers/_linear.py +1127 -0
  92. statgpu/linear_model/wrappers/_logistic.py +1435 -0
  93. statgpu/linear_model/wrappers/_mcp.py +58 -0
  94. statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
  95. statgpu/linear_model/wrappers/_poisson.py +48 -0
  96. statgpu/linear_model/wrappers/_ridge.py +166 -0
  97. statgpu/linear_model/wrappers/_scad.py +58 -0
  98. statgpu/linear_model/wrappers/_tweedie.py +57 -0
  99. statgpu/metrics/__init__.py +21 -0
  100. statgpu/metrics/_classification.py +591 -0
  101. statgpu/nonparametric/__init__.py +50 -0
  102. statgpu/nonparametric/kernel_methods/__init__.py +25 -0
  103. statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
  104. statgpu/nonparametric/kernel_methods/_krr.py +234 -0
  105. statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
  106. statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
  107. statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
  108. statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
  109. statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
  110. statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
  111. statgpu/nonparametric/splines/__init__.py +5 -0
  112. statgpu/nonparametric/splines/_bspline_basis.py +336 -0
  113. statgpu/nonparametric/splines/_penalized.py +349 -0
  114. statgpu/panel/__init__.py +19 -0
  115. statgpu/panel/_covariance.py +140 -0
  116. statgpu/panel/_fixed_effects.py +420 -0
  117. statgpu/panel/_random_effects.py +385 -0
  118. statgpu/panel/_utils.py +482 -0
  119. statgpu/penalties/__init__.py +139 -0
  120. statgpu/penalties/_adaptive_l1.py +313 -0
  121. statgpu/penalties/_base.py +261 -0
  122. statgpu/penalties/_categories.py +39 -0
  123. statgpu/penalties/_elasticnet.py +98 -0
  124. statgpu/penalties/_group_lasso.py +678 -0
  125. statgpu/penalties/_group_mcp.py +553 -0
  126. statgpu/penalties/_group_scad.py +605 -0
  127. statgpu/penalties/_l1.py +107 -0
  128. statgpu/penalties/_l2.py +77 -0
  129. statgpu/penalties/_mcp.py +237 -0
  130. statgpu/penalties/_scad.py +260 -0
  131. statgpu/semiparametric/__init__.py +5 -0
  132. statgpu/semiparametric/_gam.py +401 -0
  133. statgpu/solvers/__init__.py +24 -0
  134. statgpu/solvers/_admm.py +241 -0
  135. statgpu/solvers/_constants.py +15 -0
  136. statgpu/solvers/_convergence.py +6 -0
  137. statgpu/solvers/_fista.py +436 -0
  138. statgpu/solvers/_fista_bb.py +513 -0
  139. statgpu/solvers/_fista_lla.py +541 -0
  140. statgpu/solvers/_lbfgs.py +206 -0
  141. statgpu/solvers/_newton.py +149 -0
  142. statgpu/solvers/_utils.py +277 -0
  143. statgpu/survival/__init__.py +14 -0
  144. statgpu/survival/_cox.py +3974 -0
  145. statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
  146. statgpu/survival/_cox_cv.py +1159 -0
  147. statgpu/survival/_cox_efron_cuda.py +1280 -0
  148. statgpu/survival/_cox_efron_triton.py +359 -0
  149. statgpu/unsupervised/__init__.py +29 -0
  150. statgpu/unsupervised/_agglomerative.py +307 -0
  151. statgpu/unsupervised/_dbscan.py +263 -0
  152. statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
  153. statgpu/unsupervised/_gmm.py +332 -0
  154. statgpu/unsupervised/_incremental_pca.py +176 -0
  155. statgpu/unsupervised/_kmeans.py +261 -0
  156. statgpu/unsupervised/_minibatch_kmeans.py +299 -0
  157. statgpu/unsupervised/_minibatch_nmf.py +252 -0
  158. statgpu/unsupervised/_nmf.py +190 -0
  159. statgpu/unsupervised/_pca.py +189 -0
  160. statgpu/unsupervised/_truncated_svd.py +132 -0
  161. statgpu/unsupervised/_tsne.py +192 -0
  162. statgpu/unsupervised/_umap.py +224 -0
  163. statgpu/unsupervised/_utils.py +134 -0
  164. statgpu-0.1.0.dist-info/METADATA +245 -0
  165. statgpu-0.1.0.dist-info/RECORD +168 -0
  166. statgpu-0.1.0.dist-info/WHEEL +5 -0
  167. statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
  168. statgpu-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,132 @@
1
+ """Truncated singular value decomposition."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._utils import check_2d_array, randomized_svd, reject_sparse, scalar_to_float
12
+
13
+
14
class TruncatedSVD(BaseEstimator):
    """
    Dense truncated SVD with NumPy, CuPy, or Torch backends.

    Unlike PCA, this estimator does not center the input matrix.

    Parameters
    ----------
    n_components : int, default=2
        Number of singular vectors to keep. Checked at fit time against
        ``min(n_samples, n_features)``.
    algorithm : {"randomized", "full"}, default="randomized"
        ``"full"`` truncates the backend's thin SVD; ``"randomized"``
        delegates to ``randomized_svd``.
    n_iter : int, default=5
        Forwarded to ``randomized_svd`` as ``n_iter``.
    n_oversamples : int, default=10
        Forwarded to ``randomized_svd`` as ``n_oversamples``.
    random_state : int or None, default=None
        Forwarded to ``randomized_svd``.
    device : str or Device, default=Device.AUTO
        Passed through to ``BaseEstimator``.
    n_jobs : int or None, default=None
        Passed through to ``BaseEstimator``.

    Attributes
    ----------
    components_ : backend array
        The kept right singular vectors, one per row.
    singular_values_ : backend array
        Singular values matching ``components_``.
    explained_variance_ : backend array
        Per-component variance of the projected training data.
    explained_variance_ratio_ : backend array
        ``explained_variance_`` divided by the summed per-feature variance
        of the training data (all zeros when that sum is zero).
    n_components_ : int
    n_features_in_ : int
    """

    def __init__(
        self,
        n_components: int = 2,
        algorithm: str = "randomized",
        n_iter: int = 5,
        n_oversamples: int = 10,
        random_state: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.algorithm = algorithm
        self.n_iter = n_iter
        self.n_oversamples = n_oversamples
        self.random_state = random_state

    def _validate_params(self, n_samples: int, n_features: int):
        """Check hyper-parameters against the data shape.

        Returns
        -------
        int
            ``n_components`` as a plain Python int.

        Raises
        ------
        ValueError
            If any hyper-parameter has the wrong type or is out of range.
        """
        if not isinstance(self.n_components, (int, np.integer)):
            raise ValueError("n_components must be a positive integer")
        n_components = int(self.n_components)
        max_components = min(n_samples, n_features)
        if n_components < 1 or n_components > max_components:
            raise ValueError(f"n_components must be in [1, {max_components}] for the input shape")
        if self.algorithm not in ("randomized", "full"):
            raise ValueError("algorithm must be one of: 'randomized', 'full'")
        if not isinstance(self.n_iter, (int, np.integer)) or int(self.n_iter) < 0:
            raise ValueError("n_iter must be a non-negative integer")
        if not isinstance(self.n_oversamples, (int, np.integer)) or int(self.n_oversamples) < 0:
            raise ValueError("n_oversamples must be a non-negative integer")
        return n_components

    def _fit_and_project(self, X):
        """Fit the decomposition on *X* and return ``X @ components_.T``.

        Shared by ``fit`` and ``fit_transform`` so the projection that is
        needed for the explained-variance statistics is computed only once.
        """
        reject_sparse(X, "TruncatedSVD")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        n_components = self._validate_params(n_samples, n_features)

        if self.algorithm == "full":
            # NOTE(review): randomized_svd returns sign-normalised components
            # (it applies svd_flip_components); this path keeps whatever signs
            # the backend SVD produced. Confirm whether that is intended.
            _, singular_values_all, vh = backend.svd(X_arr, full_matrices=False)
            singular_values = singular_values_all[:n_components]
            components = vh[:n_components]
        else:
            singular_values, components = randomized_svd(
                backend,
                X_arr,
                n_components=n_components,
                n_oversamples=int(self.n_oversamples),
                n_iter=int(self.n_iter),
                random_state=self.random_state,
            )

        # Variance of the projected data, per component (population variance).
        transformed = backend.matmul(X_arr, components.T)
        transformed_mean = backend.mean(transformed, axis=0, keepdims=True)
        explained_variance = backend.mean((transformed - transformed_mean) ** 2, axis=0)

        # Total variance of the input, summed over features.
        feature_mean = backend.mean(X_arr, axis=0, keepdims=True)
        total_variance = backend.sum(backend.mean((X_arr - feature_mean) ** 2, axis=0))
        if scalar_to_float(total_variance) > 0.0:
            explained_variance_ratio = explained_variance / total_variance
        else:
            # Constant input: avoid dividing by zero, report zero ratios.
            explained_variance_ratio = explained_variance * 0.0

        self.components_ = components
        self.explained_variance_ = explained_variance
        self.explained_variance_ratio_ = explained_variance_ratio
        self.singular_values_ = singular_values
        self.n_components_ = int(n_components)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return transformed

    def fit(self, X, y=None):
        """Fit the decomposition on *X*; *y* is ignored. Returns ``self``."""
        self._fit_and_project(X)
        return self

    def transform(self, X):
        """Project *X* onto the fitted components.

        Raises
        ------
        ValueError
            If *X* is not 2D or its feature count differs from training.
        NotImplementedError
            If *X* is a SciPy sparse matrix.
        """
        self._check_is_fitted()
        reject_sparse(X, "TruncatedSVD")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X_arr.shape[1]} features, expected {self.n_features_in_}")
        return backend.matmul(X_arr, self.components_.T)

    def fit_transform(self, X, y=None):
        """Fit on *X* and return its projection; *y* is ignored.

        Reuses the projection computed during fitting instead of converting
        and multiplying *X* a second time.
        """
        return self._fit_and_project(X)

    def inverse_transform(self, X):
        """Map component-space rows back to feature space (``X @ components_``).

        Raises
        ------
        ValueError
            If *X* is not 2D or its column count differs from ``n_components_``.
        NotImplementedError
            If *X* is a SciPy sparse matrix.
        """
        self._check_is_fitted()
        # Same sparse guard as fit/transform, for a consistent error.
        reject_sparse(X, "TruncatedSVD")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        if X_arr.shape[1] != self.n_components_:
            raise ValueError(f"X has {X_arr.shape[1]} components, expected {self.n_components_}")
        return backend.matmul(X_arr, self.components_)

    def predict(self, X):
        """Alias for transform, provided for BaseEstimator compatibility."""
        return self.transform(X)

    def get_params(self, deep=True):
        """Return the base-class parameters plus this estimator's own."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "algorithm": self.algorithm,
                "n_iter": self.n_iter,
                "n_oversamples": self.n_oversamples,
                "random_state": self.random_state,
            }
        )
        return params
@@ -0,0 +1,192 @@
1
+ """Exact dense t-SNE."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._pca import PCA
12
+ from statgpu.unsupervised._utils import (
13
+ backend_random_normal,
14
+ check_2d_array,
15
+ eye,
16
+ reject_sparse,
17
+ scalar_to_float,
18
+ squared_euclidean_distances,
19
+ )
20
+
21
+
22
class TSNE(BaseEstimator):
    """Dense exact t-SNE with backend-native probability and gradient steps.

    Parameters
    ----------
    n_components : int, default=2
        Embedding dimension; must be a positive integer below ``n_samples``.
    perplexity : float, default=30.0
        Target perplexity; must lie in ``(0, n_samples)``.
    early_exaggeration : float, default=12.0
        Factor applied to the joint probabilities during the first
        ``min(250, max_iter // 2)`` iterations.
    learning_rate : "auto" or float, default="auto"
        ``"auto"`` resolves to ``max(n_samples / early_exaggeration / 4, 10)``.
    max_iter : int, default=1000
        Number of gradient steps; must be at least 250.
    init : {"pca", "random"}, default="pca"
    random_state : int or None, default=None
    metric : str, default="euclidean"
        Only ``"euclidean"`` is supported.
    device, n_jobs
        Passed through to ``BaseEstimator``.

    Attributes
    ----------
    embedding_ : backend array
        The optimised embedding, one row per sample.
    kl_divergence_ : float
        KL divergence between the input and embedding distributions,
        computed once after the final iteration.
    n_iter_ : int
    n_features_in_ : int
    """

    def __init__(
        self,
        n_components: int = 2,
        perplexity: float = 30.0,
        early_exaggeration: float = 12.0,
        learning_rate: Union[str, float] = "auto",
        max_iter: int = 1000,
        init: str = "pca",
        random_state: Optional[int] = None,
        metric: str = "euclidean",
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_components = n_components
        self.perplexity = perplexity
        self.early_exaggeration = early_exaggeration
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.init = init
        self.random_state = random_state
        self.metric = metric

    def _validate_params(self, n_samples: int):
        """Raise if a hyper-parameter is unsupported or out of range."""
        if self.metric != "euclidean":
            raise NotImplementedError("TSNE v1 only supports metric='euclidean'")
        if not isinstance(self.n_components, (int, np.integer)) or int(self.n_components) < 1:
            raise ValueError("n_components must be a positive integer")
        if int(self.n_components) >= n_samples:
            raise ValueError("n_components must be less than n_samples")
        if float(self.perplexity) <= 0.0 or float(self.perplexity) >= n_samples:
            raise ValueError("perplexity must be in (0, n_samples)")
        if float(self.early_exaggeration) <= 0.0:
            raise ValueError("early_exaggeration must be positive")
        if not isinstance(self.max_iter, (int, np.integer)) or int(self.max_iter) < 250:
            raise ValueError("max_iter must be an integer >= 250")
        if self.init not in ("pca", "random"):
            raise ValueError("init must be one of: 'pca', 'random'")

    def _joint_probabilities(self, backend, X):
        """Build the symmetrised input affinities P.

        Runs a fixed 50-step vectorised bisection on the per-row precision
        ``beta`` so each row's conditional entropy approaches
        ``log(perplexity)``, then symmetrises and normalises by
        ``2 * n_samples``. Entries are floored at ``1e-300`` so later
        logarithms stay finite.
        """
        n_samples = X.shape[0]
        # Built once and reused for both the distances and the kernel.
        off_diag = 1.0 - eye(backend, n_samples, dtype=backend.float64)
        distances = squared_euclidean_distances(backend, X) * off_diag
        beta = backend.ones((n_samples, 1), dtype=backend.float64)
        # 0 and +inf act as "bound not found yet" sentinels.
        beta_min = backend.zeros((n_samples, 1), dtype=backend.float64)
        beta_max = backend.full((n_samples, 1), np.inf, dtype=backend.float64)
        target = float(np.log(self.perplexity))

        for _ in range(50):
            P = backend.exp(-distances * beta) * off_diag
            sumP = backend.maximum(backend.sum(P, axis=1, keepdims=True), 1e-300)
            H = backend.log(sumP) + beta * backend.sum(distances * P, axis=1, keepdims=True) / sumP
            # Entropy too high -> kernel too wide -> raise beta, and vice versa.
            too_high = H > target
            beta_min = backend.where(too_high, beta, beta_min)
            beta_max = backend.where(too_high, beta_max, beta)
            doubled = beta * 2.0
            halved = beta / 2.0
            averaged_high = (beta + beta_max) / 2.0
            averaged_low = (beta + beta_min) / 2.0
            beta = backend.where(
                too_high,
                backend.where(beta_max == np.inf, doubled, averaged_high),
                backend.where(beta_min == 0.0, halved, averaged_low),
            )

        P = backend.exp(-distances * beta) * off_diag
        P = P / backend.maximum(backend.sum(P, axis=1, keepdims=True), 1e-300)
        P = (P + P.T) / (2.0 * float(n_samples))
        return backend.maximum(P, 1e-300)

    def _initial_embedding(self, backend, X):
        """Return the starting embedding.

        ``init="random"`` draws normal values with scale ``1e-4``;
        ``init="pca"`` projects with PCA and rescales so the first
        component has standard deviation ``1e-4``.
        """
        n_samples = X.shape[0]
        if self.init == "random":
            return backend_random_normal(backend, self.random_state, size=(n_samples, int(self.n_components)), scale=1e-4)
        pca = PCA(
            n_components=int(self.n_components),
            svd_solver="auto",
            random_state=self.random_state,
            device=self.device,
            n_jobs=self.n_jobs,
        )
        init = pca.fit_transform(X)
        first = init[:, :1]
        first_centered = first - backend.mean(first, axis=0, keepdims=True)
        scale = backend.sqrt(backend.maximum(backend.mean(first_centered * first_centered), 1e-300))
        return init / scale * 1e-4

    def _learning_rate(self, n_samples: int) -> float:
        """Resolve ``learning_rate`` to a positive finite float.

        Raises
        ------
        ValueError
            If the value is neither ``"auto"`` nor a positive finite number.
        """
        if isinstance(self.learning_rate, str) and self.learning_rate == "auto":
            return max(float(n_samples) / float(self.early_exaggeration) / 4.0, 10.0)
        try:
            lr = float(self.learning_rate)
        except (TypeError, ValueError):
            raise ValueError("learning_rate must be 'auto' or a positive number") from None
        # NaN and inf would silently poison every gradient step.
        if not np.isfinite(lr) or lr <= 0.0:
            raise ValueError("learning_rate must be 'auto' or a positive number")
        return lr

    def fit(self, X, y=None):
        """Optimise the embedding of *X*; *y* is ignored. Returns ``self``.

        Uses momentum gradient descent with per-coordinate adaptive gains.
        Momentum is 0.5 until the iteration at which exaggeration ends and
        0.8 afterwards.
        """
        reject_sparse(X, "TSNE")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        self._validate_params(n_samples)
        # Resolve the learning rate first so a bad value fails before the
        # expensive affinity search and PCA initialisation.
        lr = self._learning_rate(n_samples)

        P = self._joint_probabilities(backend, X_arr)
        Y = self._initial_embedding(backend, X_arr)
        momentum = 0.5
        velocity = backend.zeros_like(Y)
        gains = backend.ones_like(Y)
        off_diag = 1.0 - eye(backend, n_samples, dtype=backend.float64)
        exaggeration_iters = min(250, int(self.max_iter) // 2)
        # Loop-invariant: scale P once rather than on every early iteration.
        P_exaggerated = P * float(self.early_exaggeration)

        for it in range(int(self.max_iter)):
            P_use = P_exaggerated if it < exaggeration_iters else P
            dist_sq = squared_euclidean_distances(backend, Y)
            inv = (1.0 / (1.0 + dist_sq)) * off_diag
            Q = inv / backend.maximum(backend.sum(inv), 1e-300)
            forces = (P_use - Q) * inv
            row_force = backend.sum(forces, axis=1, keepdims=True)
            # Equals 4 * sum_j forces_ij * (y_i - y_j) without an (n, n, d) tensor.
            grad = 4.0 * (row_force * Y - backend.matmul(forces, Y))
            # Grow the gain where gradient and velocity disagree in sign,
            # shrink it otherwise; floor at 0.01.
            sign_changed = (grad * velocity) < 0.0
            gains = backend.where(sign_changed, gains + 0.2, gains * 0.8)
            gains = backend.maximum(gains, 0.01)
            velocity = momentum * velocity - lr * gains * grad
            Y = Y + velocity
            Y = Y - backend.mean(Y, axis=0, keepdims=True)
            if it == exaggeration_iters:
                momentum = 0.8

        # Final KL divergence against the un-exaggerated P.
        dist_sq = squared_euclidean_distances(backend, Y)
        inv = (1.0 / (1.0 + dist_sq)) * off_diag
        Q = backend.maximum(inv / backend.maximum(backend.sum(inv), 1e-300), 1e-300)
        kl = backend.sum(P * (backend.log(P) - backend.log(Q)))

        self.embedding_ = Y
        self.kl_divergence_ = scalar_to_float(kl)
        self.n_iter_ = int(self.max_iter)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def fit_transform(self, X, y=None):
        """Fit on *X* and return ``embedding_``."""
        return self.fit(X, y=y).embedding_

    def transform(self, X):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("TSNE v1 does not support transforming new data")

    def predict(self, X):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("TSNE v1 does not support prediction")

    def get_params(self, deep=True):
        """Return the base-class parameters plus this estimator's own."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "n_components": self.n_components,
                "perplexity": self.perplexity,
                "early_exaggeration": self.early_exaggeration,
                "learning_rate": self.learning_rate,
                "max_iter": self.max_iter,
                "init": self.init,
                "random_state": self.random_state,
                "metric": self.metric,
            }
        )
        return params
@@ -0,0 +1,224 @@
1
+ """Dense exact UMAP."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from statgpu._base import BaseEstimator
10
+ from statgpu._config import Device
11
+ from statgpu.unsupervised._utils import (
12
+ backend_random_normal,
13
+ check_2d_array,
14
+ eye,
15
+ reject_sparse,
16
+ squared_euclidean_distances,
17
+ topk_smallest,
18
+ )
19
+
20
+
21
class UMAP(BaseEstimator):
    """
    Dense exact UMAP with NumPy, CuPy, or Torch backends.

    Version 1 builds an exact dense Euclidean neighbor graph and does not
    implement approximate NNDescent or transforming new data.
    """

    def __init__(
        self,
        n_neighbors: int = 15,
        n_components: int = 2,
        metric: str = "euclidean",
        min_dist: float = 0.1,
        spread: float = 1.0,
        n_epochs: Optional[int] = None,
        learning_rate: float = 1.0,
        init: str = "spectral",
        negative_sample_rate: int = 5,
        repulsion_strength: float = 1.0,
        random_state: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        super().__init__(device=device, n_jobs=n_jobs)
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.metric = metric
        self.min_dist = min_dist
        self.spread = spread
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.init = init
        self.negative_sample_rate = negative_sample_rate
        self.repulsion_strength = repulsion_strength
        self.random_state = random_state

    def _validate_params(self, n_samples: int):
        """Raise if a hyper-parameter is unsupported or out of range."""
        integer_types = (int, np.integer)

        if self.metric != "euclidean":
            raise NotImplementedError("UMAP v1 only supports metric='euclidean'")

        if not isinstance(self.n_neighbors, integer_types) or int(self.n_neighbors) < 2:
            raise ValueError("n_neighbors must be an integer >= 2")
        if int(self.n_neighbors) >= n_samples:
            raise ValueError("n_neighbors must be less than n_samples")

        if not isinstance(self.n_components, integer_types) or int(self.n_components) < 1:
            raise ValueError("n_components must be a positive integer")
        if int(self.n_components) >= n_samples:
            raise ValueError("n_components must be less than n_samples")

        if float(self.min_dist) < 0.0:
            raise ValueError("min_dist must be non-negative")
        if float(self.spread) <= 0.0:
            raise ValueError("spread must be positive")
        if self.init not in ("spectral", "random"):
            raise ValueError("init must be one of: 'spectral', 'random'")

        if self.n_epochs is not None and (
            not isinstance(self.n_epochs, integer_types) or int(self.n_epochs) < 1
        ):
            raise ValueError("n_epochs must be None or a positive integer")

        if float(self.learning_rate) <= 0.0:
            raise ValueError("learning_rate must be positive")
        if not isinstance(self.negative_sample_rate, integer_types) or int(self.negative_sample_rate) < 1:
            raise ValueError("negative_sample_rate must be a positive integer")
        if float(self.repulsion_strength) <= 0.0:
            raise ValueError("repulsion_strength must be positive")

    def _smooth_knn_membership(self, backend, neighbor_distances):
        """Turn per-row neighbour distances into membership strengths.

        Distances are shifted by the first column, scaled by the row mean
        of the shifted values (floored at 1e-12), and passed through
        ``exp(-x)``. The first column is then set to exactly 1.
        """
        nearest = neighbor_distances[:, :1]
        shifted = backend.maximum(neighbor_distances - nearest, 0.0)
        bandwidth = backend.maximum(backend.mean(shifted, axis=1, keepdims=True), 1e-12)
        weights = backend.exp(-shifted / bandwidth)
        weights[:, 0] = 1.0
        return weights

    def _fuzzy_graph(self, backend, X):
        """Build the dense symmetric fuzzy neighbour graph for *X*."""
        n_samples = X.shape[0]
        pairwise = backend.sqrt(squared_euclidean_distances(backend, X))
        # A large diagonal offset keeps each point out of its own neighbour list.
        pairwise = pairwise + eye(backend, n_samples, dtype=backend.float64) * 1e12
        knn_dist, knn_idx = topk_smallest(backend, pairwise, int(self.n_neighbors))
        weights = self._smooth_knn_membership(backend, knn_dist)

        directed = backend.zeros((n_samples, n_samples), dtype=backend.float64)
        row_ids = backend.reshape(backend.arange(n_samples, dtype=backend.int64), (n_samples, 1))
        directed[row_ids, backend.astype(knn_idx, backend.int64)] = weights

        # Probabilistic union of the directed graph with its transpose.
        symmetric = directed + directed.T - directed * directed.T
        return symmetric * (1.0 - eye(backend, n_samples, dtype=backend.float64))

    def _initial_embedding(self, backend, graph):
        """Return the starting layout: random, or Laplacian eigenvectors plus jitter."""
        n_samples = graph.shape[0]
        dim = int(self.n_components)
        if self.init == "random":
            return backend_random_normal(backend, self.random_state, size=(n_samples, dim), scale=1e-4)

        degree = backend.sum(graph, axis=1)
        laplacian = backend.diag(degree) - graph
        eigenvalues, eigenvectors = backend.eigh(laplacian)
        ascending = backend.argsort(eigenvalues, axis=0)
        # Skip the first (smallest-eigenvalue) vector and keep the next `dim`.
        layout = eigenvectors[:, ascending[1 : dim + 1]]
        noise = backend_random_normal(backend, self.random_state, size=(n_samples, dim), scale=1e-4)
        return layout + noise

    def _epochs(self, n_samples: int) -> int:
        """Return the epoch count: the user's value, else 500 (<= 10k samples) or 200."""
        if self.n_epochs is None:
            return 500 if n_samples <= 10_000 else 200
        return int(self.n_epochs)

    def _attraction_curve_params(self):
        """
        Fit UMAP's (a, b) curve parameters from min_dist and spread.

        This mirrors the reference approach used by umap-learn:
            target(d) = 1 if d <= min_dist
                        exp(-(d-min_dist)/spread) otherwise
        and we fit 1 / (1 + a * d^(2b)) to that target.
        """
        min_dist = float(self.min_dist)
        spread = float(self.spread)
        grid = np.linspace(0.0, spread * 3.0, 300, dtype=np.float64)
        target = np.where(grid <= min_dist, 1.0, np.exp(-(grid - min_dist) / max(spread, 1e-12)))

        def curve(d, a, b):
            return 1.0 / (1.0 + a * np.power(d, 2.0 * b))

        try:
            # Best effort: any failure (import or fit) falls through to the
            # fallback below.
            from scipy.optimize import curve_fit

            fitted, _ = curve_fit(
                curve,
                grid,
                target,
                p0=(1.0, 1.0),
                bounds=((1e-12, 1e-12), (1e6, 10.0)),
                maxfev=20000,
            )
            a, b = float(fitted[0]), float(fitted[1])
            if np.isfinite(a) and np.isfinite(b) and a > 0.0 and b > 0.0:
                return a, b
        except Exception:
            pass

        # Conservative fallback to ensure training can proceed.
        return 1.0, 1.0 / max(spread, 1e-12)

    def fit(self, X, y=None):
        """Build the fuzzy graph for *X* and optimise the layout; *y* is ignored.

        Each epoch takes one full-batch gradient step over all pairs with a
        linearly decaying step size, then re-centres the embedding.
        Returns ``self``.
        """
        reject_sparse(X, "UMAP")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        self._validate_params(n_samples)

        graph = self._fuzzy_graph(backend, X_arr)
        Y = self._initial_embedding(backend, graph)
        n_epochs = self._epochs(n_samples)
        a, b = self._attraction_curve_params()
        curve_a, curve_b = float(a), float(b)
        off_diag = 1.0 - eye(backend, n_samples, dtype=backend.float64)
        graph = backend.clip(graph, 0.0, 1.0)
        repulsion = float(self.repulsion_strength) / float(self.negative_sample_rate)
        base_rate = float(self.learning_rate)
        epoch_denominator = max(n_epochs, 1)

        for epoch in range(n_epochs):
            alpha = base_rate * (1.0 - (epoch / epoch_denominator))
            # diff[i, j] = Y[i] - Y[j]
            diff = backend.expand_dims(Y, 1) - backend.expand_dims(Y, 0)
            dist_sq = backend.sum(diff * diff, axis=2)
            inv = (1.0 / (1.0 + curve_a * (dist_sq ** curve_b))) * off_diag
            repulsive = (1.0 - graph) * inv * repulsion
            forces = (graph - repulsive) * inv
            grad = 2.0 * backend.sum(backend.expand_dims(forces, 2) * diff, axis=1)
            Y = Y - alpha * grad
            Y = Y - backend.mean(Y, axis=0, keepdims=True)

        self.embedding_ = Y
        self.graph_ = graph
        self.n_epochs_ = int(n_epochs)
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def fit_transform(self, X, y=None):
        """Fit on *X* and return ``embedding_``."""
        fitted = self.fit(X, y=y)
        return fitted.embedding_

    def transform(self, X):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("UMAP v1 does not support transforming new data")

    def predict(self, X):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("UMAP v1 does not support prediction")

    def get_params(self, deep=True):
        """Return the base-class parameters plus this estimator's own."""
        own = {
            "n_neighbors": self.n_neighbors,
            "n_components": self.n_components,
            "metric": self.metric,
            "min_dist": self.min_dist,
            "spread": self.spread,
            "n_epochs": self.n_epochs,
            "learning_rate": self.learning_rate,
            "init": self.init,
            "negative_sample_rate": self.negative_sample_rate,
            "repulsion_strength": self.repulsion_strength,
            "random_state": self.random_state,
        }
        params = super().get_params(deep=deep)
        params.update(own)
        return params
@@ -0,0 +1,134 @@
1
+ """Shared utilities for unsupervised estimators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from scipy import sparse
7
+
8
+
9
def check_2d_array(X, name: str = "X") -> None:
    """Raise ``ValueError`` unless *X* is 2D with at least one row and one column.

    *name* is only used to label the error message.
    """
    ndim = getattr(X, "ndim", None)
    if ndim != 2:
        raise ValueError(f"{name} must be a 2D array")

    n_rows, n_cols = X.shape[0], X.shape[1]
    if n_rows < 1 or n_cols < 1:
        raise ValueError(f"{name} must contain at least one sample and one feature")
15
+
16
+
17
def reject_sparse(X, estimator_name: str) -> None:
    """Raise ``NotImplementedError`` when *X* is a SciPy sparse matrix.

    Dense inputs pass through silently. *estimator_name* is embedded in the
    message so every estimator reports the limitation the same way.
    """
    if not sparse.issparse(X):
        return
    raise NotImplementedError(f"sparse input is not supported in {estimator_name} v1")
21
+
22
+
23
def scalar_to_float(x) -> float:
    """Return *x* as a built-in ``float``.

    The extraction route is chosen by duck typing, in this order:
    ``detach`` (then ``.cpu().item()``), ``get``, ``item``, and finally a
    plain ``float(x)``.
    """
    if hasattr(x, "detach"):
        value = x.detach().cpu().item()
    elif hasattr(x, "get"):
        value = x.get()
    elif hasattr(x, "item"):
        value = x.item()
    else:
        value = x
    return float(value)
32
+
33
+
34
def scalar_to_int(x) -> int:
    """Return *x* as a built-in ``int``.

    The extraction route is chosen by duck typing, in this order:
    ``detach`` (then ``.cpu().item()``), ``get``, ``item``, and finally a
    plain ``int(x)``.
    """
    if hasattr(x, "detach"):
        value = x.detach().cpu().item()
    elif hasattr(x, "get"):
        value = x.get()
    elif hasattr(x, "item"):
        value = x.item()
    else:
        value = x
    return int(value)
43
+
44
+
45
def draw_random_seed(random_state) -> int:
    """Turn *random_state* into a plain integer seed.

    * ``None`` - a fresh 64-bit value from ``numpy.random.SeedSequence``.
    * ``numpy.random.Generator`` / ``RandomState`` - one draw from it, in
      ``[0, int32 max)``; this advances the generator.
    * anything else - ``int(random_state)``.
    """
    if random_state is None:
        fresh = np.random.SeedSequence().generate_state(1, dtype=np.uint64)
        return int(fresh[0])

    upper = np.iinfo(np.int32).max
    if isinstance(random_state, np.random.Generator):
        return int(random_state.integers(0, upper))
    if isinstance(random_state, np.random.RandomState):
        return int(random_state.randint(0, upper))
    return int(random_state)
54
+
55
+
56
def backend_random_normal(backend, random_state, size, scale: float = 1.0):
    """Produce seeded normal variates as an array of shape *size* on *backend*.

    Values are built from backend array operations only, so no NumPy random
    matrix has to be allocated and transferred. Two uniform streams come
    from a sine-based hash of the element index and the seed, and are
    combined with the Box-Muller transform. The result is multiplied by
    *scale*. This is meant for estimator initialisation and random
    projections, not for anything needing high-quality randomness.
    """
    total = int(np.prod(size))
    seed = draw_random_seed(random_state)
    idx = backend.arange(total, dtype=backend.float64)
    xp = backend.xp
    frequency = 12.9898 + 0.001 * float(seed)

    def uniform(offset):
        # Fractional part of a scaled sine: a cheap hash onto [0, 1).
        hashed = xp.sin((idx + 1.0 + float(offset)) * frequency) * 43758.5453
        return hashed - xp.floor(hashed)

    # Keep the first stream away from zero so the logarithm stays finite.
    u1 = backend.maximum(uniform(0), 1e-12)
    # Offsetting by more than `total` keeps the two streams from overlapping.
    u2 = uniform(total + 17)
    radius = backend.sqrt(-2.0 * backend.log(u1))
    z = radius * xp.cos(2.0 * np.pi * u2)
    return backend.reshape(z * float(scale), size)
77
+
78
+
79
def squared_euclidean_distances(backend, X, Y=None):
    """Return the dense matrix of squared Euclidean distances between rows.

    Entry ``[i, j]`` is the squared distance from ``X[i]`` to ``Y[j]``;
    when *Y* is omitted the rows of *X* are compared with each other. Uses
    the expansion ``|x|^2 + |y|^2 - 2 x.y`` and clamps the result at zero
    to remove small negative round-off.
    """
    if Y is None:
        Y = X
    row_sq = backend.sum(X * X, axis=1, keepdims=True)
    col_sq = backend.sum(Y * Y, axis=1, keepdims=True)
    cross = backend.matmul(X, Y.T)
    raw = row_sq + col_sq.T - 2.0 * cross
    return backend.maximum(raw, 0.0)
86
+
87
+
88
def topk_smallest(backend, distances, k: int):
    """Return ``(values, indices)`` of the *k* smallest entries in each row.

    Both outputs keep one row per input row and list entries in ascending
    order of value, as produced by a full ``argsort`` along axis 1.
    """
    nearest = backend.argsort(distances, axis=1)[:, :k]
    smallest = backend.take_along_axis(distances, nearest, axis=1)
    return smallest, nearest
94
+
95
+
96
def svd_flip_components(backend, components):
    """Fix the sign of each row so its largest-magnitude entry is non-negative.

    Rows whose largest-magnitude entry is negative are multiplied by -1;
    all other rows are returned unchanged.
    """
    n_rows = components.shape[0]
    pivot_cols = backend.argmax(backend.abs(components), axis=1)
    pivots = components[backend.arange(n_rows), pivot_cols]
    signs = backend.where(pivots < 0.0, -1.0, 1.0)
    return components * backend.reshape(signs, (n_rows, 1))
102
+
103
+
104
def randomized_svd(
    backend,
    X,
    n_components: int,
    n_oversamples: int = 10,
    n_iter: int = 2,
    random_state=None,
):
    """Approximate the leading singular values and right singular vectors of *X*.

    A random test matrix sketches the range of *X*, the sketch is refined
    with *n_iter* QR-stabilised power iterations, and a small SVD is taken
    of *X* projected onto that basis.

    Returns
    -------
    tuple
        ``(singular_values, components)``, both truncated to *n_components*;
        the components have had ``svd_flip_components`` applied.
    """
    n_samples, n_features = X.shape
    # The sketch can never be wider than the smaller matrix dimension.
    sketch_width = min(n_samples, n_features, int(n_components) + int(n_oversamples))
    omega = backend_random_normal(backend, random_state, size=(n_features, sketch_width))

    # Re-orthogonalise at every half-step: a little more work than the raw
    # power method, but much more stable when singular values are close or
    # the matrix is moderately ill-conditioned.
    basis, _ = backend.qr(backend.matmul(X, omega))
    for _ in range(int(n_iter)):
        basis, _ = backend.qr(backend.matmul(X.T, basis))
        basis, _ = backend.qr(backend.matmul(X, basis))

    projected = backend.matmul(basis.T, X)
    _, singular_values, vh = backend.svd(projected, full_matrices=False)
    top_values = singular_values[:n_components]
    top_vectors = svd_flip_components(backend, vh[:n_components])
    return top_values, top_vectors
128
+
129
+
130
def eye(backend, n: int, dtype=None):
    """Return an ``n x n`` identity matrix on *backend*.

    Uses ``backend.eye`` when the backend has one. Otherwise a NumPy
    identity is converted with ``backend.asarray``, with the dtype
    defaulting to ``backend.float64``.
    """
    if not hasattr(backend, "eye"):
        return backend.asarray(np.eye(n), dtype=dtype or backend.float64)
    return backend.eye(n, dtype=dtype)