statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. statgpu/__init__.py +174 -0
  2. statgpu/_base.py +544 -0
  3. statgpu/_config.py +127 -0
  4. statgpu/anova/__init__.py +5 -0
  5. statgpu/anova/_oneway.py +194 -0
  6. statgpu/backends/__init__.py +83 -0
  7. statgpu/backends/_array_ops.py +529 -0
  8. statgpu/backends/_base.py +184 -0
  9. statgpu/backends/_cupy.py +453 -0
  10. statgpu/backends/_factory.py +65 -0
  11. statgpu/backends/_gpu_inference_cupy.py +214 -0
  12. statgpu/backends/_gpu_inference_torch.py +422 -0
  13. statgpu/backends/_numpy.py +324 -0
  14. statgpu/backends/_torch.py +685 -0
  15. statgpu/backends/_torch_safe.py +47 -0
  16. statgpu/backends/_utils.py +423 -0
  17. statgpu/core/__init__.py +10 -0
  18. statgpu/core/formula/__init__.py +33 -0
  19. statgpu/core/formula/_design.py +99 -0
  20. statgpu/core/formula/_parser.py +191 -0
  21. statgpu/core/formula/_terms.py +70 -0
  22. statgpu/core/formula/tests/__init__.py +0 -0
  23. statgpu/core/formula/tests/test_parser.py +194 -0
  24. statgpu/covariance/__init__.py +6 -0
  25. statgpu/covariance/_empirical.py +310 -0
  26. statgpu/covariance/_shrinkage.py +248 -0
  27. statgpu/cross_validation/__init__.py +31 -0
  28. statgpu/cross_validation/_base.py +410 -0
  29. statgpu/cross_validation/_engine.py +167 -0
  30. statgpu/diagnostics/__init__.py +7 -0
  31. statgpu/diagnostics/_regression_diagnostics.py +188 -0
  32. statgpu/feature_selection/__init__.py +24 -0
  33. statgpu/feature_selection/_knockoff.py +870 -0
  34. statgpu/feature_selection/_knockoff_utils.py +1003 -0
  35. statgpu/feature_selection/_stepwise.py +300 -0
  36. statgpu/glm_core/__init__.py +81 -0
  37. statgpu/glm_core/_base.py +202 -0
  38. statgpu/glm_core/_family.py +362 -0
  39. statgpu/glm_core/_fused.py +149 -0
  40. statgpu/glm_core/_gamma.py +111 -0
  41. statgpu/glm_core/_inverse_gaussian.py +62 -0
  42. statgpu/glm_core/_irls.py +561 -0
  43. statgpu/glm_core/_logistic.py +82 -0
  44. statgpu/glm_core/_negative_binomial.py +68 -0
  45. statgpu/glm_core/_poisson.py +60 -0
  46. statgpu/glm_core/_solver_legacy.py +100 -0
  47. statgpu/glm_core/_squared.py +53 -0
  48. statgpu/glm_core/_tweedie.py +74 -0
  49. statgpu/inference/__init__.py +239 -0
  50. statgpu/inference/_distributions_backend.py +2610 -0
  51. statgpu/inference/_multiple_testing.py +391 -0
  52. statgpu/inference/_resampling.py +1400 -0
  53. statgpu/inference/_results.py +265 -0
  54. statgpu/linear_model/__init__.py +75 -0
  55. statgpu/linear_model/_gaussian_inference.py +306 -0
  56. statgpu/linear_model/_glm_base.py +1261 -0
  57. statgpu/linear_model/_ordered_logit.py +52 -0
  58. statgpu/linear_model/_ordered_probit.py +50 -0
  59. statgpu/linear_model/_stats.py +170 -0
  60. statgpu/linear_model/cv/__init__.py +13 -0
  61. statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
  62. statgpu/linear_model/cv/_lasso_cv.py +253 -0
  63. statgpu/linear_model/cv/_logistic_cv.py +895 -0
  64. statgpu/linear_model/cv/_ridge_cv.py +1160 -0
  65. statgpu/linear_model/legacy/__init__.py +1 -0
  66. statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
  67. statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
  68. statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
  69. statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
  70. statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
  71. statgpu/linear_model/legacy/_solver_legacy.py +104 -0
  72. statgpu/linear_model/penalized/__init__.py +25 -0
  73. statgpu/linear_model/penalized/_base.py +437 -0
  74. statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
  75. statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
  76. statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
  77. statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
  78. statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
  79. statgpu/linear_model/penalized/_penalized_linear.py +236 -0
  80. statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
  81. statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
  82. statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
  83. statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
  84. statgpu/linear_model/penalized/_predict_mixin.py +182 -0
  85. statgpu/linear_model/wrappers/__init__.py +31 -0
  86. statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
  87. statgpu/linear_model/wrappers/_elasticnet.py +75 -0
  88. statgpu/linear_model/wrappers/_gamma.py +67 -0
  89. statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
  90. statgpu/linear_model/wrappers/_lasso.py +2124 -0
  91. statgpu/linear_model/wrappers/_linear.py +1127 -0
  92. statgpu/linear_model/wrappers/_logistic.py +1435 -0
  93. statgpu/linear_model/wrappers/_mcp.py +58 -0
  94. statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
  95. statgpu/linear_model/wrappers/_poisson.py +48 -0
  96. statgpu/linear_model/wrappers/_ridge.py +166 -0
  97. statgpu/linear_model/wrappers/_scad.py +58 -0
  98. statgpu/linear_model/wrappers/_tweedie.py +57 -0
  99. statgpu/metrics/__init__.py +21 -0
  100. statgpu/metrics/_classification.py +591 -0
  101. statgpu/nonparametric/__init__.py +50 -0
  102. statgpu/nonparametric/kernel_methods/__init__.py +25 -0
  103. statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
  104. statgpu/nonparametric/kernel_methods/_krr.py +234 -0
  105. statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
  106. statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
  107. statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
  108. statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
  109. statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
  110. statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
  111. statgpu/nonparametric/splines/__init__.py +5 -0
  112. statgpu/nonparametric/splines/_bspline_basis.py +336 -0
  113. statgpu/nonparametric/splines/_penalized.py +349 -0
  114. statgpu/panel/__init__.py +19 -0
  115. statgpu/panel/_covariance.py +140 -0
  116. statgpu/panel/_fixed_effects.py +420 -0
  117. statgpu/panel/_random_effects.py +385 -0
  118. statgpu/panel/_utils.py +482 -0
  119. statgpu/penalties/__init__.py +139 -0
  120. statgpu/penalties/_adaptive_l1.py +313 -0
  121. statgpu/penalties/_base.py +261 -0
  122. statgpu/penalties/_categories.py +39 -0
  123. statgpu/penalties/_elasticnet.py +98 -0
  124. statgpu/penalties/_group_lasso.py +678 -0
  125. statgpu/penalties/_group_mcp.py +553 -0
  126. statgpu/penalties/_group_scad.py +605 -0
  127. statgpu/penalties/_l1.py +107 -0
  128. statgpu/penalties/_l2.py +77 -0
  129. statgpu/penalties/_mcp.py +237 -0
  130. statgpu/penalties/_scad.py +260 -0
  131. statgpu/semiparametric/__init__.py +5 -0
  132. statgpu/semiparametric/_gam.py +401 -0
  133. statgpu/solvers/__init__.py +24 -0
  134. statgpu/solvers/_admm.py +241 -0
  135. statgpu/solvers/_constants.py +15 -0
  136. statgpu/solvers/_convergence.py +6 -0
  137. statgpu/solvers/_fista.py +436 -0
  138. statgpu/solvers/_fista_bb.py +513 -0
  139. statgpu/solvers/_fista_lla.py +541 -0
  140. statgpu/solvers/_lbfgs.py +206 -0
  141. statgpu/solvers/_newton.py +149 -0
  142. statgpu/solvers/_utils.py +277 -0
  143. statgpu/survival/__init__.py +14 -0
  144. statgpu/survival/_cox.py +3974 -0
  145. statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
  146. statgpu/survival/_cox_cv.py +1159 -0
  147. statgpu/survival/_cox_efron_cuda.py +1280 -0
  148. statgpu/survival/_cox_efron_triton.py +359 -0
  149. statgpu/unsupervised/__init__.py +29 -0
  150. statgpu/unsupervised/_agglomerative.py +307 -0
  151. statgpu/unsupervised/_dbscan.py +263 -0
  152. statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
  153. statgpu/unsupervised/_gmm.py +332 -0
  154. statgpu/unsupervised/_incremental_pca.py +176 -0
  155. statgpu/unsupervised/_kmeans.py +261 -0
  156. statgpu/unsupervised/_minibatch_kmeans.py +299 -0
  157. statgpu/unsupervised/_minibatch_nmf.py +252 -0
  158. statgpu/unsupervised/_nmf.py +190 -0
  159. statgpu/unsupervised/_pca.py +189 -0
  160. statgpu/unsupervised/_truncated_svd.py +132 -0
  161. statgpu/unsupervised/_tsne.py +192 -0
  162. statgpu/unsupervised/_umap.py +224 -0
  163. statgpu/unsupervised/_utils.py +134 -0
  164. statgpu-0.1.0.dist-info/METADATA +245 -0
  165. statgpu-0.1.0.dist-info/RECORD +168 -0
  166. statgpu-0.1.0.dist-info/WHEEL +5 -0
  167. statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
  168. statgpu-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,263 @@
1
+ """Density-based spatial clustering."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+ from scipy.sparse import csr_matrix
9
+ from scipy.sparse.csgraph import connected_components
10
+ from scipy.spatial import cKDTree
11
+ from scipy.spatial.distance import pdist
12
+
13
+ from statgpu._base import BaseEstimator
14
+ from statgpu._config import Device
15
+ from statgpu.unsupervised._utils import check_2d_array, reject_sparse, scalar_to_int
16
+
17
+ try:
18
+ from statgpu.unsupervised._dbscan_cpu import dbscan_dense_pairwise
19
+ except Exception: # pragma: no cover - optional compiled extension
20
+ dbscan_dense_pairwise = None
21
+
22
+
23
class DBSCAN(BaseEstimator):
    """DBSCAN clustering for dense Euclidean data.

    A sample is a *core* point when at least ``min_samples`` samples (the
    sample itself included) lie within distance ``eps`` of it. Core points
    that are within ``eps`` of each other share a cluster; a non-core sample
    within ``eps`` of a core point is attached to the cluster of its
    lowest-index core neighbour; every other sample is labelled ``-1``.

    Attributes set by :meth:`fit`:

    * ``labels_`` - cluster label per sample, ``-1`` for noise.
    * ``core_sample_indices_`` - indices of the core samples.
    * ``components_`` - rows of the input that are core samples.
    * ``n_features_in_`` - number of input columns.
    """

    def __init__(
        self,
        eps: float = 0.5,
        min_samples: int = 5,
        metric: str = "euclidean",
        batch_size: Optional[int] = None,
        device: Union[str, Device] = Device.AUTO,
        n_jobs: Optional[int] = None,
    ):
        """Store the hyper-parameters; validation is deferred to ``fit``."""
        super().__init__(device=device, n_jobs=n_jobs)
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self.batch_size = batch_size

    def _validate_params(self, n_samples: int):
        """Check the hyper-parameters against the number of samples.

        Raises
        ------
        ValueError
            If ``eps`` is not strictly positive (NaN included), if
            ``min_samples`` or ``batch_size`` is not a positive integer, or
            if there are no samples.
        NotImplementedError
            If ``metric`` is anything other than ``"euclidean"``.
        """
        # ``not (eps > 0)`` rather than ``eps <= 0``: the latter is False for
        # NaN and would let an unusable radius through.
        if not (float(self.eps) > 0.0):
            raise ValueError("eps must be positive")
        if not isinstance(self.min_samples, (int, np.integer)) or int(self.min_samples) < 1:
            raise ValueError("min_samples must be a positive integer")
        if self.metric != "euclidean":
            raise NotImplementedError("DBSCAN v1 only supports metric='euclidean'")
        if self.batch_size is not None:
            if not isinstance(self.batch_size, (int, np.integer)) or int(self.batch_size) < 1:
                raise ValueError("batch_size must be None or a positive integer")
        if n_samples < 1:
            raise ValueError("DBSCAN requires at least one sample")

    def _fit_numpy(self, X):
        """Fit on the CPU, picking a neighbour-search strategy from a sample.

        Up to 512 evenly spaced samples are probed for their eps-neighbourhood
        size. When the mean size reaches ``max(64, 8 * min_samples)`` the data
        is treated as dense and one of three exact strategies is used:

        * the optional compiled ``dbscan_dense_pairwise`` routine, when it is
          importable, ``n_samples <= 10000`` and at least 75% of the probed
          samples are core;
        * a condensed ``pdist`` matrix, when it fits in 256 MiB;
        * a kd-tree sparse distance matrix otherwise.

        Sparser data goes through ``cKDTree.query_pairs``.
        """
        X_np = np.asarray(X, dtype=np.float64)
        n_samples, n_features = X_np.shape
        tree = cKDTree(X_np)
        workers = 1 if self.n_jobs is None else int(self.n_jobs)

        # Cheap density probe on an evenly spaced subsample.
        sample_size = min(n_samples, 512)
        sample_idx = np.linspace(0, n_samples - 1, sample_size, dtype=np.int64)
        sample_counts = self._query_counts(tree, X_np[sample_idx], sample_size, workers)
        sample_core_fraction = float(np.mean(sample_counts >= int(self.min_samples)))
        dense_threshold = max(64.0, float(self.min_samples) * 8.0)

        if float(np.mean(sample_counts)) >= dense_threshold:
            # A limit of 0 disables the compiled path when the extension is absent.
            dense_pairwise_limit = 10000 if dbscan_dense_pairwise is not None else 0
            condensed_bytes = n_samples * (n_samples - 1) // 2 * np.dtype(np.float64).itemsize
            if n_samples <= dense_pairwise_limit and sample_core_fraction >= 0.75:
                labels, core_indices = dbscan_dense_pairwise(
                    np.ascontiguousarray(X_np, dtype=np.float64),
                    float(self.eps),
                    int(self.min_samples),
                )
            elif condensed_bytes <= 256 * 1024 * 1024:
                labels, core_indices = self._fit_numpy_dense_pdist(X_np)
            else:
                labels, core_indices = self._fit_numpy_dense(tree, n_samples)
        else:
            labels, core_indices = self._fit_numpy_sparse(tree, X_np, n_samples, workers)

        self.labels_ = labels
        self.core_sample_indices_ = core_indices
        self.components_ = X_np[core_indices] if core_indices.size else X_np[:0]
        self.n_features_in_ = int(n_features)
        self._backend_name = "numpy"
        self._fitted = True
        return self

    def _query_counts(self, tree, X, n_rows, workers):
        """Return, per row of ``X``, how many tree points lie within ``eps``.

        ``n_rows`` must equal ``len(X)``; it is only used to pre-size the
        fallback result.
        """
        try:
            counts = tree.query_ball_point(X, r=float(self.eps), workers=workers, return_length=True)
        except TypeError:
            # Presumably a SciPy release whose ``query_ball_point`` lacks the
            # ``workers`` / ``return_length`` keywords: count the index lists.
            counts = np.fromiter(
                (len(row) for row in tree.query_ball_point(X, r=float(self.eps))),
                dtype=np.int64,
                count=n_rows,
            )
        return np.asarray(counts, dtype=np.int64)

    def _fit_numpy_dense(self, tree, n_samples):
        """Cluster from the kd-tree's sparse self-distance matrix.

        NOTE(review): the neighbour counts are taken from the stored COO
        entries, so this relies on ``sparse_distance_matrix`` emitting the
        zero-distance ``(i, i)`` pairs - confirm against the SciPy version
        in use.
        """
        neighbors = tree.sparse_distance_matrix(tree, float(self.eps), output_type="coo_matrix")
        row_idx = neighbors.row.astype(np.int64, copy=False)
        col_idx = neighbors.col.astype(np.int64, copy=False)
        counts = np.bincount(row_idx, minlength=n_samples)
        return self._labels_from_neighbor_edges(n_samples, counts, row_idx, col_idx)

    def _fit_numpy_dense_pdist(self, X_np):
        """Cluster from a condensed squared-distance vector (``pdist``)."""
        n_samples = X_np.shape[0]
        distances = pdist(X_np, metric="sqeuclidean")
        pair_rows, pair_cols = self._condensed_indices_to_pairs(distances <= float(self.eps) ** 2, n_samples)
        # ``pdist`` yields each unordered pair once and no diagonal, so add the
        # self edges and both orientations of every pair.
        diagonal = np.arange(n_samples, dtype=np.int64)
        row_idx = np.concatenate([diagonal, pair_rows, pair_cols])
        col_idx = np.concatenate([diagonal, pair_cols, pair_rows])
        counts = np.bincount(row_idx, minlength=n_samples)
        return self._labels_from_neighbor_edges(n_samples, counts, row_idx, col_idx)

    def _condensed_indices_to_pairs(self, mask, n_samples):
        """Map the True positions of a condensed mask to ``(row, col)`` pairs.

        ``mask`` is indexed like a ``pdist`` result for ``n_samples`` points;
        the returned int64 arrays satisfy ``row < col``.
        """
        condensed = np.flatnonzero(mask)
        if not condensed.size:
            empty = np.empty(0, dtype=np.int64)
            return empty, empty
        # Row i starts at condensed index n*i - i*(i+1)/2; inverting that
        # quadratic in i gives the row of each condensed index.
        b = 1 - 2 * n_samples
        rows = np.floor((-b - np.sqrt(float(b * b) - 8.0 * condensed)) / 2.0).astype(np.int64)
        row_start = n_samples * rows - rows * (rows + 1) // 2
        cols = condensed - row_start + rows + 1
        return rows, cols.astype(np.int64, copy=False)

    def _fit_numpy_sparse(self, tree, X_np, n_samples, workers):
        """Cluster using kd-tree neighbour counts plus ``query_pairs`` edges."""
        counts = self._query_counts(tree, X_np, n_samples, workers)
        core_mask = counts >= int(self.min_samples)
        core_indices = np.flatnonzero(core_mask).astype(np.int64)
        if not core_indices.size:
            # No core point means everything is noise; skip the pair query.
            return np.full(n_samples, -1, dtype=np.int64), core_indices
        try:
            pairs = tree.query_pairs(float(self.eps), output_type="ndarray")
        except TypeError:
            # Presumably a SciPy release without ``output_type``: a set of
            # tuples comes back instead.
            pairs = np.asarray(list(tree.query_pairs(float(self.eps))), dtype=np.int64)
        pairs = np.asarray(pairs, dtype=np.int64)
        if pairs.size:
            # Each pair is reported once; emit both orientations.
            row_idx = np.concatenate([pairs[:, 0], pairs[:, 1]])
            col_idx = np.concatenate([pairs[:, 1], pairs[:, 0]])
        else:
            row_idx = np.empty(0, dtype=np.int64)
            col_idx = np.empty(0, dtype=np.int64)
        return self._labels_from_neighbor_edges(n_samples, counts, row_idx, col_idx)

    def _labels_from_neighbor_edges(self, n_samples, counts, row_idx, col_idx):
        """Turn neighbour counts and a directed edge list into DBSCAN labels.

        Parameters
        ----------
        n_samples : int
            Number of samples.
        counts : array-like of int
            Neighbourhood size of every sample (self included).
        row_idx, col_idx : ndarray of int64
            Edge endpoints; every neighbour pair is expected in both
            orientations. Self edges are harmless.

        Returns
        -------
        labels : ndarray of int64
            Cluster label per sample, ``-1`` for noise.
        core_indices : ndarray of int64
            Indices of the core samples.
        """
        counts = np.asarray(counts, dtype=np.int64)
        core_mask = counts >= int(self.min_samples)
        core_indices = np.flatnonzero(core_mask).astype(np.int64)
        labels = np.full(n_samples, -1, dtype=np.int64)
        if not core_indices.size:
            return labels, core_indices

        # Connected components of the core-core graph are the clusters.
        core_position = np.full(n_samples, -1, dtype=np.int64)
        core_position[core_indices] = np.arange(core_indices.size, dtype=np.int64)
        core_edges = core_mask[row_idx] & core_mask[col_idx]
        graph = csr_matrix(
            (
                np.ones(int(np.sum(core_edges)), dtype=bool),
                (core_position[row_idx[core_edges]], core_position[col_idx[core_edges]]),
            ),
            shape=(core_indices.size, core_indices.size),
        )
        _, core_labels = connected_components(graph, directed=False, return_labels=True)
        labels[core_indices] = core_labels.astype(np.int64, copy=False)

        border_edges = (~core_mask[row_idx]) & core_mask[col_idx]
        if np.any(border_edges):
            border_rows = row_idx[border_edges]
            border_cores = col_idx[border_edges]
            # Sort by border point, then by core neighbour index, so that an
            # ambiguous border point always joins its lowest-index core
            # neighbour regardless of the order the edges were produced in.
            # This matches the compiled and accelerator code paths.
            order = np.lexsort((border_cores, border_rows))
            border_rows = border_rows[order]
            border_cores = border_cores[order]
            first = np.r_[True, border_rows[1:] != border_rows[:-1]]
            labels[border_rows[first]] = labels[border_cores[first]]
        return labels, core_indices

    def _neighbor_graph(self, backend, X):
        """Build the boolean ``n x n`` eps-adjacency matrix on ``backend``.

        Squared distances come from ``|a|^2 + |b|^2 - 2 a.b`` evaluated in
        row chunks of ``batch_size``; tiny negative values caused by rounding
        are clamped to zero. Batching bounds the temporary distance block,
        but the returned matrix is still ``n x n``.
        """
        n_samples = X.shape[0]
        batch_size = n_samples if self.batch_size is None else min(int(self.batch_size), n_samples)
        x_norm = backend.sum(X * X, axis=1, keepdims=True)
        rows = []
        eps_sq = float(self.eps) ** 2
        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            X_chunk = X[start:stop]
            chunk_norm = x_norm[start:stop]
            distances = chunk_norm + backend.reshape(x_norm, (1, n_samples)) - 2.0 * backend.matmul(X_chunk, X.T)
            rows.append(backend.maximum(distances, 0.0) <= eps_sq)
        return backend.concatenate(rows, axis=0) if len(rows) > 1 else rows[0]

    def fit(self, X, y=None):
        """Cluster ``X`` and store the result on the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense input data; sparse input is rejected.
        y : ignored
            Not used by this method.

        Returns
        -------
        self
        """
        reject_sparse(X, "DBSCAN")
        backend = self._get_backend()
        X_arr = backend.asarray(X, dtype=backend.float64)
        check_2d_array(X_arr)
        n_samples, n_features = X_arr.shape
        self._validate_params(n_samples)
        if backend.name == "numpy":
            return self._fit_numpy(X_arr)

        neighbors = self._neighbor_graph(backend, X_arr)
        counts = backend.sum(neighbors, axis=1)
        core_mask = counts >= int(self.min_samples)
        core_adj = neighbors & backend.expand_dims(core_mask, 0) & backend.expand_dims(core_mask, 1)

        # Min-label propagation over the core-core graph. ``large`` is a
        # sentinel greater than any sample index. The sentinel arrays never
        # change, so they are allocated once instead of once per iteration.
        large = int(n_samples)
        initial = backend.arange(n_samples, dtype=backend.int64)
        sentinel_vec = backend.full((n_samples,), large, dtype=backend.int64)
        sentinel_mat = backend.full((n_samples, n_samples), large, dtype=backend.int64)
        labels = backend.where(core_mask, initial, sentinel_vec)
        for _ in range(n_samples):
            candidate_labels = backend.where(core_adj, backend.expand_dims(labels, 0), sentinel_mat)
            new_labels = backend.min(candidate_labels, axis=1)
            new_labels = backend.where(core_mask, new_labels, sentinel_vec)
            changed = scalar_to_int(backend.sum(new_labels != labels))
            labels = new_labels
            if changed == 0:
                break

        core_np = backend.to_numpy(core_mask).astype(bool, copy=False)
        raw_core_labels = backend.to_numpy(labels).astype(np.int64, copy=False)
        core_indices = np.flatnonzero(core_np).astype(np.int64)

        labels_np = np.full(n_samples, -1, dtype=np.int64)
        if core_indices.size:
            # A core sample's raw label is the smallest sample index in its
            # component; ranking the distinct values gives labels 0..k-1.
            _, compact = np.unique(raw_core_labels[core_indices], return_inverse=True)
            labels_np[core_indices] = np.reshape(compact, -1)

        border_indices = np.flatnonzero(~core_np)
        if core_indices.size and border_indices.size:
            # The adjacency matrix is only moved to the host when some
            # non-core sample could actually be attached to a cluster.
            neighbors_np = backend.to_numpy(neighbors).astype(bool, copy=False)
            border_to_core = neighbors_np[np.ix_(border_indices, core_indices)]
            has_core = border_to_core.any(axis=1)
            # ``argmax`` on booleans returns the first True column, i.e. the
            # lowest-index core neighbour.
            first_core = core_indices[border_to_core.argmax(axis=1)]
            labels_np[border_indices[has_core]] = labels_np[first_core[has_core]]

        core_backend = backend.asarray(core_indices, dtype=backend.int64)
        self.labels_ = backend.asarray(labels_np, dtype=backend.int64)
        self.core_sample_indices_ = core_backend
        self.components_ = X_arr[core_backend] if core_indices.size else X_arr[:0]
        self.n_features_in_ = int(n_features)
        self._backend_name = backend.name
        self._fitted = True
        return self

    def fit_predict(self, X, y=None):
        """Fit on ``X`` and return the resulting ``labels_``."""
        return self.fit(X, y=y).labels_

    def predict(self, X):
        """Always raise: labelling unseen samples is not supported."""
        raise NotImplementedError("DBSCAN does not support predict for unseen samples")

    def get_params(self, deep=True):
        """Return the base-class parameters plus the DBSCAN-specific ones."""
        params = super().get_params(deep=deep)
        params.update(
            {
                "eps": self.eps,
                "min_samples": self.min_samples,
                "metric": self.metric,
                "batch_size": self.batch_size,
            }
        )
        return params
@@ -0,0 +1,125 @@
1
+ # cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True
2
+ """Cython helpers for CPU DBSCAN."""
3
+
4
+ import numpy as np
5
+ cimport numpy as cnp
6
+
7
+
8
+ ctypedef cnp.float64_t float64_t
9
+ ctypedef cnp.int64_t int64_t
10
+
11
+
12
cdef int64_t _find(int64_t[:] parent, int64_t x) noexcept nogil:
    # Return the representative of the set containing ``x`` and compress the
    # walked path so later lookups are shorter.
    cdef int64_t rep = x
    cdef int64_t cursor = x
    cdef int64_t up

    # First pass: climb to the root (the only node that is its own parent).
    while parent[rep] != rep:
        rep = parent[rep]

    # Second pass: re-point every node met on the way directly at the root.
    while parent[cursor] != cursor:
        up = parent[cursor]
        parent[cursor] = rep
        cursor = up
    return rep
22
+
23
+
24
cdef void _union(int64_t[:] parent, int64_t[:] rank, int64_t a, int64_t b) noexcept nogil:
    # Merge the sets containing ``a`` and ``b`` (union by rank).
    cdef int64_t top_a = _find(parent, a)
    cdef int64_t top_b = _find(parent, b)
    if top_a == top_b:
        # Already in the same set.
        return
    if rank[top_a] < rank[top_b]:
        # The shallower tree (a's) hangs under the deeper one.
        parent[top_a] = top_b
        return
    # Here rank[top_a] >= rank[top_b]: b's tree hangs under a's, and a tie
    # makes the merged tree one level deeper.
    if rank[top_a] == rank[top_b]:
        rank[top_a] += 1
    parent[top_b] = top_a
36
+
37
+
38
cdef inline bint _within_eps(
    float64_t[:, ::1] X,
    Py_ssize_t i,
    Py_ssize_t j,
    Py_ssize_t n_features,
    double eps_sq,
) noexcept nogil:
    # True when the squared Euclidean distance between rows i and j of X is
    # at most eps_sq. The accumulation stops as soon as the partial sum
    # already exceeds the threshold.
    cdef double dist = 0.0
    cdef double diff
    cdef Py_ssize_t f
    for f in range(n_features):
        diff = X[i, f] - X[j, f]
        dist += diff * diff
        if dist > eps_sq:
            return False
    return dist <= eps_sq


def dbscan_dense_pairwise(float64_t[:, ::1] X, double eps, int64_t min_samples):
    """Exact dense Euclidean DBSCAN core for medium-size CPU inputs.

    This routine avoids materializing the pairwise distance matrix. It performs
    two pairwise scans: one to count eps-neighborhood sizes and another to union
    core-core edges; a final scan attaches border points.

    Parameters
    ----------
    X : C-contiguous float64 array of shape (n_samples, n_features)
    eps : float
        Neighbourhood radius; compared as ``eps * eps`` against squared
        distances.
    min_samples : int
        Minimum neighbourhood size (the point itself included) for a core
        point.

    Returns
    -------
    labels : ndarray of int64
        Cluster label per sample, ``-1`` for noise. Labels are numbered in
        order of the first core sample of each cluster.
    core_indices : ndarray of int64
        Indices of the core samples.
    """
    cdef Py_ssize_t n_samples = X.shape[0]
    cdef Py_ssize_t n_features = X.shape[1]
    cdef double eps_sq = eps * eps
    cdef Py_ssize_t i, j
    cdef int64_t root
    cdef int64_t label

    # Counts start at one: every point is its own neighbour.
    cdef cnp.ndarray[int64_t, ndim=1] counts_arr = np.ones(n_samples, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] parent_arr = np.arange(n_samples, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] rank_arr = np.zeros(n_samples, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] labels_arr = np.full(n_samples, -1, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] root_label_arr = np.full(n_samples, -1, dtype=np.int64)
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] core_arr = np.zeros(n_samples, dtype=np.uint8)
    cdef cnp.ndarray[int64_t, ndim=1] core_indices_arr

    cdef int64_t[:] counts = counts_arr
    cdef int64_t[:] parent = parent_arr
    cdef int64_t[:] rank = rank_arr
    cdef int64_t[:] labels = labels_arr
    cdef int64_t[:] root_label = root_label_arr
    cdef cnp.uint8_t[:] core = core_arr

    # Scan 1: neighbourhood sizes, visiting each unordered pair once.
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            if _within_eps(X, i, j, n_features, eps_sq):
                counts[i] += 1
                counts[j] += 1

    for i in range(n_samples):
        if counts[i] >= min_samples:
            core[i] = 1

    # Scan 2: union every pair of neighbouring core points.
    for i in range(n_samples):
        if not core[i]:
            continue
        for j in range(i + 1, n_samples):
            if not core[j]:
                continue
            if _within_eps(X, i, j, n_features, eps_sq):
                _union(parent, rank, i, j)

    # Give each union-find root a compact label the first time it is seen.
    label = 0
    for i in range(n_samples):
        if core[i]:
            root = _find(parent, i)
            if root_label[root] < 0:
                root_label[root] = label
                label += 1
            labels[i] = root_label[root]

    # Scan 3: attach each non-core point to its lowest-index core neighbour;
    # points with no core neighbour keep the noise label -1.
    for i in range(n_samples):
        if core[i]:
            continue
        for j in range(n_samples):
            if not core[j]:
                continue
            if _within_eps(X, i, j, n_features, eps_sq):
                labels[i] = labels[j]
                break

    core_indices_arr = np.flatnonzero(core_arr).astype(np.int64, copy=False)
    return labels_arr, core_indices_arr