statgpu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statgpu/__init__.py +174 -0
- statgpu/_base.py +544 -0
- statgpu/_config.py +127 -0
- statgpu/anova/__init__.py +5 -0
- statgpu/anova/_oneway.py +194 -0
- statgpu/backends/__init__.py +83 -0
- statgpu/backends/_array_ops.py +529 -0
- statgpu/backends/_base.py +184 -0
- statgpu/backends/_cupy.py +453 -0
- statgpu/backends/_factory.py +65 -0
- statgpu/backends/_gpu_inference_cupy.py +214 -0
- statgpu/backends/_gpu_inference_torch.py +422 -0
- statgpu/backends/_numpy.py +324 -0
- statgpu/backends/_torch.py +685 -0
- statgpu/backends/_torch_safe.py +47 -0
- statgpu/backends/_utils.py +423 -0
- statgpu/core/__init__.py +10 -0
- statgpu/core/formula/__init__.py +33 -0
- statgpu/core/formula/_design.py +99 -0
- statgpu/core/formula/_parser.py +191 -0
- statgpu/core/formula/_terms.py +70 -0
- statgpu/core/formula/tests/__init__.py +0 -0
- statgpu/core/formula/tests/test_parser.py +194 -0
- statgpu/covariance/__init__.py +6 -0
- statgpu/covariance/_empirical.py +310 -0
- statgpu/covariance/_shrinkage.py +248 -0
- statgpu/cross_validation/__init__.py +31 -0
- statgpu/cross_validation/_base.py +410 -0
- statgpu/cross_validation/_engine.py +167 -0
- statgpu/diagnostics/__init__.py +7 -0
- statgpu/diagnostics/_regression_diagnostics.py +188 -0
- statgpu/feature_selection/__init__.py +24 -0
- statgpu/feature_selection/_knockoff.py +870 -0
- statgpu/feature_selection/_knockoff_utils.py +1003 -0
- statgpu/feature_selection/_stepwise.py +300 -0
- statgpu/glm_core/__init__.py +81 -0
- statgpu/glm_core/_base.py +202 -0
- statgpu/glm_core/_family.py +362 -0
- statgpu/glm_core/_fused.py +149 -0
- statgpu/glm_core/_gamma.py +111 -0
- statgpu/glm_core/_inverse_gaussian.py +62 -0
- statgpu/glm_core/_irls.py +561 -0
- statgpu/glm_core/_logistic.py +82 -0
- statgpu/glm_core/_negative_binomial.py +68 -0
- statgpu/glm_core/_poisson.py +60 -0
- statgpu/glm_core/_solver_legacy.py +100 -0
- statgpu/glm_core/_squared.py +53 -0
- statgpu/glm_core/_tweedie.py +74 -0
- statgpu/inference/__init__.py +239 -0
- statgpu/inference/_distributions_backend.py +2610 -0
- statgpu/inference/_multiple_testing.py +391 -0
- statgpu/inference/_resampling.py +1400 -0
- statgpu/inference/_results.py +265 -0
- statgpu/linear_model/__init__.py +75 -0
- statgpu/linear_model/_gaussian_inference.py +306 -0
- statgpu/linear_model/_glm_base.py +1261 -0
- statgpu/linear_model/_ordered_logit.py +52 -0
- statgpu/linear_model/_ordered_probit.py +50 -0
- statgpu/linear_model/_stats.py +170 -0
- statgpu/linear_model/cv/__init__.py +13 -0
- statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
- statgpu/linear_model/cv/_lasso_cv.py +253 -0
- statgpu/linear_model/cv/_logistic_cv.py +895 -0
- statgpu/linear_model/cv/_ridge_cv.py +1160 -0
- statgpu/linear_model/legacy/__init__.py +1 -0
- statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
- statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
- statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
- statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
- statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
- statgpu/linear_model/legacy/_solver_legacy.py +104 -0
- statgpu/linear_model/penalized/__init__.py +25 -0
- statgpu/linear_model/penalized/_base.py +437 -0
- statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
- statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
- statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
- statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
- statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
- statgpu/linear_model/penalized/_penalized_linear.py +236 -0
- statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
- statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
- statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
- statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
- statgpu/linear_model/penalized/_predict_mixin.py +182 -0
- statgpu/linear_model/wrappers/__init__.py +31 -0
- statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
- statgpu/linear_model/wrappers/_elasticnet.py +75 -0
- statgpu/linear_model/wrappers/_gamma.py +67 -0
- statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
- statgpu/linear_model/wrappers/_lasso.py +2124 -0
- statgpu/linear_model/wrappers/_linear.py +1127 -0
- statgpu/linear_model/wrappers/_logistic.py +1435 -0
- statgpu/linear_model/wrappers/_mcp.py +58 -0
- statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
- statgpu/linear_model/wrappers/_poisson.py +48 -0
- statgpu/linear_model/wrappers/_ridge.py +166 -0
- statgpu/linear_model/wrappers/_scad.py +58 -0
- statgpu/linear_model/wrappers/_tweedie.py +57 -0
- statgpu/metrics/__init__.py +21 -0
- statgpu/metrics/_classification.py +591 -0
- statgpu/nonparametric/__init__.py +50 -0
- statgpu/nonparametric/kernel_methods/__init__.py +25 -0
- statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
- statgpu/nonparametric/kernel_methods/_krr.py +234 -0
- statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
- statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
- statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
- statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
- statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
- statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
- statgpu/nonparametric/splines/__init__.py +5 -0
- statgpu/nonparametric/splines/_bspline_basis.py +336 -0
- statgpu/nonparametric/splines/_penalized.py +349 -0
- statgpu/panel/__init__.py +19 -0
- statgpu/panel/_covariance.py +140 -0
- statgpu/panel/_fixed_effects.py +420 -0
- statgpu/panel/_random_effects.py +385 -0
- statgpu/panel/_utils.py +482 -0
- statgpu/penalties/__init__.py +139 -0
- statgpu/penalties/_adaptive_l1.py +313 -0
- statgpu/penalties/_base.py +261 -0
- statgpu/penalties/_categories.py +39 -0
- statgpu/penalties/_elasticnet.py +98 -0
- statgpu/penalties/_group_lasso.py +678 -0
- statgpu/penalties/_group_mcp.py +553 -0
- statgpu/penalties/_group_scad.py +605 -0
- statgpu/penalties/_l1.py +107 -0
- statgpu/penalties/_l2.py +77 -0
- statgpu/penalties/_mcp.py +237 -0
- statgpu/penalties/_scad.py +260 -0
- statgpu/semiparametric/__init__.py +5 -0
- statgpu/semiparametric/_gam.py +401 -0
- statgpu/solvers/__init__.py +24 -0
- statgpu/solvers/_admm.py +241 -0
- statgpu/solvers/_constants.py +15 -0
- statgpu/solvers/_convergence.py +6 -0
- statgpu/solvers/_fista.py +436 -0
- statgpu/solvers/_fista_bb.py +513 -0
- statgpu/solvers/_fista_lla.py +541 -0
- statgpu/solvers/_lbfgs.py +206 -0
- statgpu/solvers/_newton.py +149 -0
- statgpu/solvers/_utils.py +277 -0
- statgpu/survival/__init__.py +14 -0
- statgpu/survival/_cox.py +3974 -0
- statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
- statgpu/survival/_cox_cv.py +1159 -0
- statgpu/survival/_cox_efron_cuda.py +1280 -0
- statgpu/survival/_cox_efron_triton.py +359 -0
- statgpu/unsupervised/__init__.py +29 -0
- statgpu/unsupervised/_agglomerative.py +307 -0
- statgpu/unsupervised/_dbscan.py +263 -0
- statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
- statgpu/unsupervised/_gmm.py +332 -0
- statgpu/unsupervised/_incremental_pca.py +176 -0
- statgpu/unsupervised/_kmeans.py +261 -0
- statgpu/unsupervised/_minibatch_kmeans.py +299 -0
- statgpu/unsupervised/_minibatch_nmf.py +252 -0
- statgpu/unsupervised/_nmf.py +190 -0
- statgpu/unsupervised/_pca.py +189 -0
- statgpu/unsupervised/_truncated_svd.py +132 -0
- statgpu/unsupervised/_tsne.py +192 -0
- statgpu/unsupervised/_umap.py +224 -0
- statgpu/unsupervised/_utils.py +134 -0
- statgpu-0.1.0.dist-info/METADATA +245 -0
- statgpu-0.1.0.dist-info/RECORD +168 -0
- statgpu-0.1.0.dist-info/WHEEL +5 -0
- statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
- statgpu-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""Density-based spatial clustering."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy.sparse import csr_matrix
|
|
9
|
+
from scipy.sparse.csgraph import connected_components
|
|
10
|
+
from scipy.spatial import cKDTree
|
|
11
|
+
from scipy.spatial.distance import pdist
|
|
12
|
+
|
|
13
|
+
from statgpu._base import BaseEstimator
|
|
14
|
+
from statgpu._config import Device
|
|
15
|
+
from statgpu.unsupervised._utils import check_2d_array, reject_sparse, scalar_to_int
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from statgpu.unsupervised._dbscan_cpu import dbscan_dense_pairwise
|
|
19
|
+
except Exception: # pragma: no cover - optional compiled extension
|
|
20
|
+
dbscan_dense_pairwise = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DBSCAN(BaseEstimator):
|
|
24
|
+
"""DBSCAN clustering for dense Euclidean data."""
|
|
25
|
+
|
|
26
|
+
def __init__(
    self,
    eps: float = 0.5,
    min_samples: int = 5,
    metric: str = "euclidean",
    batch_size: Optional[int] = None,
    device: Union[str, Device] = Device.AUTO,
    n_jobs: Optional[int] = None,
):
    """Record the clustering hyper-parameters on the estimator.

    Nothing is validated here; the values are checked by
    ``_validate_params`` when ``fit`` runs.

    Parameters
    ----------
    eps : neighbourhood radius.
    min_samples : neighbourhood size at which a point counts as a core point.
    metric : distance metric name.
    batch_size : optional row-chunk size read by ``_neighbor_graph``.
    device, n_jobs : forwarded unchanged to the base-class constructor.
    """
    super().__init__(device=device, n_jobs=n_jobs)
    # Tuple assignment keeps the attribute creation order eps, min_samples,
    # metric, batch_size.
    self.eps, self.min_samples = eps, min_samples
    self.metric, self.batch_size = metric, batch_size
|
|
40
|
+
|
|
41
|
+
def _validate_params(self, n_samples: int):
|
|
42
|
+
if float(self.eps) <= 0.0:
|
|
43
|
+
raise ValueError("eps must be positive")
|
|
44
|
+
if not isinstance(self.min_samples, (int, np.integer)) or int(self.min_samples) < 1:
|
|
45
|
+
raise ValueError("min_samples must be a positive integer")
|
|
46
|
+
if self.metric != "euclidean":
|
|
47
|
+
raise NotImplementedError("DBSCAN v1 only supports metric='euclidean'")
|
|
48
|
+
if self.batch_size is not None:
|
|
49
|
+
if not isinstance(self.batch_size, (int, np.integer)) or int(self.batch_size) < 1:
|
|
50
|
+
raise ValueError("batch_size must be None or a positive integer")
|
|
51
|
+
if n_samples < 1:
|
|
52
|
+
raise ValueError("DBSCAN requires at least one sample")
|
|
53
|
+
|
|
54
|
+
def _fit_numpy(self, X):
|
|
55
|
+
X_np = np.asarray(X, dtype=np.float64)
|
|
56
|
+
n_samples, n_features = X_np.shape
|
|
57
|
+
tree = cKDTree(X_np)
|
|
58
|
+
workers = 1 if self.n_jobs is None else int(self.n_jobs)
|
|
59
|
+
|
|
60
|
+
sample_size = min(n_samples, 512)
|
|
61
|
+
sample_idx = np.linspace(0, n_samples - 1, sample_size, dtype=np.int64)
|
|
62
|
+
sample_counts = self._query_counts(tree, X_np[sample_idx], sample_size, workers)
|
|
63
|
+
sample_core_fraction = float(np.mean(sample_counts >= int(self.min_samples)))
|
|
64
|
+
dense_threshold = max(64.0, float(self.min_samples) * 8.0)
|
|
65
|
+
if float(np.mean(sample_counts)) >= dense_threshold:
|
|
66
|
+
dense_pairwise_limit = 10000 if dbscan_dense_pairwise is not None else 0
|
|
67
|
+
condensed_bytes = n_samples * (n_samples - 1) // 2 * np.dtype(np.float64).itemsize
|
|
68
|
+
if n_samples <= dense_pairwise_limit and sample_core_fraction >= 0.75:
|
|
69
|
+
labels, core_indices = dbscan_dense_pairwise(
|
|
70
|
+
np.ascontiguousarray(X_np, dtype=np.float64),
|
|
71
|
+
float(self.eps),
|
|
72
|
+
int(self.min_samples),
|
|
73
|
+
)
|
|
74
|
+
elif condensed_bytes <= 256 * 1024 * 1024:
|
|
75
|
+
labels, core_indices = self._fit_numpy_dense_pdist(X_np)
|
|
76
|
+
else:
|
|
77
|
+
labels, core_indices = self._fit_numpy_dense(tree, n_samples)
|
|
78
|
+
else:
|
|
79
|
+
labels, core_indices = self._fit_numpy_sparse(tree, X_np, n_samples, workers)
|
|
80
|
+
|
|
81
|
+
self.labels_ = labels
|
|
82
|
+
self.core_sample_indices_ = core_indices
|
|
83
|
+
self.components_ = X_np[core_indices] if core_indices.size else X_np[:0]
|
|
84
|
+
self.n_features_in_ = int(n_features)
|
|
85
|
+
self._backend_name = "numpy"
|
|
86
|
+
self._fitted = True
|
|
87
|
+
return self
|
|
88
|
+
|
|
89
|
+
def _query_counts(self, tree, X, n_rows, workers):
|
|
90
|
+
try:
|
|
91
|
+
counts = tree.query_ball_point(X, r=float(self.eps), workers=workers, return_length=True)
|
|
92
|
+
except TypeError:
|
|
93
|
+
counts = np.fromiter(
|
|
94
|
+
(len(row) for row in tree.query_ball_point(X, r=float(self.eps))),
|
|
95
|
+
dtype=np.int64,
|
|
96
|
+
count=n_rows,
|
|
97
|
+
)
|
|
98
|
+
return np.asarray(counts, dtype=np.int64)
|
|
99
|
+
|
|
100
|
+
def _fit_numpy_dense(self, tree, n_samples):
|
|
101
|
+
neighbors = tree.sparse_distance_matrix(tree, float(self.eps), output_type="coo_matrix")
|
|
102
|
+
row_idx = neighbors.row.astype(np.int64, copy=False)
|
|
103
|
+
col_idx = neighbors.col.astype(np.int64, copy=False)
|
|
104
|
+
counts = np.bincount(row_idx, minlength=n_samples)
|
|
105
|
+
return self._labels_from_neighbor_edges(n_samples, counts, row_idx, col_idx)
|
|
106
|
+
|
|
107
|
+
def _fit_numpy_dense_pdist(self, X_np):
|
|
108
|
+
n_samples = X_np.shape[0]
|
|
109
|
+
distances = pdist(X_np, metric="sqeuclidean")
|
|
110
|
+
pair_rows, pair_cols = self._condensed_indices_to_pairs(distances <= float(self.eps) ** 2, n_samples)
|
|
111
|
+
row_idx = np.concatenate([np.arange(n_samples, dtype=np.int64), pair_rows, pair_cols])
|
|
112
|
+
col_idx = np.concatenate([np.arange(n_samples, dtype=np.int64), pair_cols, pair_rows])
|
|
113
|
+
counts = np.bincount(row_idx, minlength=n_samples)
|
|
114
|
+
return self._labels_from_neighbor_edges(n_samples, counts, row_idx, col_idx)
|
|
115
|
+
|
|
116
|
+
def _condensed_indices_to_pairs(self, mask, n_samples):
|
|
117
|
+
condensed = np.flatnonzero(mask)
|
|
118
|
+
if not condensed.size:
|
|
119
|
+
empty = np.empty(0, dtype=np.int64)
|
|
120
|
+
return empty, empty
|
|
121
|
+
b = 1 - 2 * n_samples
|
|
122
|
+
rows = np.floor((-b - np.sqrt(float(b * b) - 8.0 * condensed)) / 2.0).astype(np.int64)
|
|
123
|
+
row_start = n_samples * rows - rows * (rows + 1) // 2
|
|
124
|
+
cols = condensed - row_start + rows + 1
|
|
125
|
+
return rows, cols.astype(np.int64, copy=False)
|
|
126
|
+
|
|
127
|
+
def _fit_numpy_sparse(self, tree, X_np, n_samples, workers):
|
|
128
|
+
counts = self._query_counts(tree, X_np, n_samples, workers)
|
|
129
|
+
core_mask = counts >= int(self.min_samples)
|
|
130
|
+
core_indices = np.flatnonzero(core_mask).astype(np.int64)
|
|
131
|
+
if not core_indices.size:
|
|
132
|
+
return np.full(n_samples, -1, dtype=np.int64), core_indices
|
|
133
|
+
try:
|
|
134
|
+
pairs = tree.query_pairs(float(self.eps), output_type="ndarray")
|
|
135
|
+
except TypeError:
|
|
136
|
+
pairs = np.asarray(list(tree.query_pairs(float(self.eps))), dtype=np.int64)
|
|
137
|
+
pairs = np.asarray(pairs, dtype=np.int64)
|
|
138
|
+
if pairs.size:
|
|
139
|
+
row_idx = np.concatenate([pairs[:, 0], pairs[:, 1]])
|
|
140
|
+
col_idx = np.concatenate([pairs[:, 1], pairs[:, 0]])
|
|
141
|
+
else:
|
|
142
|
+
row_idx = np.empty(0, dtype=np.int64)
|
|
143
|
+
col_idx = np.empty(0, dtype=np.int64)
|
|
144
|
+
return self._labels_from_neighbor_edges(n_samples, counts, row_idx, col_idx)
|
|
145
|
+
|
|
146
|
+
def _labels_from_neighbor_edges(self, n_samples, counts, row_idx, col_idx):
|
|
147
|
+
counts = np.asarray(counts, dtype=np.int64)
|
|
148
|
+
core_mask = counts >= int(self.min_samples)
|
|
149
|
+
core_indices = np.flatnonzero(core_mask).astype(np.int64)
|
|
150
|
+
labels = np.full(n_samples, -1, dtype=np.int64)
|
|
151
|
+
if not core_indices.size:
|
|
152
|
+
return labels, core_indices
|
|
153
|
+
|
|
154
|
+
core_position = np.full(n_samples, -1, dtype=np.int64)
|
|
155
|
+
core_position[core_indices] = np.arange(core_indices.size, dtype=np.int64)
|
|
156
|
+
core_edges = core_mask[row_idx] & core_mask[col_idx]
|
|
157
|
+
graph = csr_matrix(
|
|
158
|
+
(
|
|
159
|
+
np.ones(int(np.sum(core_edges)), dtype=bool),
|
|
160
|
+
(core_position[row_idx[core_edges]], core_position[col_idx[core_edges]]),
|
|
161
|
+
),
|
|
162
|
+
shape=(core_indices.size, core_indices.size),
|
|
163
|
+
)
|
|
164
|
+
_, core_labels = connected_components(graph, directed=False, return_labels=True)
|
|
165
|
+
labels[core_indices] = core_labels.astype(np.int64, copy=False)
|
|
166
|
+
|
|
167
|
+
border_edges = (~core_mask[row_idx]) & core_mask[col_idx]
|
|
168
|
+
if np.any(border_edges):
|
|
169
|
+
border_rows = row_idx[border_edges]
|
|
170
|
+
border_labels = labels[col_idx[border_edges]]
|
|
171
|
+
order = np.argsort(border_rows, kind="mergesort")
|
|
172
|
+
border_rows = border_rows[order]
|
|
173
|
+
border_labels = border_labels[order]
|
|
174
|
+
first = np.r_[True, border_rows[1:] != border_rows[:-1]]
|
|
175
|
+
labels[border_rows[first]] = border_labels[first]
|
|
176
|
+
return labels, core_indices
|
|
177
|
+
|
|
178
|
+
def _neighbor_graph(self, backend, X):
|
|
179
|
+
n_samples = X.shape[0]
|
|
180
|
+
batch_size = n_samples if self.batch_size is None else min(int(self.batch_size), n_samples)
|
|
181
|
+
x_norm = backend.sum(X * X, axis=1, keepdims=True)
|
|
182
|
+
rows = []
|
|
183
|
+
eps_sq = float(self.eps) ** 2
|
|
184
|
+
for start in range(0, n_samples, batch_size):
|
|
185
|
+
stop = min(start + batch_size, n_samples)
|
|
186
|
+
X_chunk = X[start:stop]
|
|
187
|
+
chunk_norm = x_norm[start:stop]
|
|
188
|
+
distances = chunk_norm + backend.reshape(x_norm, (1, n_samples)) - 2.0 * backend.matmul(X_chunk, X.T)
|
|
189
|
+
rows.append(backend.maximum(distances, 0.0) <= eps_sq)
|
|
190
|
+
return backend.concatenate(rows, axis=0) if len(rows) > 1 else rows[0]
|
|
191
|
+
|
|
192
|
+
def fit(self, X, y=None):
    """Cluster ``X`` and store the result on the estimator.

    With the numpy backend the work is delegated to ``_fit_numpy``. For
    any other backend the full boolean neighbour matrix is built on the
    device, the minimum core index is propagated along core-core edges
    until nothing changes, and the labels are finalised on the host.

    Parameters
    ----------
    X : dense 2-D data; sparse input is rejected by ``reject_sparse``.
    y : ignored by this method.

    Returns
    -------
    self, with ``labels_`` (-1 marks noise), ``core_sample_indices_``,
    ``components_`` and ``n_features_in_`` set.
    """
    reject_sparse(X, "DBSCAN")
    backend = self._get_backend()
    X_arr = backend.asarray(X, dtype=backend.float64)
    check_2d_array(X_arr)
    n_samples, n_features = X_arr.shape
    self._validate_params(n_samples)
    if backend.name == "numpy":
        return self._fit_numpy(X_arr)

    neighbors = self._neighbor_graph(backend, X_arr)
    counts = backend.sum(neighbors, axis=1)
    core_mask = counts >= int(self.min_samples)
    core_adj = neighbors & backend.expand_dims(core_mask, 0) & backend.expand_dims(core_mask, 1)

    # ``large`` is a sentinel above every valid sample index.
    large = int(n_samples)
    # The sentinel fill arrays never change, so build them once instead of
    # re-allocating an n x n int64 grid on every propagation step.
    # NOTE(review): assumes backend.where does not modify its operands.
    fill_vector = backend.full((n_samples,), large, dtype=backend.int64)
    fill_grid = backend.full((n_samples, n_samples), large, dtype=backend.int64)

    initial = backend.arange(n_samples, dtype=backend.int64)
    labels = backend.where(core_mask, initial, fill_vector)
    for _ in range(n_samples):
        # Each core point adopts the smallest label among its core neighbours.
        candidate_labels = backend.where(core_adj, backend.expand_dims(labels, 0), fill_grid)
        new_labels = backend.where(core_mask, backend.min(candidate_labels, axis=1), fill_vector)
        changed = scalar_to_int(backend.sum(new_labels != labels))
        labels = new_labels
        if changed == 0:
            break
    del fill_grid, candidate_labels

    labels_np = np.full(n_samples, -1, dtype=np.int64)
    core_np = backend.to_numpy(core_mask).astype(bool, copy=False)
    raw_core_labels = backend.to_numpy(labels).astype(np.int64, copy=False)

    # Rank the surviving raw labels to obtain consecutive cluster ids.
    # np.unique sorts them, so the inverse index is exactly that rank.
    _, dense_ids = np.unique(raw_core_labels[core_np], return_inverse=True)
    labels_np[core_np] = dense_ids

    neighbors_np = backend.to_numpy(neighbors)
    core_indices = np.flatnonzero(core_np).astype(np.int64)
    border_candidates = np.flatnonzero(~core_np)
    if core_indices.size and border_candidates.size:
        # Rows are non-core samples, columns are core samples. argmax of a
        # boolean row is its first True, i.e. the lowest-index core neighbour.
        reach = neighbors_np[np.ix_(border_candidates, core_indices)]
        has_core = reach.any(axis=1)
        first_core = core_indices[reach.argmax(axis=1)]
        labels_np[border_candidates[has_core]] = labels_np[first_core[has_core]]

    core_backend = backend.asarray(core_indices, dtype=backend.int64)
    self.labels_ = backend.asarray(labels_np, dtype=backend.int64)
    self.core_sample_indices_ = core_backend
    self.components_ = X_arr[core_backend] if core_indices.size else X_arr[:0]
    self.n_features_in_ = int(n_features)
    self._backend_name = backend.name
    self._fitted = True
    return self
|
|
246
|
+
|
|
247
|
+
def fit_predict(self, X, y=None):
    """Fit the estimator on ``X`` and return the resulting ``labels_``."""
    fitted = self.fit(X, y=y)
    return fitted.labels_
|
|
249
|
+
|
|
250
|
+
def predict(self, X):
    """Always raise: labels are only produced for the samples seen in ``fit``.

    Raises
    ------
    NotImplementedError
        On every call, whatever ``X`` is.
    """
    message = "DBSCAN does not support predict for unseen samples"
    raise NotImplementedError(message)
|
|
252
|
+
|
|
253
|
+
def get_params(self, deep=True):
    """Return the base-class parameters extended with the DBSCAN ones.

    The mapping returned by the parent ``get_params`` gains the keys
    ``eps``, ``min_samples``, ``metric`` and ``batch_size``.
    """
    params = super().get_params(deep=deep)
    own_names = ("eps", "min_samples", "metric", "batch_size")
    params.update({name: getattr(self, name) for name in own_names})
    return params
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True
|
|
2
|
+
"""Cython helpers for CPU DBSCAN."""
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
cimport numpy as cnp
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
ctypedef cnp.float64_t float64_t
|
|
9
|
+
ctypedef cnp.int64_t int64_t
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
cdef int64_t _find(int64_t[:] parent, int64_t x) noexcept nogil:
    # Return the representative of the set containing x. Afterwards every
    # node visited on the way points straight at that representative.
    cdef int64_t top = x
    cdef int64_t cursor = x
    cdef int64_t above

    # First pass: climb to the node that is its own parent.
    while parent[top] != top:
        top = parent[top]

    # Second pass: re-walk the same chain and re-parent each node.
    while cursor != top:
        above = parent[cursor]
        parent[cursor] = top
        cursor = above
    return top
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
cdef void _union(int64_t[:] parent, int64_t[:] rank, int64_t a, int64_t b) noexcept nogil:
    # Merge the sets containing a and b. The root of lower rank is hung
    # under the other; on a tie b's root goes under a's root, whose rank
    # then grows by one.
    cdef int64_t top_a = _find(parent, a)
    cdef int64_t top_b = _find(parent, b)

    if top_a == top_b:
        return
    if rank[top_a] < rank[top_b]:
        parent[top_a] = top_b
        return
    if rank[top_a] == rank[top_b]:
        rank[top_a] += 1
    parent[top_b] = top_a
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
cdef inline bint _within_eps(float64_t[:, ::1] X, Py_ssize_t i, Py_ssize_t j,
                             Py_ssize_t n_features, double eps_sq) noexcept nogil:
    # True when the squared Euclidean distance between rows i and j of X is
    # at most eps_sq. The sum is abandoned as soon as it exceeds eps_sq.
    cdef double dist = 0.0
    cdef double diff
    cdef Py_ssize_t f
    for f in range(n_features):
        diff = X[i, f] - X[j, f]
        dist += diff * diff
        if dist > eps_sq:
            return False
    # Final comparison kept so a NaN distance is reported as "not within".
    return dist <= eps_sq


def dbscan_dense_pairwise(float64_t[:, ::1] X, double eps, int64_t min_samples):
    """Exact dense Euclidean DBSCAN core for medium-size CPU inputs.

    This routine avoids materializing the pairwise distance matrix. It
    performs three pairwise scans: one to count eps-neighborhood sizes (each
    point counts itself), one to union core-core edges, and one to attach
    every non-core point to the first core point found within eps. The scans
    run with the GIL released.

    Parameters
    ----------
    X : C-contiguous float64 array of shape (n_samples, n_features).
    eps : neighborhood radius; compared as a squared distance.
    min_samples : neighborhood size at which a point is a core point.

    Returns
    -------
    labels : int64 array, cluster id per sample, -1 for noise. Ids are
        numbered in order of the first core point of each cluster.
    core_indices : int64 array with the indices of the core points.
    """
    cdef Py_ssize_t n_samples = X.shape[0]
    cdef Py_ssize_t n_features = X.shape[1]
    cdef double eps_sq = eps * eps
    cdef Py_ssize_t i, j
    cdef int64_t root
    cdef int64_t label

    # counts starts at one so that every point is its own neighbor.
    cdef cnp.ndarray[int64_t, ndim=1] counts_arr = np.ones(n_samples, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] parent_arr = np.arange(n_samples, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] rank_arr = np.zeros(n_samples, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] labels_arr = np.full(n_samples, -1, dtype=np.int64)
    cdef cnp.ndarray[int64_t, ndim=1] root_label_arr = np.full(n_samples, -1, dtype=np.int64)
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] core_arr = np.zeros(n_samples, dtype=np.uint8)
    cdef cnp.ndarray[int64_t, ndim=1] core_indices_arr

    cdef int64_t[:] counts = counts_arr
    cdef int64_t[:] parent = parent_arr
    cdef int64_t[:] rank = rank_arr
    cdef int64_t[:] labels = labels_arr
    cdef int64_t[:] root_label = root_label_arr
    cdef cnp.uint8_t[:] core = core_arr

    # Everything below touches only C scalars and typed memoryviews, so the
    # GIL is not needed while the O(n^2) scans run.
    with nogil:
        # Scan 1: neighborhood sizes over unordered pairs.
        for i in range(n_samples):
            for j in range(i + 1, n_samples):
                if _within_eps(X, i, j, n_features, eps_sq):
                    counts[i] += 1
                    counts[j] += 1

        for i in range(n_samples):
            if counts[i] >= min_samples:
                core[i] = 1

        # Scan 2: merge core points that lie within eps of each other.
        for i in range(n_samples):
            if not core[i]:
                continue
            for j in range(i + 1, n_samples):
                if not core[j]:
                    continue
                if _within_eps(X, i, j, n_features, eps_sq):
                    _union(parent, rank, i, j)

        # Give each union-find root a consecutive cluster id.
        label = 0
        for i in range(n_samples):
            if core[i]:
                root = _find(parent, i)
                if root_label[root] < 0:
                    root_label[root] = label
                    label += 1
                labels[i] = root_label[root]

        # Scan 3: a non-core point joins the first core point within eps.
        for i in range(n_samples):
            if core[i]:
                continue
            for j in range(n_samples):
                if not core[j]:
                    continue
                if _within_eps(X, i, j, n_features, eps_sq):
                    labels[i] = labels[j]
                    break

    core_indices_arr = np.flatnonzero(core_arr).astype(np.int64, copy=False)
    return labels_arr, core_indices_arr
|