optuclust 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optuclust/__init__.py +2 -0
- optuclust/optuclust.py +637 -0
- optuclust-0.0.2.dist-info/METADATA +187 -0
- optuclust-0.0.2.dist-info/RECORD +9 -0
- optuclust-0.0.2.dist-info/WHEEL +5 -0
- optuclust-0.0.2.dist-info/licenses/LICENSE +21 -0
- optuclust-0.0.2.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_optuclust.py +172 -0
optuclust/__init__.py
ADDED
optuclust/optuclust.py
ADDED
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import signal
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import optuna
|
|
7
|
+
from sklearn.base import BaseEstimator, ClusterMixin
|
|
8
|
+
from sklearn.cluster import (
|
|
9
|
+
DBSCAN,
|
|
10
|
+
OPTICS,
|
|
11
|
+
AffinityPropagation,
|
|
12
|
+
AgglomerativeClustering,
|
|
13
|
+
Birch,
|
|
14
|
+
KMeans,
|
|
15
|
+
MeanShift,
|
|
16
|
+
MiniBatchKMeans,
|
|
17
|
+
SpectralClustering,
|
|
18
|
+
)
|
|
19
|
+
from sklearn.metrics import (
|
|
20
|
+
calinski_harabasz_score,
|
|
21
|
+
davies_bouldin_score,
|
|
22
|
+
silhouette_score,
|
|
23
|
+
)
|
|
24
|
+
from sklearn.mixture import GaussianMixture
|
|
25
|
+
from sklearn.neighbors import KernelDensity
|
|
26
|
+
from sklearn.utils.validation import check_is_fitted
|
|
27
|
+
|
|
28
|
+
import hdbscan
|
|
29
|
+
from kmedoids import KMedoids
|
|
30
|
+
from sklearn_som.som import SOM
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger("optuclust")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Optimizer(BaseEstimator, ClusterMixin):
|
|
36
|
+
|
|
37
|
+
# Names accepted by the ``algorithm`` constructor argument. "sleep" is a fake
# algorithm that only exists to provoke trial timeouts in tests.
VALID_ALGORITHMS = [
    "kmeans", "minibatchkmeans", "dbscan", "meanshift",
    "agglomerativeclustering", "spectralclustering", "affinitypropagation",
    "birch", "optics", "gaussianmixture", "hdbscan", "kmedoids",
    "sleep", "som",
]

# Names accepted by the ``scoring`` constructor argument.
VALID_SCORING = ["silhouette_score", "calinski_harabasz_score", "davies_bouldin_score"]

# Algorithms for which ``predict()`` is allowed on new data.
ALGORITHMS_WITH_PREDICT = {
    "kmeans", "minibatchkmeans", "meanshift", "birch",
    "gaussianmixture", "kmedoids", "som", "sleep",
}

# Per-algorithm fallback hyperparameters.
# NOTE(review): nothing in the visible methods reads this table - confirm it is
# still needed before extending it.
SAFE_DEFAULTS = dict(
    kmeans=dict(n_clusters=3, max_iter=300, tol=1e-4, n_init=10),
    minibatchkmeans=dict(
        n_clusters=8, batch_size=100, max_iter=300, tol=1e-4, n_init=10
    ),
    dbscan=dict(eps=0.5, min_samples=5, metric="euclidean", p=2),
    meanshift=dict(bandwidth=2.5, bin_seeding=True),
    agglomerativeclustering=dict(n_clusters=3, linkage="ward"),
    spectralclustering=dict(n_clusters=3, n_neighbors=10, eigen_tol=1e-4),
    affinitypropagation=dict(damping=0.9, convergence_iter=15),
    birch=dict(n_clusters=3, threshold=0.5, branching_factor=50),
    optics=dict(min_samples=5, cluster_method="xi"),
    gaussianmixture=dict(n_components=3, covariance_type="full"),
    hdbscan=dict(
        min_cluster_size=5,
        min_samples=1,
        cluster_selection_epsilon=0.0,
        allow_single_cluster=False,
    ),
    kmedoids=dict(n_clusters=3, method="pam", metric="euclidean"),
    som=dict(m=10, n=10, dim=None),
)
|
|
113
|
+
|
|
114
|
+
def __init__(
    self,
    algorithm,
    n_trials=50,
    scoring="silhouette_score",
    verbose=False,
    show_progress_bar=True,
    timeout=None,
    trial_timeout=None,
    storage=None,
    logfile=None,
):
    """
    Validate and store the optimizer configuration.

    :param algorithm: Name of the clustering algorithm; must be in ``VALID_ALGORITHMS``.
    :param n_trials: Number of Optuna trials; must be a positive ``int``.
    :param scoring: Metric to optimize; must be in ``VALID_SCORING``.
    :param verbose: ``bool`` or ``int`` controlling Optuna's log level in ``fit``.
    :param show_progress_bar: Whether ``fit`` asks Optuna for a progress bar.
    :param timeout: Passed to ``study.optimize`` as the overall time budget.
    :param trial_timeout: Per-trial time limit enforced with ``SIGALRM`` in ``fit``.
    :param storage: Optuna storage; ``fit`` falls back to in-memory storage when ``None``.
    :param logfile: Stored as given; not read by the methods visible in this module.
    :raises ValueError: If ``algorithm``, ``scoring`` or ``n_trials`` is invalid.
    """
    # Checks run in the order algorithm -> scoring -> n_trials, so the first
    # offending argument determines the error message.
    known_algorithm = algorithm in self.VALID_ALGORITHMS
    if not known_algorithm:
        raise ValueError(f"Algorithm must be one of {self.VALID_ALGORITHMS}")

    known_scoring = scoring in self.VALID_SCORING
    if not known_scoring:
        raise ValueError(f"Scoring must be one of {self.VALID_SCORING}")

    positive_int = isinstance(n_trials, int) and n_trials > 0
    if not positive_int:
        raise ValueError("n_trials must be a positive integer")

    # Search definition.
    self.algorithm = algorithm
    self.scoring = scoring
    self.n_trials = n_trials
    # Reporting.
    self.verbose = verbose
    self.show_progress_bar = show_progress_bar
    self.logfile = logfile
    # Time budgets and persistence.
    self.timeout = timeout
    self.trial_timeout = trial_timeout
    self.storage = storage
|
|
142
|
+
|
|
143
|
+
def fit(self, X, y=None):
    """
    Search hyperparameters for ``self.algorithm`` with Optuna, then refit the best model.

    Each trial builds a model with ``_suggest_model``, fits it on ``X`` and scores the
    resulting labels with ``_compute_score``. After the search, the best parameters are
    replayed through ``_get_best_model``, the model is fitted once more, and the
    cluster descriptors (centroids, medoids, modes) are computed.

    :param X: Data to cluster. It is indexed with boolean masks further down, so a
        NumPy array is the safest input.
    :param y: Not used; accepted for scikit-learn API compatibility.
    :return: ``self``. When no trial completes, ``best_params_``, ``model_``,
        ``labels_``, ``centroids_``, ``medoids_`` and ``modes_`` are all ``None``.
    :raises ValueError: Logged and re-raised when the search or the final refit fails.
    """
    # NOTE: optuna.logging.set_verbosity changes Optuna's logger level for the
    # whole process, not just for this estimator.
    if isinstance(self.verbose, bool):
        optuna.logging.set_verbosity(
            optuna.logging.INFO if self.verbose else optuna.logging.WARNING
        )
        # Per-trial INFO logs and a progress bar would garble each other.
        show_progress_bar = self.show_progress_bar if not self.verbose else False
    else:
        if isinstance(self.verbose, int):
            optuna.logging.set_verbosity(self.verbose)
        show_progress_bar = self.show_progress_bar

    # Resolve storage: default to a throwaway in-memory study.
    storage = self.storage
    if storage is None:
        storage = optuna.storages.InMemoryStorage()

    study_name = f"study_{self.algorithm}_{self.scoring}"
    logger.info("Storage: %s, internal study name: %s", storage, study_name)

    # Davies-Bouldin is "lower is better"; the other metrics are maximized.
    direction = "minimize" if self.scoring == "davies_bouldin_score" else "maximize"
    # Value reported for timed-out trials. It must be the worst possible value for
    # the chosen direction, otherwise a pruned trial looks like the best one.
    worst_value = float("inf") if direction == "minimize" else float("-inf")

    # Per-trial timeouts rely on SIGALRM / setitimer, which only exist on Unix.
    use_alarm = bool(self.trial_timeout)
    if use_alarm and not (
        hasattr(signal, "SIGALRM") and hasattr(signal, "setitimer")
    ):
        logger.warning(
            "trial_timeout needs signal.SIGALRM, which this platform lacks; "
            "trials will run without a per-trial time limit."
        )
        use_alarm = False

    def timeout_handler(signum, frame):
        raise TimeoutError("Objective function timed out")

    def objective(trial):
        previous_handler = None
        try:
            if use_alarm:
                previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
                # setitimer accepts fractional seconds; alarm(int(t)) would turn
                # a sub-second timeout into alarm(0), i.e. no timeout at all.
                signal.setitimer(signal.ITIMER_REAL, float(self.trial_timeout))

            model = self._suggest_model(trial, X)
            model.fit(X)
            labels = (
                model.labels_ if hasattr(model, "labels_") else model.predict(X)
            )
            return self._compute_score(X, labels)
        except TimeoutError:
            trial.report(worst_value, step=0)
            raise optuna.TrialPruned("Trial pruned due to timeout")
        finally:
            if use_alarm:
                # Cancel any pending timer, then hand SIGALRM back to whoever
                # owned it before this trial.
                signal.setitimer(signal.ITIMER_REAL, 0)
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)

    self.study_ = optuna.create_study(
        direction=direction,
        study_name=study_name,
        storage=storage,
        load_if_exists=True,
    )

    try:
        n_existing = len(self.study_.trials)
        if n_existing > 0:
            logger.info(
                "Resuming optimization from storage, starting from trial %d.",
                n_existing,
            )
        else:
            logger.info("Starting a new optimization.")

        self.study_.optimize(
            objective,
            n_trials=self.n_trials,
            show_progress_bar=show_progress_bar,
            timeout=self.timeout,
        )

        # Check trial states explicitly instead of matching the wording of the
        # exception that ``study.best_params`` raises on an empty study.
        has_completed = any(
            t.state == optuna.trial.TrialState.COMPLETE for t in self.study_.trials
        )
        if not has_completed:
            logger.warning(
                "All trials were pruned. No valid results were obtained."
            )
            self.best_params_ = None
            self.model_ = None
            self.labels_ = None
            self.centroids_ = None
            self.medoids_ = None
            self.modes_ = None
            return self

        self.best_params_ = self.study_.best_params
        logger.info(
            "Optimization completed. Best parameters: %s", self.best_params_
        )

        # Refit on the full data with the winning configuration.
        self.model_ = self._get_best_model(X)
        self.model_.fit(X)

        self.labels_ = (
            self.model_.labels_
            if hasattr(self.model_, "labels_")
            else self.model_.predict(X)
        )
        logger.info(
            "Final model fitted. Number of clusters: %d",
            len(set(self.labels_)),
        )

        # Eagerly compute cluster descriptors.
        self.centroids_ = self._compute_centroids(X)
        self.medoids_ = self._compute_medoids(X)
        self.modes_ = self._compute_modes(X)

    except ValueError as e:
        logger.error("Error during optimization: %s", str(e))
        raise

    return self
|
|
254
|
+
|
|
255
|
+
def fit_predict(self, X, y=None):
    """
    Fit on ``X`` and return the labels found for its samples.

    :param X: Data to cluster; forwarded to ``fit``.
    :param y: Forwarded to ``fit`` unchanged.
    :return: ``self.labels_`` as set by ``fit``.
    """
    self.fit(X, y)
    labels = self.labels_
    return labels
|
|
258
|
+
|
|
259
|
+
def predict(self, X):
    """
    Assign new samples to clusters with the fitted best model.

    :param X: Samples to label; forwarded to the underlying model's ``predict``.
    :return: Whatever ``self.model_.predict(X)`` returns.
    :raises ValueError: If ``fit`` produced no model.
    :raises TypeError: If the algorithm is not listed in ``ALGORITHMS_WITH_PREDICT``.
    """
    check_is_fitted(self)

    model = self.model_
    if model is None:
        raise ValueError(
            "No valid model available. Ensure that trials completed successfully."
        )

    inductive = self.algorithm in self.ALGORITHMS_WITH_PREDICT
    if not inductive:
        message = (
            f"Algorithm '{self.algorithm}' does not support predict(). "
            f"Algorithms with predict: {sorted(self.ALGORITHMS_WITH_PREDICT)}"
        )
        raise TypeError(message)

    return model.predict(X)
|
|
271
|
+
|
|
272
|
+
def _compute_score(self, X, labels):
    """
    Score a clustering with the configured metric, ignoring noise points.

    :param X: Data that was clustered; must support boolean-mask row selection.
    :param labels: Cluster label per sample; ``-1`` marks noise.
    :return: The value of ``self.scoring`` on the non-noise samples.
    :raises optuna.TrialPruned: If the non-noise samples form fewer than two
        clusters, or as many clusters as there are samples.
    :raises ValueError: If ``self.scoring`` is not a supported metric name.
    """
    # A plain list would make ``labels != -1`` a scalar, so normalise first.
    labels = np.asarray(labels)

    # Filter out noise points for all metrics.
    mask = labels != -1
    non_noise_labels = labels[mask]
    n_clusters = len(set(non_noise_labels))

    if n_clusters <= 1:
        raise optuna.TrialPruned(
            "Only one cluster found (excluding noise), pruning this trial."
        )
    # All three metrics require n_clusters <= n_samples - 1 and raise ValueError
    # otherwise. That error would abort the whole study, so prune instead.
    if n_clusters >= non_noise_labels.size:
        raise optuna.TrialPruned(
            "Every non-noise sample forms its own cluster, pruning this trial."
        )

    X_filtered = X[mask]

    if self.scoring == "silhouette_score":
        score = silhouette_score(X_filtered, non_noise_labels)
    elif self.scoring == "calinski_harabasz_score":
        score = calinski_harabasz_score(X_filtered, non_noise_labels)
    elif self.scoring == "davies_bouldin_score":
        score = davies_bouldin_score(X_filtered, non_noise_labels)
    else:
        raise ValueError(f"Unsupported scoring method: {self.scoring}")
    return score
|
|
293
|
+
|
|
294
|
+
def _suggest_model(self, trial, X):
    """
    Build an unfitted estimator for ``self.algorithm`` from values suggested by ``trial``.

    The same method serves the search (a live Optuna trial) and the final refit
    (a ``FixedTrial`` replaying the best parameters), so parameter names and
    ranges here define the whole search space.

    :param trial: Object exposing Optuna's ``suggest_*`` API.
    :param X: Data being clustered; used for the sample count and, for SOM, the
        number of features.
    :return: A new, unfitted estimator.
    :raises ValueError: If ``self.algorithm`` is not handled.
    """
    # np.shape copes with arrays, nested lists and DataFrames alike.
    n_samples = np.shape(X)[0]
    # The scoring metrics accept at most n_samples - 1 clusters and most
    # estimators reject more clusters than samples, so shrink the range on small
    # data. For n_samples > 50 this is the original upper bound of 50.
    max_clusters = max(2, min(50, n_samples - 1))

    if self.algorithm == "kmeans":
        n_clusters = trial.suggest_int("n_clusters", 2, max_clusters)
        max_iter = trial.suggest_int("max_iter", 100, 500)
        tol = trial.suggest_float("tol", 1e-6, 1e-2)
        return KMeans(
            n_clusters=n_clusters, max_iter=max_iter, tol=tol, n_init="auto"
        )

    if self.algorithm == "minibatchkmeans":
        n_clusters = trial.suggest_int("n_clusters", 2, max_clusters)
        batch_size = trial.suggest_int("batch_size", 10, 200)
        max_iter = trial.suggest_int("max_iter", 100, 500)
        tol = trial.suggest_float("tol", 1e-6, 1e-2)
        return MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            max_iter=max_iter,
            tol=tol,
            n_init="auto",
        )

    if self.algorithm == "dbscan":
        eps = trial.suggest_float("eps", 0.1, 10.0)
        min_samples = trial.suggest_int("min_samples", 2, 10)
        metric = trial.suggest_categorical(
            "metric", ["euclidean", "manhattan", "chebyshev", "minkowski"]
        )
        if metric == "minkowski":
            # ``p`` is a conditional parameter: only sampled for Minkowski.
            p = trial.suggest_int("p", 1, 5)
            return DBSCAN(eps=eps, min_samples=min_samples, metric=metric, p=p)
        return DBSCAN(eps=eps, min_samples=min_samples, metric=metric)

    if self.algorithm == "meanshift":
        bandwidth = trial.suggest_float("bandwidth", 0.1, 10.0)
        bin_seeding = trial.suggest_categorical("bin_seeding", [True, False])
        return MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding)

    if self.algorithm == "agglomerativeclustering":
        n_clusters = trial.suggest_int("n_clusters", 2, max_clusters)
        linkage = trial.suggest_categorical(
            "linkage", ["ward", "complete", "average", "single"]
        )
        return AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)

    if self.algorithm == "spectralclustering":
        n_clusters = trial.suggest_int("n_clusters", 2, max_clusters)
        # NOTE(review): with the default affinity, scikit-learn documents
        # ``n_neighbors`` as ignored - confirm whether
        # affinity="nearest_neighbors" was intended.
        n_neighbors = trial.suggest_int("n_neighbors", 2, 20)
        eigen_tol = trial.suggest_float("eigen_tol", 1e-6, 1e-2)
        return SpectralClustering(
            n_clusters=n_clusters, n_neighbors=n_neighbors, eigen_tol=eigen_tol
        )

    if self.algorithm == "affinitypropagation":
        damping = trial.suggest_float("damping", 0.5, 0.99)
        convergence_iter = trial.suggest_int("convergence_iter", 10, 200)
        return AffinityPropagation(
            damping=damping, convergence_iter=convergence_iter
        )

    if self.algorithm == "birch":
        n_clusters = trial.suggest_int("n_clusters", 2, max_clusters)
        threshold = trial.suggest_float("threshold", 0.1, 1.0)
        branching_factor = trial.suggest_int("branching_factor", 20, 100)
        return Birch(
            n_clusters=n_clusters,
            threshold=threshold,
            branching_factor=branching_factor,
        )

    if self.algorithm == "optics":
        min_samples = trial.suggest_int("min_samples", 2, 10)
        cluster_method = trial.suggest_categorical(
            "cluster_method", ["xi", "dbscan"]
        )
        return OPTICS(
            max_eps=np.inf,
            min_samples=min_samples,
            cluster_method=cluster_method,
        )

    if self.algorithm == "gaussianmixture":
        n_components = trial.suggest_int(
            "n_components", 2, min(10, max_clusters)
        )
        covariance_type = trial.suggest_categorical(
            "covariance_type", ["full", "tied", "diag", "spherical"]
        )
        return GaussianMixture(
            n_components=n_components, covariance_type=covariance_type
        )

    if self.algorithm == "hdbscan":
        min_cluster_size = trial.suggest_int("min_cluster_size", 2, 50)
        min_samples = trial.suggest_int("min_samples", 1, 10)
        cluster_selection_epsilon = trial.suggest_float(
            "cluster_selection_epsilon", 0, 1
        )
        allow_single_cluster = trial.suggest_categorical(
            "allow_single_cluster", [True, False]
        )
        return hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            cluster_selection_epsilon=cluster_selection_epsilon,
            allow_single_cluster=allow_single_cluster,
        )

    if self.algorithm == "kmedoids":
        n_clusters = trial.suggest_int("n_clusters", 2, max_clusters)
        method = trial.suggest_categorical(
            "method",
            [
                "fasterpam",
                "pam",
                "alternate",
                "fastermsc",
                "fastmsc",
                "pamsil",
                "pammedsil",
            ],
        )
        return KMedoids(n_clusters=n_clusters, method=method, metric="euclidean")

    if self.algorithm == "sleep":
        # Fake algorithm to induce timeout for testing.
        time.sleep(3)
        return KMeans(n_clusters=3, n_init="auto")

    if self.algorithm == "som":
        m = trial.suggest_int("m", 2, 20)
        n = trial.suggest_int("n", 2, 20)
        return SOM(m=m, n=n, dim=X.shape[1])

    raise ValueError(f"Unsupported algorithm: {self.algorithm}")
|
|
430
|
+
|
|
431
|
+
def _get_best_model(self, X):
    """
    Rebuild an unfitted estimator from ``self.best_params_``.

    The best parameters are replayed through ``_suggest_model`` with an Optuna
    ``FixedTrial``, so the estimator is constructed exactly as during the search.

    :param X: Data being clustered; forwarded to ``_suggest_model``.
    :return: A new, unfitted estimator configured with the best parameters.
    """
    replay = {**self.best_params_}
    # ``p`` only applies to DBSCAN with the Minkowski metric; drop a stray value
    # for any other metric.
    p_is_irrelevant = (
        self.algorithm == "dbscan" and replay.get("metric") != "minkowski"
    )
    if p_is_irrelevant:
        replay.pop("p", None)
    return self._suggest_model(optuna.trial.FixedTrial(replay), X)
|
|
438
|
+
|
|
439
|
+
def _compute_centroids(self, X):
    """
    Compute the arithmetic-mean centroid of each cluster.

    Noise points (label ``-1``) are ignored. ``X`` and ``self.labels_`` may be any
    array-like (nested lists included); they are converted with ``np.asarray``.

    :param X: Data that was clustered, one row per sample.
    :return: Array with one centroid per cluster, ordered by ascending label, or
        ``None`` when ``self.labels_`` is ``None`` or holds only noise.
    """
    if self.labels_ is None:
        return None

    # Boolean-mask row selection needs real arrays on both sides.
    points = np.asarray(X)
    labels = np.asarray(self.labels_)

    centroids = [
        points[labels == label].mean(axis=0)
        for label in np.unique(labels)
        if label != -1
    ]
    if not centroids:
        return None
    return np.array(centroids)
|
|
453
|
+
|
|
454
|
+
def _compute_medoids(self, X):
    """
    Compute the medoid of each cluster.

    The medoid is the cluster member with the minimum total squared Euclidean
    distance to all members of its cluster. Noise points (label ``-1``) are
    ignored. ``X`` and ``self.labels_`` may be any array-like.

    :param X: Data that was clustered, one row per sample (2-D).
    :return: Array with one medoid per cluster, ordered by ascending label, or
        ``None`` when ``self.labels_`` is ``None`` or holds only noise.
    """
    if self.labels_ is None:
        return None

    points = np.asarray(X)
    labels = np.asarray(self.labels_)

    medoids = []
    for label in np.unique(labels):
        if label == -1:
            continue
        # Never empty: every label from np.unique occurs at least once.
        members = points[labels == label]

        # Accumulate squared pairwise distances one feature at a time. This
        # needs an (n, n) buffer instead of the (n, n, d) temporary that full
        # broadcasting would allocate. Each term is exactly symmetric, so ties
        # resolve to the first member, as before.
        n_members = members.shape[0]
        sq_dist = np.zeros((n_members, n_members))
        for column in members.T:
            diff = column[:, np.newaxis] - column[np.newaxis, :]
            sq_dist += diff * diff

        medoids.append(members[np.argmin(sq_dist.sum(axis=1))])

    if not medoids:
        return None
    return np.array(medoids)
|
|
476
|
+
|
|
477
|
+
def _compute_modes(self, X):
    """
    Compute the mode of each cluster with a Gaussian kernel density estimate.

    A ``KernelDensity`` model (Gaussian kernel, ``"scott"`` bandwidth) is fitted
    per cluster and scored at the cluster's own points; the best-scoring point is
    the mode. Noise points (label ``-1``) are ignored.

    :param X: Data that was clustered; must support boolean-mask row selection.
    :return: Array with one mode per cluster, ordered by ascending label, or
        ``None`` when ``self.labels_`` is ``None`` or no cluster yields a mode.
    """
    if self.labels_ is None:
        return None

    picked = []
    for label in np.unique(self.labels_):
        if label == -1:
            continue
        members = X[self.labels_ == label]
        if len(members) == 0:
            continue
        estimator = KernelDensity(kernel="gaussian", bandwidth="scott")
        estimator = estimator.fit(members)
        # Score the actual members rather than a grid, whose size would grow
        # exponentially with the number of dimensions.
        member_scores = estimator.score_samples(members)
        picked.append(members[np.argmax(member_scores)])

    return np.array(picked) if len(picked) != 0 else None
|
|
499
|
+
|
|
500
|
+
@property
def cluster_centers_(self):
    """
    Native cluster centers of the fitted model, if it exposes any.

    :return: ``self.model_.cluster_centers_``, or ``None`` when there is no model
        or the model has no such attribute.
    """
    check_is_fitted(self)
    if self.model_ is None or not hasattr(self.model_, "cluster_centers_"):
        return None
    return self.model_.cluster_centers_
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
class ClustGridSearch(BaseEstimator, ClusterMixin):
|
|
509
|
+
|
|
510
|
+
def __init__(
    self,
    mode="full",
    n_trials=20,
    scoring="silhouette_score",
    verbose=False,
    show_progress_bar=True,
):
    """
    Store the search configuration and pick the algorithms to compare.

    :param mode: 'full' to test all algorithms, 'fast' to test a subset (kmeans and hdbscan).
    :param n_trials: Number of trials for each algorithm's hyperparameter optimization.
    :param scoring: The metric used to select the best clustering (default: 'silhouette_score').
    :param verbose: Forwarded to each per-algorithm ``Optimizer``.
    :param show_progress_bar: Forwarded to each per-algorithm ``Optimizer``.
    :raises ValueError: If ``mode`` is neither 'full' nor 'fast'.
    """
    self.mode = mode
    self.n_trials = n_trials
    self.scoring = scoring
    self.verbose = verbose
    self.show_progress_bar = show_progress_bar

    # Each instance gets its own list, so callers may edit ``algorithms`` safely.
    if mode == "full":
        self.algorithms = [
            "kmeans",
            "kmedoids",
            "minibatchkmeans",
            "dbscan",
            "agglomerativeclustering",
            "meanshift",
            "spectralclustering",
            "affinitypropagation",
            "birch",
            "optics",
            "gaussianmixture",
            "hdbscan",
        ]
        return
    if mode == "fast":
        self.algorithms = ["kmeans", "hdbscan"]
        return
    raise ValueError("Invalid mode. Use 'full' or 'fast'.")
|
|
551
|
+
|
|
552
|
+
def fit(self, X, y=None):
    """
    Optimize every selected algorithm on ``X`` and keep the best-scoring one.

    An ``Optimizer`` is run per algorithm. Algorithms whose optimization raises
    are logged and skipped. The winner is the highest score, or the lowest score
    for ``davies_bouldin_score``.

    :param X: Input data for clustering.
    :param y: Not used; accepted for scikit-learn API compatibility.
    :return: ``self``, with ``cv_results_``, ``best_index_``, ``best_score_``,
        ``best_params_`` and ``best_estimator_`` set.
    :raises ValueError: If no algorithm produced a result.
    """
    names, scores, param_sets, optimizers = [], [], [], []

    for algorithm in self.algorithms:
        logger.info("Testing algorithm: %s", algorithm)

        optimizer = Optimizer(
            algorithm=algorithm,
            n_trials=self.n_trials,
            scoring=self.scoring,
            verbose=self.verbose,
            show_progress_bar=self.show_progress_bar,
        )

        try:
            optimizer.fit(X)
            # Raises when the study has no completed trial, which drops the
            # algorithm from the comparison.
            score = optimizer.study_.best_value
        except Exception as e:
            logger.error("Error for algorithm %s: %s", algorithm, e)
            continue

        names.append(algorithm)
        scores.append(score)
        param_sets.append(optimizer.best_params_)
        optimizers.append(optimizer)

    if len(names) == 0:
        raise ValueError("No algorithms produced valid results.")

    self.cv_results_ = {
        "algorithm": names,
        "mean_test_score": scores,
        "params": param_sets,
        "model": optimizers,
    }

    # Davies-Bouldin is "lower is better"; the other metrics are maximized.
    select = np.argmin if self.scoring == "davies_bouldin_score" else np.argmax
    best_idx = select(scores)

    self.best_index_ = best_idx
    self.best_score_ = scores[best_idx]
    self.best_params_ = param_sets[best_idx]
    self.best_estimator_ = optimizers[best_idx]
    return self
|
|
605
|
+
|
|
606
|
+
def predict(self, X):
    """
    Label new samples with the best estimator found by ``fit``.

    :param X: Samples to label; forwarded to ``best_estimator_.predict``.
    :return: Whatever the best estimator's ``predict`` returns.
    """
    check_is_fitted(self)
    winner = self.best_estimator_
    return winner.predict(X)
|
|
609
|
+
|
|
610
|
+
def fit_predict(self, X, y=None):
    """
    Run the search on ``X`` and return the best estimator's labels.

    :param X: Input data for clustering; forwarded to ``fit``.
    :param y: Forwarded to ``fit`` unchanged.
    :return: ``labels_`` of ``best_estimator_``.
    """
    self.fit(X, y)
    winner = self.best_estimator_
    return winner.labels_
|
|
613
|
+
|
|
614
|
+
@property
def labels_(self):
    """Cluster labels of the best estimator (requires a prior ``fit``)."""
    check_is_fitted(self)
    winner = self.best_estimator_
    return winner.labels_
|
|
618
|
+
|
|
619
|
+
@property
def cluster_centers_(self):
    """``cluster_centers_`` of the best estimator (requires a prior ``fit``)."""
    check_is_fitted(self)
    winner = self.best_estimator_
    return winner.cluster_centers_
|
|
623
|
+
|
|
624
|
+
@property
def centroids_(self):
    """``centroids_`` of the best estimator (requires a prior ``fit``)."""
    check_is_fitted(self)
    winner = self.best_estimator_
    return winner.centroids_
|
|
628
|
+
|
|
629
|
+
@property
def medoids_(self):
    """``medoids_`` of the best estimator (requires a prior ``fit``)."""
    check_is_fitted(self)
    winner = self.best_estimator_
    return winner.medoids_
|
|
633
|
+
|
|
634
|
+
@property
def modes_(self):
    """``modes_`` of the best estimator (requires a prior ``fit``)."""
    check_is_fitted(self)
    winner = self.best_estimator_
    return winner.modes_
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: optuclust
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Hyperparameter optimization for multiple clustering algorithms using Optuna, with Scikit-learn API
|
|
5
|
+
Home-page: https://github.com/filipsPL/optuclust
|
|
6
|
+
Author: Filip S.
|
|
7
|
+
Author-email: filip.ursynow@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy>=1.21.0
|
|
15
|
+
Requires-Dist: scikit-learn>=1.1
|
|
16
|
+
Requires-Dist: hdbscan>=0.8.29
|
|
17
|
+
Requires-Dist: optuna>=3.0
|
|
18
|
+
Requires-Dist: kmedoids>=0.3.0
|
|
19
|
+
Requires-Dist: matplotlib>=3.4
|
|
20
|
+
Requires-Dist: pandas>=1.3
|
|
21
|
+
Requires-Dist: sklearn-som
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
# optuclust
|
|
35
|
+
|
|
36
|
+
**optuclust** is a Python module for optimizing clustering algorithms using the [Optuna](https://optuna.org/) framework. It provides a scikit-learn compatible API with support for a variety of clustering methods and offers additional capabilities such as the calculation of centroids, medoids, and modes for clusters.
|
|
37
|
+
|
|
38
|
+
[Python package workflow](https://github.com/filipsPL/optuclust/actions/workflows/python-package.yml) · [DOI: 10.5281/zenodo.18608559](https://doi.org/10.5281/zenodo.18608559)
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- **Parameter Optimization:** Optimize clustering parameters for various algorithms using **Optuna**.
|
|
43
|
+
- **Supported Clustering Methods:**
|
|
44
|
+
- Algorithms from scikit-learn, such as KMeans, DBSCAN, and Agglomerative Clustering.
|
|
45
|
+
- Advanced methods like HDBSCAN, Self-Organizing Maps (SOM), and kMedoids.
|
|
46
|
+
- **Metrics and Scoring:**
|
|
47
|
+
- `silhouette_score`
|
|
48
|
+
- `calinski_harabasz_score`
|
|
49
|
+
- `davies_bouldin_score` (automatically minimized)
|
|
50
|
+
- Noise points (label=-1) are filtered out before score computation for density-based algorithms.
|
|
51
|
+
- **Clustering Insights:** Provides centroids (arithmetic mean), medoids (the cluster member with the minimum total squared Euclidean distance to the other members), and modes (KDE with Scott's bandwidth) for clusters, even if the algorithm does not natively support these features. All descriptors are computed eagerly during `fit()` and work in any number of dimensions.
|
|
52
|
+
- **Scikit-learn Compatible:** Inherits from `BaseEstimator` and `ClusterMixin`. Works with `clone()`, `check_is_fitted()`, and scikit-learn pipelines.
|
|
53
|
+
- **ClustGridSearch Class:** A utility to test all clustering algorithms and identify the best one.
|
|
54
|
+
- **Timeout Management:** Separate timeouts for optimization runs (`timeout`) and individual trials (`trial_timeout`).
|
|
55
|
+
- **Storage and Resume:** Store optimization results in a SQLite database for future analysis, and resume the optimization process later.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
1. Clone this repository:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone git@github.com:filipsPL/optuclust.git
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
2. Navigate to the cloned directory and install the required dependencies:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
cd optuclust
|
|
69
|
+
pip install -r requirements.txt
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
3. Install **optuclust**:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
python setup.py install
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
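**Requires:** Python >= 3.8, scikit-learn >= 1.2 (the code uses `n_init="auto"` and `KernelDensity(bandwidth="scott")`, both introduced in scikit-learn 1.2)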
|
|
79
|
+
|
|
80
|
+
## Usage
|
|
81
|
+
|
|
82
|
+
### 1. Optimizing a Clustering Algorithm
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from optuclust import Optimizer
|
|
86
|
+
from sklearn.datasets import make_blobs
|
|
87
|
+
|
|
88
|
+
# Generate synthetic data
|
|
89
|
+
X, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=42)
|
|
90
|
+
|
|
91
|
+
# Instantiate and fit the optimizer for KMeans
|
|
92
|
+
optimizer = Optimizer(algorithm="kmeans", n_trials=50, scoring="silhouette_score", verbose=True)
|
|
93
|
+
optimizer.fit(X)
|
|
94
|
+
|
|
95
|
+
# Access cluster details
|
|
96
|
+
print("Cluster Labels:", optimizer.labels_)
|
|
97
|
+
print("Centroids:", optimizer.centroids_)
|
|
98
|
+
print("Medoids:", optimizer.medoids_)
|
|
99
|
+
print("Modes:", optimizer.modes_)
|
|
100
|
+
print("Cluster Centers (native):", optimizer.cluster_centers_)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### 2. ClustGridSearch
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from optuclust import ClustGridSearch
|
|
107
|
+
from sklearn.datasets import make_blobs
|
|
108
|
+
|
|
109
|
+
# Generate synthetic data
|
|
110
|
+
X, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=42)
|
|
111
|
+
|
|
112
|
+
# Initialize ClustGridSearch to test all algorithms
|
|
113
|
+
grid_search = ClustGridSearch(mode="full", scoring="silhouette_score", verbose=True)
|
|
114
|
+
|
|
115
|
+
# Fit and get the best method
|
|
116
|
+
grid_search.fit(X)
|
|
117
|
+
print("Best Algorithm:", grid_search.best_estimator_.algorithm)
|
|
118
|
+
print("Best Score:", grid_search.best_score_)
|
|
119
|
+
print("Best Parameters:", grid_search.best_params_)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 3. Benchmark Example
|
|
123
|
+
|
|
124
|
+
To benchmark different clustering algorithms, you can use the provided example script:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
python example-loop.py
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The benchmark will evaluate different clustering methods on various datasets and save the performance metrics and plots.
|
|
131
|
+
|
|
132
|
+
## Supported Algorithms
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
algorithms = [
|
|
136
|
+
'kmeans', 'kmedoids', 'minibatchkmeans', 'dbscan', 'agglomerativeclustering',
|
|
137
|
+
'meanshift', 'spectralclustering', 'gaussianmixture', 'hdbscan',
|
|
138
|
+
'affinitypropagation', 'birch', 'optics', 'som'
|
|
139
|
+
]
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Note:** Not all algorithms support `predict()` on new data. Algorithms with inductive prediction: `kmeans`, `minibatchkmeans`, `meanshift`, `birch`, `gaussianmixture`, `kmedoids`, `som`. Calling `predict()` on other algorithms (e.g. `dbscan`, `hdbscan`) will raise a `TypeError`.
|
|
143
|
+
|
|
144
|
+
## Parameters
|
|
145
|
+
|
|
146
|
+
### Optimizer Class
|
|
147
|
+
|
|
148
|
+
- **algorithm:** The clustering algorithm to optimize. Options include those listed in Supported Algorithms.
|
|
149
|
+
- **n_trials:** Number of Optuna trials for optimization. Default is 50.
|
|
150
|
+
- **scoring:** The metric to optimize. Options are `silhouette_score`, `calinski_harabasz_score`, and `davies_bouldin_score`.
|
|
151
|
+
- **verbose:** Enable additional logging if set to `True`. Can also be an `int` to set Optuna's verbosity level directly.
|
|
152
|
+
- **show_progress_bar:** Display a progress bar during optimization. Default is `True`.
|
|
153
|
+
- **timeout:** Maximum duration (in seconds) for all trials in the optimization process.
|
|
154
|
+
- **trial_timeout:** Maximum duration (in seconds) for each individual trial (Unix only, uses `SIGALRM`).
|
|
155
|
+
- **storage:** Optuna storage URI, e.g. `sqlite:///optimization.db`. When provided, enables resuming a previous optimization run.
|
|
156
|
+
- **logfile:** Reserved for future use.
|
|
157
|
+
|
|
158
|
+
### Fitted Attributes
|
|
159
|
+
|
|
160
|
+
After calling `fit(X)`:
|
|
161
|
+
|
|
162
|
+
- **labels\_:** Cluster labels for each sample.
|
|
163
|
+
- **best\_params\_:** Dictionary of the best hyperparameters found.
|
|
164
|
+
- **model\_:** The fitted clustering model with the best parameters.
|
|
165
|
+
- **study\_:** The Optuna `Study` object with full trial history.
|
|
166
|
+
- **centroids\_:** Arithmetic mean of each cluster (excludes noise points).
|
|
167
|
+
- **medoids\_:** Most central data point in each cluster (the member with the smallest total squared Euclidean distance to the other members; excludes noise points).
|
|
168
|
+
- **modes\_:** Highest density point in each cluster (KDE with Scott's rule bandwidth).
|
|
169
|
+
- **cluster\_centers\_:** Native cluster centers from the model (if available), otherwise `None`.
|
|
170
|
+
|
|
171
|
+
### ClustGridSearch Class
|
|
172
|
+
|
|
173
|
+
- **mode:**
|
|
174
|
+
- `full`: Test all algorithms.
|
|
175
|
+
- `fast`: Test a subset of algorithms (`kmeans` and `hdbscan`).
|
|
176
|
+
- **n_trials:** Number of Optuna trials for each algorithm. Default is 20.
|
|
177
|
+
- **scoring:** Metric to select the best clustering algorithm. Options are `silhouette_score`, `calinski_harabasz_score`, and `davies_bouldin_score`.
|
|
178
|
+
- **verbose:** Enable detailed logging if set to `True`.
|
|
179
|
+
- **show_progress_bar:** Display a progress bar for each algorithm.
|
|
180
|
+
|
|
181
|
+
## Running Tests
|
|
182
|
+
|
|
183
|
+
We use **pytest** for testing. To run tests, simply run:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pytest -v
|
|
187
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
optuclust/__init__.py,sha256=1bwbNEMdHyRItYBEHlSyYyfre4-RnwnG9wLngLa3Eu4,71
|
|
2
|
+
optuclust/optuclust.py,sha256=nwdsr1av5bc3bsNb5l3iusYIiZCfsqurnaxl7beGP4g,21922
|
|
3
|
+
optuclust-0.0.2.dist-info/licenses/LICENSE,sha256=F7Zfn3KEW4YveQcP-f7E04EIEmbVXXrvwiS2W-1IPVI,1063
|
|
4
|
+
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
tests/test_optuclust.py,sha256=u0nIGgYPO6NaNaU90wvjwFJd1-3xr5nXdqRTnsDUYVU,5606
|
|
6
|
+
optuclust-0.0.2.dist-info/METADATA,sha256=zMnA0Na9xqCIpd2X-X9PgNCNmQFCU0oLlT9dtrcTZvQ,7478
|
|
7
|
+
optuclust-0.0.2.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
8
|
+
optuclust-0.0.2.dist-info/top_level.txt,sha256=_3ikUBQNQsleaezLn_of9BtjDTACey5e15hGW9lSeZ8,16
|
|
9
|
+
optuclust-0.0.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 filips
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_optuclust.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.exceptions import NotFittedError
|
|
4
|
+
from sklearn.datasets import make_blobs
|
|
5
|
+
|
|
6
|
+
from optuclust import Optimizer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Fixture to generate a synthetic dataset
|
|
10
|
+
@pytest.fixture
def data():
    """Return the feature matrix of a reproducible synthetic blob dataset.

    The dataset has 100 samples drawn around 3 centers in 2 dimensions and
    is generated with a fixed random seed; the blob labels are discarded.
    """
    blob_settings = {
        "n_samples": 100,
        "centers": 3,
        "n_features": 2,
        "random_state": 42,
    }
    features, _blob_labels = make_blobs(**blob_settings)
    return features
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _assert_cluster_descriptors(optimizer, data, expect_cluster_centers=True):
    """Assert that a fitted optimizer exposes well-formed cluster descriptors.

    Checks, in order: ``labels_`` holds one label per sample, at least one
    non-noise cluster exists, ``cluster_centers_`` is present (with one column
    per feature) or ``None`` depending on ``expect_cluster_centers``, and
    ``centroids_``, ``medoids_`` and ``modes_`` each have shape
    ``(n_clusters, n_features)``.

    Parameters
    ----------
    optimizer : object
        Fitted object exposing the attributes listed above.
    data : array-like with a 2-tuple ``shape``
        The data the optimizer was fitted on.
    expect_cluster_centers : bool, default True
        Whether ``cluster_centers_`` is expected to be set.
    """
    sample_count, feature_count = data.shape

    assert optimizer.labels_ is not None
    assert optimizer.labels_.shape == (sample_count,)

    # The label -1 marks noise and is not counted as a cluster.
    cluster_ids = {label for label in optimizer.labels_ if label != -1}
    n_clusters = len(cluster_ids)
    assert n_clusters >= 1

    native_centers = optimizer.cluster_centers_
    if not expect_cluster_centers:
        assert native_centers is None
    else:
        assert native_centers is not None
        assert native_centers.shape[1] == feature_count

    # The three derived descriptors all share the same expected shape.
    expected_shape = (n_clusters, feature_count)
    for attribute in ("centroids_", "medoids_", "modes_"):
        descriptor = getattr(optimizer, attribute)
        assert descriptor is not None
        assert descriptor.shape == expected_shape
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Test for KMeans algorithm
|
|
45
|
+
def test_kmeans(data):
    """Fit the optimizer with ``kmeans`` and validate its cluster descriptors.

    Native cluster centers are expected to be available for this algorithm.
    """
    settings = {"algorithm": "kmeans", "n_trials": 10, "verbose": False}
    clusterer = Optimizer(**settings)
    clusterer.fit(data)
    _assert_cluster_descriptors(clusterer, data, expect_cluster_centers=True)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Test for KMedoids algorithm
|
|
52
|
+
def test_kmedoids(data):
    """Fit the optimizer with ``kmedoids`` and validate its cluster descriptors.

    Native cluster centers are expected to be available for this algorithm.
    """
    settings = {"algorithm": "kmedoids", "n_trials": 10, "verbose": False}
    clusterer = Optimizer(**settings)
    clusterer.fit(data)
    _assert_cluster_descriptors(clusterer, data, expect_cluster_centers=True)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Test for MiniBatchKMeans algorithm
|
|
59
|
+
def test_minibatchkmeans(data):
    """Fit the optimizer with ``minibatchkmeans`` and validate its descriptors.

    Native cluster centers are expected to be available for this algorithm.
    """
    settings = {"algorithm": "minibatchkmeans", "n_trials": 10, "verbose": False}
    clusterer = Optimizer(**settings)
    clusterer.fit(data)
    _assert_cluster_descriptors(clusterer, data, expect_cluster_centers=True)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Test for DBSCAN algorithm (should not provide cluster centers)
|
|
66
|
+
def test_dbscan(data):
    """Fit the optimizer with ``dbscan`` and validate its cluster descriptors.

    This algorithm is expected to leave ``cluster_centers_`` as ``None``.
    """
    settings = {"algorithm": "dbscan", "n_trials": 10, "verbose": False}
    clusterer = Optimizer(**settings)
    clusterer.fit(data)
    _assert_cluster_descriptors(clusterer, data, expect_cluster_centers=False)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Test for MeanShift algorithm
|
|
73
|
+
def test_meanshift(data):
    """Fit the optimizer with ``meanshift`` and validate its cluster descriptors.

    Native cluster centers are expected to be available for this algorithm.
    """
    settings = {"algorithm": "meanshift", "n_trials": 10, "verbose": False}
    clusterer = Optimizer(**settings)
    clusterer.fit(data)
    _assert_cluster_descriptors(clusterer, data, expect_cluster_centers=True)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# Test for HDBSCAN algorithm
|
|
80
|
+
def test_hdbscan(data):
    """Fit the optimizer with ``hdbscan`` and validate its cluster descriptors.

    This algorithm is expected to leave ``cluster_centers_`` as ``None``.
    """
    settings = {"algorithm": "hdbscan", "n_trials": 10, "verbose": False}
    clusterer = Optimizer(**settings)
    clusterer.fit(data)
    _assert_cluster_descriptors(clusterer, data, expect_cluster_centers=False)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Test for SOM algorithm
|
|
87
|
+
def test_som(data):
    """Fit the optimizer with ``som`` and check the basic fitted attributes.

    Only the label vector's shape and the presence of ``centroids_``,
    ``medoids_`` and ``modes_`` are verified; descriptor shapes and
    ``cluster_centers_`` are not checked here.
    """
    clusterer = Optimizer(algorithm="som", n_trials=10, verbose=False)
    clusterer.fit(data)

    sample_count, _feature_count = data.shape
    assert clusterer.labels_ is not None
    assert clusterer.labels_.shape == (sample_count,)

    for attribute in ("centroids_", "medoids_", "modes_"):
        assert getattr(clusterer, attribute) is not None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Test overall optimization timeout
|
|
100
|
+
def test_kmeans_timeout(data):
    """Check that the overall ``timeout`` bounds the duration of ``fit``.

    The optimizer is configured with a 3-second timeout and 1000 trials, and
    the whole ``fit`` call must return within 5 seconds.
    """
    # A monotonic clock is used for the interval so that system clock
    # adjustments during the run cannot distort the measured duration.
    from time import monotonic

    optimizer = Optimizer(algorithm="kmeans", timeout=3, n_trials=1000, verbose=False)
    start_time = monotonic()
    optimizer.fit(data)
    elapsed_time = monotonic() - start_time

    assert elapsed_time <= 5.0, "Optimizer did not respect the timeout"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Test per-trial timeout (should prune the sleeping trial)
|
|
112
|
+
def test_trial_timeout1(data):
    """Check that trials exceeding ``trial_timeout`` leave no fitted model.

    With a 1-second per-trial limit, every trial of the ``sleep`` algorithm
    is expected to be pruned, so ``model_`` must be ``None`` after fitting.
    Skipped on platforms that do not provide ``SIGALRM``.
    """
    import signal

    # The package documents trial_timeout as Unix only (it uses SIGALRM), so
    # the pruning this test relies on cannot happen without that signal.
    if not hasattr(signal, "SIGALRM"):
        pytest.skip("trial_timeout relies on SIGALRM, which is unavailable here")

    optimizer = Optimizer(algorithm="sleep", trial_timeout=1, n_trials=2)
    optimizer.fit(data)
    # All trials should be pruned, so no valid model
    assert optimizer.model_ is None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Test per-trial timeout (timeout > sleep, so trials succeed)
|
|
120
|
+
def test_trial_timeout10(data):
    """Check that trials finishing within ``trial_timeout`` yield a valid fit.

    With a 10-second per-trial limit, the ``sleep`` algorithm's trials are
    expected to complete, so the usual cluster descriptors (including native
    cluster centers) must be present. Skipped on platforms that do not
    provide ``SIGALRM``.
    """
    import signal

    # The package documents trial_timeout as Unix only (it uses SIGALRM);
    # skip rather than exercise the option where that signal is missing.
    if not hasattr(signal, "SIGALRM"):
        pytest.skip("trial_timeout relies on SIGALRM, which is unavailable here")

    optimizer = Optimizer(algorithm="sleep", trial_timeout=10, n_trials=2)
    optimizer.fit(data)
    _assert_cluster_descriptors(optimizer, data, expect_cluster_centers=True)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Test for storage and resume
|
|
127
|
+
def test_storage_and_resume(data):
    """Fit twice against the same SQLite storage URI and validate both fits.

    The database lives in a fresh temporary directory, so a leftover file
    from an earlier interrupted run cannot influence the first fit, runs in
    parallel do not share a file, and nothing is written to the working
    directory. The directory and the database are removed on exit.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as storage_dir:
        storage_path = os.path.join(storage_dir, "test-storage+resume.db")
        storage_uri = f"sqlite:///{storage_path}"

        optimizer = Optimizer(
            algorithm="kmeans", n_trials=10, verbose=False, storage=storage_uri
        )
        optimizer.fit(data)
        _assert_cluster_descriptors(optimizer, data, expect_cluster_centers=True)

        # Run again to test resumption
        optimizer = Optimizer(
            algorithm="kmeans", n_trials=10, verbose=False, storage=storage_uri
        )
        optimizer.fit(data)
        _assert_cluster_descriptors(optimizer, data, expect_cluster_centers=True)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_invalid_algorithm(data):
    """Check that an unknown algorithm name is rejected at construction.

    Constructing the optimizer must raise ``ValueError`` with a message
    matching "Algorithm must be one of". The ``data`` fixture is requested
    but not used by the body.
    """
    expected_message = "Algorithm must be one of"
    with pytest.raises(ValueError, match=expected_message):
        Optimizer(algorithm="kmeans_dupa", n_trials=10, verbose=False)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_invalid_scoring(data):
    """Check that an unknown scoring name is rejected at construction.

    Constructing the optimizer must raise ``ValueError`` with a message
    matching "Scoring must be one of". The ``data`` fixture is requested
    but not used by the body.
    """
    expected_message = "Scoring must be one of"
    with pytest.raises(ValueError, match=expected_message):
        Optimizer(algorithm="kmeans", scoring="filips_score", n_trials=10, verbose=False)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_not_fitted_error():
    """Check that reading ``cluster_centers_`` before ``fit`` raises.

    Accessing the attribute on an optimizer that has not been fitted must
    raise ``NotFittedError``.
    """
    unfitted = Optimizer(algorithm="kmeans", n_trials=10, verbose=False)
    with pytest.raises(NotFittedError):
        # The attribute access itself is what is expected to raise.
        unfitted.cluster_centers_
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def test_predict_unsupported_algorithm(data):
    """Check that ``predict`` is rejected for a ``dbscan``-based optimizer.

    After fitting with ``dbscan``, calling ``predict`` must raise
    ``TypeError`` with a message matching "does not support predict".
    """
    clusterer = Optimizer(algorithm="dbscan", n_trials=10, verbose=False)
    clusterer.fit(data)

    expected_message = "does not support predict"
    with pytest.raises(TypeError, match=expected_message):
        clusterer.predict(data)
|