optuclust 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
optuclust/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .optuclust import Optimizer
2
+ from .optuclust import ClustGridSearch
optuclust/optuclust.py ADDED
@@ -0,0 +1,637 @@
1
+ import logging
2
+ import signal
3
+ import time
4
+
5
+ import numpy as np
6
+ import optuna
7
+ from sklearn.base import BaseEstimator, ClusterMixin
8
+ from sklearn.cluster import (
9
+ DBSCAN,
10
+ OPTICS,
11
+ AffinityPropagation,
12
+ AgglomerativeClustering,
13
+ Birch,
14
+ KMeans,
15
+ MeanShift,
16
+ MiniBatchKMeans,
17
+ SpectralClustering,
18
+ )
19
+ from sklearn.metrics import (
20
+ calinski_harabasz_score,
21
+ davies_bouldin_score,
22
+ silhouette_score,
23
+ )
24
+ from sklearn.mixture import GaussianMixture
25
+ from sklearn.neighbors import KernelDensity
26
+ from sklearn.utils.validation import check_is_fitted
27
+
28
+ import hdbscan
29
+ from kmedoids import KMedoids
30
+ from sklearn_som.som import SOM
31
+
32
+ logger = logging.getLogger("optuclust")
33
+
34
+
35
class Optimizer(BaseEstimator, ClusterMixin):
    """Tune the hyperparameters of one clustering algorithm with Optuna.

    ``fit`` runs an Optuna study whose objective builds a model from suggested
    hyperparameters, fits it on ``X`` and scores the resulting labels with the
    configured metric. The best parameters are then used to fit a final model.

    Attributes set by ``fit`` (all ``None`` except ``study_`` when every trial
    was pruned):

    - ``study_``: the Optuna study.
    - ``best_params_``: best hyperparameters found.
    - ``model_``: estimator refitted with ``best_params_``.
    - ``labels_``: labels produced by ``model_`` on the training data.
    - ``centroids_`` / ``medoids_`` / ``modes_``: one row per non-noise cluster.

    :param algorithm: One of ``VALID_ALGORITHMS``.
    :param n_trials: Positive number of Optuna trials to run per ``fit`` call.
    :param scoring: One of ``VALID_SCORING``.
    :param verbose: ``bool`` (INFO vs WARNING) or ``int`` Optuna verbosity level.
    :param show_progress_bar: Show Optuna's progress bar (forced off when
        ``verbose`` is ``True``).
    :param timeout: Passed to ``study.optimize`` as the overall time budget.
    :param trial_timeout: Per-trial limit in seconds, enforced with ``SIGALRM``.
        Fractional values are honoured.
    :param storage: Optuna storage; an in-memory storage is used when ``None``.
    :param logfile: Stored but not used by any code in this class.
    :raises ValueError: If ``algorithm``, ``scoring`` or ``n_trials`` is invalid.
    """

    VALID_ALGORITHMS = [
        "kmeans",
        "minibatchkmeans",
        "dbscan",
        "meanshift",
        "agglomerativeclustering",
        "spectralclustering",
        "affinitypropagation",
        "birch",
        "optics",
        "gaussianmixture",
        "hdbscan",
        "kmedoids",
        "sleep",
        "som",
    ]

    VALID_SCORING = [
        "silhouette_score",
        "calinski_harabasz_score",
        "davies_bouldin_score",
    ]

    # Algorithms for which predict() on new data is allowed.
    ALGORITHMS_WITH_PREDICT = {
        "kmeans",
        "minibatchkmeans",
        "meanshift",
        "birch",
        "gaussianmixture",
        "kmedoids",
        "som",
        "sleep",
    }

    # NOTE(review): not referenced by the methods of this class; presumably kept
    # as documented fallback values for callers -- confirm before removing.
    SAFE_DEFAULTS = {
        "kmeans": {"n_clusters": 3, "max_iter": 300, "tol": 1e-4, "n_init": 10},
        "minibatchkmeans": {
            "n_clusters": 8,
            "batch_size": 100,
            "max_iter": 300,
            "tol": 1e-4,
            "n_init": 10,
        },
        "dbscan": {
            "eps": 0.5,
            "min_samples": 5,
            "metric": "euclidean",
            "p": 2,
        },
        "meanshift": {"bandwidth": 2.5, "bin_seeding": True},
        "agglomerativeclustering": {"n_clusters": 3, "linkage": "ward"},
        "spectralclustering": {
            "n_clusters": 3,
            "n_neighbors": 10,
            "eigen_tol": 1e-4,
        },
        "affinitypropagation": {"damping": 0.9, "convergence_iter": 15},
        "birch": {"n_clusters": 3, "threshold": 0.5, "branching_factor": 50},
        "optics": {
            "min_samples": 5,
            "cluster_method": "xi",
        },
        "gaussianmixture": {"n_components": 3, "covariance_type": "full"},
        "hdbscan": {
            "min_cluster_size": 5,
            "min_samples": 1,
            "cluster_selection_epsilon": 0.0,
            "allow_single_cluster": False,
        },
        "kmedoids": {"n_clusters": 3, "method": "pam", "metric": "euclidean"},
        "som": {
            "m": 10,
            "n": 10,
            "dim": None,
        },
    }

    def __init__(
        self,
        algorithm,
        n_trials=50,
        scoring="silhouette_score",
        verbose=False,
        show_progress_bar=True,
        timeout=None,
        trial_timeout=None,
        storage=None,
        logfile=None,
    ):
        if algorithm not in self.VALID_ALGORITHMS:
            raise ValueError(f"Algorithm must be one of {self.VALID_ALGORITHMS}")
        if scoring not in self.VALID_SCORING:
            raise ValueError(f"Scoring must be one of {self.VALID_SCORING}")
        if not isinstance(n_trials, int) or n_trials <= 0:
            raise ValueError("n_trials must be a positive integer")

        self.algorithm = algorithm
        self.n_trials = n_trials
        self.scoring = scoring
        self.verbose = verbose
        self.show_progress_bar = show_progress_bar
        self.timeout = timeout
        self.trial_timeout = trial_timeout
        self.storage = storage
        self.logfile = logfile

    def fit(self, X, y=None):
        """Run the Optuna study on ``X`` and fit the best model found.

        :param X: Training data, indexable as a 2-D array of samples x features.
        :param y: Ignored; present for scikit-learn API compatibility.
        :returns: ``self``.
        :raises ValueError: Re-raised from the optimisation or the final fit,
            except for Optuna's "No trials are completed yet" error, which is
            turned into ``None`` result attributes.
        """
        # NOTE: optuna.logging.set_verbosity changes Optuna's logger for the
        # whole process, not only for this estimator.
        show_bar = self.show_progress_bar
        if isinstance(self.verbose, bool):
            optuna.logging.set_verbosity(
                optuna.logging.INFO if self.verbose else optuna.logging.WARNING
            )
            if self.verbose:
                # INFO logging and the progress bar would interleave.
                show_bar = False
        elif isinstance(self.verbose, int):
            optuna.logging.set_verbosity(self.verbose)

        # Resolve storage
        storage = self.storage
        if storage is None:
            storage = optuna.storages.InMemoryStorage()

        study_name = f"study_{self.algorithm}_{self.scoring}"
        logger.info("Storage: %s, internal study name: %s", storage, study_name)

        # Davies-Bouldin is the only supported metric where lower is better.
        direction = "maximize"
        if self.scoring == "davies_bouldin_score":
            direction = "minimize"

        self.study_ = optuna.create_study(
            direction=direction,
            study_name=study_name,
            storage=storage,
            load_if_exists=True,
        )

        try:
            n_existing = len(self.study_.trials)
            if n_existing > 0:
                logger.info(
                    "Resuming optimization from storage, starting from trial %d.",
                    n_existing,
                )
            else:
                logger.info("Starting a new optimization.")

            self.study_.optimize(
                lambda trial: self._objective(trial, X),
                n_trials=self.n_trials,
                show_progress_bar=show_bar,
                timeout=self.timeout,
            )
            self.best_params_ = self.study_.best_params
            logger.info(
                "Optimization completed. Best parameters: %s", self.best_params_
            )

            self.model_ = self._get_best_model(X)
            self.model_.fit(X)

            self.labels_ = self._extract_labels(self.model_, X)
            logger.info(
                "Final model fitted. Number of clusters: %d",
                len(set(self.labels_)),
            )

            # Eagerly compute cluster descriptors
            self.centroids_ = self._compute_centroids(X)
            self.medoids_ = self._compute_medoids(X)
            self.modes_ = self._compute_modes(X)

        except ValueError as e:
            if "No trials are completed yet" in str(e):
                # Raised by study.best_params when every trial was pruned.
                logger.warning(
                    "All trials were pruned. No valid results were obtained."
                )
                self.best_params_ = None
                self.model_ = None
                self.labels_ = None
                self.centroids_ = None
                self.medoids_ = None
                self.modes_ = None
            else:
                logger.error("Error during optimization: %s", str(e))
                raise

        return self

    @staticmethod
    def _raise_trial_timeout(signum, frame):
        """SIGALRM handler that aborts the running trial."""
        raise TimeoutError("Objective function timed out")

    @staticmethod
    def _extract_labels(model, X):
        """Return ``model.labels_`` when present, otherwise ``model.predict(X)``."""
        return model.labels_ if hasattr(model, "labels_") else model.predict(X)

    def _objective(self, trial, X):
        """Optuna objective: build, fit and score one candidate model.

        When ``trial_timeout`` is set, a real-time interval timer delivers
        SIGALRM after that many seconds; the trial is then pruned. The timer is
        cancelled and the previous SIGALRM handler restored afterwards, so the
        estimator does not leave its handler installed in the host process.
        """
        use_timer = bool(self.trial_timeout)
        previous_handler = None
        if use_timer:
            previous_handler = signal.signal(
                signal.SIGALRM, self._raise_trial_timeout
            )

        try:
            if use_timer:
                # setitimer accepts fractional seconds; signal.alarm(int(t))
                # would turn any timeout below one second into "no timeout".
                signal.setitimer(signal.ITIMER_REAL, float(self.trial_timeout))
            model = self._suggest_model(trial, X)
            model.fit(X)
            return self._compute_score(X, self._extract_labels(model, X))
        except TimeoutError:
            trial.report(float("-inf"), step=0)
            raise optuna.TrialPruned("Trial pruned due to timeout")
        finally:
            if use_timer:
                signal.setitimer(signal.ITIMER_REAL, 0)
                # signal.signal returns None for handlers not installed from
                # Python; fall back to the default disposition in that case.
                signal.signal(
                    signal.SIGALRM,
                    previous_handler
                    if previous_handler is not None
                    else signal.SIG_DFL,
                )

    def fit_predict(self, X, y=None):
        """Fit on ``X`` and return the training labels (``None`` if all pruned)."""
        self.fit(X, y)
        return self.labels_

    def predict(self, X):
        """Assign new samples to clusters using the fitted best model.

        :raises ValueError: If ``fit`` produced no model.
        :raises TypeError: If the algorithm is not in ``ALGORITHMS_WITH_PREDICT``.
        """
        check_is_fitted(self)
        if self.model_ is None:
            raise ValueError(
                "No valid model available. Ensure that trials completed successfully."
            )
        if self.algorithm not in self.ALGORITHMS_WITH_PREDICT:
            raise TypeError(
                f"Algorithm '{self.algorithm}' does not support predict(). "
                f"Algorithms with predict: {sorted(self.ALGORITHMS_WITH_PREDICT)}"
            )
        return self.model_.predict(X)

    def _compute_score(self, X, labels):
        """Score a labelling with the configured metric, ignoring noise (-1).

        :raises optuna.TrialPruned: If fewer than two clusters remain, or if
            there are as many clusters as non-noise samples (the scikit-learn
            metrics used here reject both cases).
        :raises ValueError: If ``self.scoring`` is not a supported metric.
        """
        # Accept list-like labels too; `list != -1` would be a single bool.
        labels = np.asarray(labels)
        mask = labels != -1
        non_noise_labels = labels[mask]
        n_labels = np.unique(non_noise_labels).size

        if n_labels <= 1:
            raise optuna.TrialPruned(
                "Only one cluster found (excluding noise), pruning this trial."
            )
        if n_labels >= non_noise_labels.size:
            raise optuna.TrialPruned(
                "Every non-noise sample is its own cluster, pruning this trial."
            )

        scorers = {
            "silhouette_score": silhouette_score,
            "calinski_harabasz_score": calinski_harabasz_score,
            "davies_bouldin_score": davies_bouldin_score,
        }
        if self.scoring not in scorers:
            raise ValueError(f"Unsupported scoring method: {self.scoring}")
        return scorers[self.scoring](X[mask], non_noise_labels)

    @staticmethod
    def _max_clusters(X, upper):
        """Cap a cluster-count upper bound at ``n_samples - 1`` (never below 2).

        Asking for more clusters than samples makes the estimators or the
        scoring metrics raise, which would abort the whole study.
        """
        n_samples = np.shape(X)[0]
        return max(2, min(upper, n_samples - 1))

    def _suggest_model(self, trial, X):
        """Build an unfitted estimator from hyperparameters suggested by ``trial``.

        :param trial: An Optuna trial (or ``FixedTrial``) providing ``suggest_*``.
        :param X: Training data; used for the sample count and, for SOM, the
            feature count.
        :raises ValueError: If ``self.algorithm`` is not handled here.
        """
        # NOTE(review): n_init="auto" and KernelDensity(bandwidth="scott")
        # presumably require scikit-learn >= 1.2 -- confirm against the
        # declared minimum version.
        if self.algorithm == "kmeans":
            n_clusters = trial.suggest_int("n_clusters", 2, self._max_clusters(X, 50))
            max_iter = trial.suggest_int("max_iter", 100, 500)
            tol = trial.suggest_float("tol", 1e-6, 1e-2)
            return KMeans(
                n_clusters=n_clusters, max_iter=max_iter, tol=tol, n_init="auto"
            )

        elif self.algorithm == "minibatchkmeans":
            n_clusters = trial.suggest_int("n_clusters", 2, self._max_clusters(X, 50))
            batch_size = trial.suggest_int("batch_size", 10, 200)
            max_iter = trial.suggest_int("max_iter", 100, 500)
            tol = trial.suggest_float("tol", 1e-6, 1e-2)
            return MiniBatchKMeans(
                n_clusters=n_clusters,
                batch_size=batch_size,
                max_iter=max_iter,
                tol=tol,
                n_init="auto",
            )

        elif self.algorithm == "dbscan":
            eps = trial.suggest_float("eps", 0.1, 10.0)
            min_samples = trial.suggest_int("min_samples", 2, 10)
            metric = trial.suggest_categorical(
                "metric", ["euclidean", "manhattan", "chebyshev", "minkowski"]
            )
            if metric == "minkowski":
                # `p` is only sampled (and only meaningful) for Minkowski.
                p = trial.suggest_int("p", 1, 5)
                return DBSCAN(eps=eps, min_samples=min_samples, metric=metric, p=p)
            else:
                return DBSCAN(eps=eps, min_samples=min_samples, metric=metric)

        elif self.algorithm == "meanshift":
            bandwidth = trial.suggest_float("bandwidth", 0.1, 10.0)
            bin_seeding = trial.suggest_categorical("bin_seeding", [True, False])
            return MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding)

        elif self.algorithm == "agglomerativeclustering":
            n_clusters = trial.suggest_int("n_clusters", 2, self._max_clusters(X, 50))
            linkage = trial.suggest_categorical(
                "linkage", ["ward", "complete", "average", "single"]
            )
            return AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)

        elif self.algorithm == "spectralclustering":
            n_clusters = trial.suggest_int("n_clusters", 2, self._max_clusters(X, 50))
            n_neighbors = trial.suggest_int("n_neighbors", 2, 20)
            eigen_tol = trial.suggest_float("eigen_tol", 1e-6, 1e-2)
            return SpectralClustering(
                n_clusters=n_clusters, n_neighbors=n_neighbors, eigen_tol=eigen_tol
            )

        elif self.algorithm == "affinitypropagation":
            damping = trial.suggest_float("damping", 0.5, 0.99)
            convergence_iter = trial.suggest_int("convergence_iter", 10, 200)
            return AffinityPropagation(
                damping=damping, convergence_iter=convergence_iter
            )

        elif self.algorithm == "birch":
            n_clusters = trial.suggest_int("n_clusters", 2, self._max_clusters(X, 50))
            threshold = trial.suggest_float("threshold", 0.1, 1.0)
            branching_factor = trial.suggest_int("branching_factor", 20, 100)
            return Birch(
                n_clusters=n_clusters,
                threshold=threshold,
                branching_factor=branching_factor,
            )

        elif self.algorithm == "optics":
            min_samples = trial.suggest_int("min_samples", 2, 10)
            cluster_method = trial.suggest_categorical(
                "cluster_method", ["xi", "dbscan"]
            )
            return OPTICS(
                max_eps=np.inf,
                min_samples=min_samples,
                cluster_method=cluster_method,
            )

        elif self.algorithm == "gaussianmixture":
            n_components = trial.suggest_int(
                "n_components", 2, self._max_clusters(X, 10)
            )
            covariance_type = trial.suggest_categorical(
                "covariance_type", ["full", "tied", "diag", "spherical"]
            )
            return GaussianMixture(
                n_components=n_components, covariance_type=covariance_type
            )

        elif self.algorithm == "hdbscan":
            min_cluster_size = trial.suggest_int("min_cluster_size", 2, 50)
            min_samples = trial.suggest_int("min_samples", 1, 10)
            cluster_selection_epsilon = trial.suggest_float(
                "cluster_selection_epsilon", 0, 1
            )
            allow_single_cluster = trial.suggest_categorical(
                "allow_single_cluster", [True, False]
            )
            return hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                cluster_selection_epsilon=cluster_selection_epsilon,
                allow_single_cluster=allow_single_cluster,
            )

        elif self.algorithm == "kmedoids":
            n_clusters = trial.suggest_int("n_clusters", 2, self._max_clusters(X, 50))
            method = trial.suggest_categorical(
                "method",
                [
                    "fasterpam",
                    "pam",
                    "alternate",
                    "fastermsc",
                    "fastmsc",
                    "pamsil",
                    "pammedsil",
                ],
            )
            return KMedoids(n_clusters=n_clusters, method=method, metric="euclidean")

        elif self.algorithm == "sleep":
            # Fake algorithm to induce timeout for testing
            time.sleep(3)
            return KMeans(n_clusters=3, n_init="auto")

        elif self.algorithm == "som":
            m = trial.suggest_int("m", 2, 20)
            n = trial.suggest_int("n", 2, 20)
            return SOM(m=m, n=n, dim=X.shape[1])

        else:
            raise ValueError(f"Unsupported algorithm: {self.algorithm}")

    def _get_best_model(self, X):
        """Rebuild an unfitted estimator from ``best_params_`` via a ``FixedTrial``."""
        params = dict(self.best_params_)
        # Remove conditional parameters that don't apply
        if self.algorithm == "dbscan" and params.get("metric") != "minkowski":
            params.pop("p", None)
        trial = optuna.trial.FixedTrial(params)
        return self._suggest_model(trial, X)

    def _iter_cluster_points(self, X):
        """Yield the samples of each non-noise cluster, in sorted label order.

        ``X`` is converted with ``np.asarray`` so that array-likes that do not
        support NumPy-style indexing (e.g. DataFrames, lists) work as well.
        """
        data = np.asarray(X)
        labels = np.asarray(self.labels_)
        for label in np.unique(labels):
            if label == -1:
                continue
            yield data[labels == label]

    def _compute_centroids(self, X):
        """Compute arithmetic mean centroid for each cluster.

        :returns: Array with one row per non-noise cluster, or ``None`` when
            there are no labels or no non-noise cluster.
        """
        if self.labels_ is None:
            return None
        centroids = [points.mean(axis=0) for points in self._iter_cluster_points(X)]
        if not centroids:
            return None
        return np.array(centroids)

    def _compute_medoids(self, X):
        """Compute medoid (point with minimum total squared Euclidean distance) for each cluster.

        For squared Euclidean distance, the total distance from a point ``x`` to
        all ``n`` cluster members equals ``n * ||x - mean||**2`` plus a term
        that is the same for every ``x``. The medoid is therefore the member
        closest to the centroid, which avoids building an (n, n, d) pairwise
        tensor.

        :returns: Array with one row per non-noise cluster, or ``None`` when
            there are no labels or no non-noise cluster.
        """
        if self.labels_ is None:
            return None
        medoids = []
        for points in self._iter_cluster_points(X):
            offsets = points - points.mean(axis=0)
            medoid_index = np.argmin(np.sum(offsets ** 2, axis=1))
            medoids.append(points[medoid_index])
        if not medoids:
            return None
        return np.array(medoids)

    def _compute_modes(self, X):
        """Compute mode (highest density point) for each cluster using KDE.

        :returns: Array with one row per non-noise cluster, or ``None`` when
            there are no labels or no non-noise cluster.
        """
        if self.labels_ is None:
            return None
        modes = []
        for points in self._iter_cluster_points(X):
            kde = KernelDensity(kernel="gaussian", bandwidth="scott").fit(points)
            # Evaluate density at actual data points instead of exponential grid
            densities = kde.score_samples(points)
            modes.append(points[np.argmax(densities)])
        if not modes:
            return None
        return np.array(modes)

    @property
    def cluster_centers_(self):
        """Native ``cluster_centers_`` of the fitted model, or ``None`` if absent."""
        check_is_fitted(self)
        if self.model_ is not None and hasattr(self.model_, "cluster_centers_"):
            return self.model_.cluster_centers_
        return None
506
+
507
+
508
class ClustGridSearch(BaseEstimator, ClusterMixin):
    """Run an ``Optimizer`` per algorithm and keep the best-scoring one.

    Attributes set by ``fit``: ``cv_results_`` (parallel lists keyed by
    ``algorithm``, ``mean_test_score``, ``params`` and ``model``),
    ``best_index_``, ``best_score_``, ``best_params_`` and ``best_estimator_``
    (the winning fitted ``Optimizer``).
    """

    # Algorithms tried for each supported mode.
    _ALGORITHMS_BY_MODE = {
        "full": (
            "kmeans",
            "kmedoids",
            "minibatchkmeans",
            "dbscan",
            "agglomerativeclustering",
            "meanshift",
            "spectralclustering",
            "affinitypropagation",
            "birch",
            "optics",
            "gaussianmixture",
            "hdbscan",
        ),
        "fast": ("kmeans", "hdbscan"),
    }

    def __init__(
        self,
        mode="full",
        n_trials=20,
        scoring="silhouette_score",
        verbose=False,
        show_progress_bar=True,
    ):
        """
        Initialize the ClustGridSearch.

        :param mode: 'full' to test the full algorithm list, 'fast' to test a subset (kmeans and hdbscan).
        :param n_trials: Number of trials for each algorithm's hyperparameter optimization.
        :param scoring: The metric used to select the best clustering (default: 'silhouette_score').
        :param verbose: Whether to print additional information during the search.
        :param show_progress_bar: Whether each per-algorithm optimization shows a progress bar.
        :raises ValueError: If ``mode`` is neither 'full' nor 'fast'.
        """
        self.mode = mode
        self.n_trials = n_trials
        self.scoring = scoring
        self.verbose = verbose
        self.show_progress_bar = show_progress_bar

        if self.mode not in self._ALGORITHMS_BY_MODE:
            raise ValueError("Invalid mode. Use 'full' or 'fast'.")
        # A fresh list per instance so callers may edit it without side effects.
        self.algorithms = list(self._ALGORITHMS_BY_MODE[self.mode])

    def fit(self, X, y=None):
        """
        Run clustering for all selected algorithms and return the best one based on the chosen scoring.

        Algorithms that raise, or whose trials were all pruned, are logged and
        skipped.

        :param X: Input data for clustering.
        :param y: Ignored; present for scikit-learn API compatibility.
        :returns: ``self``.
        :raises ValueError: If no algorithm produced a valid result.
        """
        results = []
        for algorithm in self.algorithms:
            logger.info("Testing algorithm: %s", algorithm)

            optimizer = Optimizer(
                algorithm=algorithm,
                n_trials=self.n_trials,
                scoring=self.scoring,
                verbose=self.verbose,
                show_progress_bar=self.show_progress_bar,
            )

            try:
                optimizer.fit(X)
                if optimizer.best_params_ is None:
                    # Optimizer.fit sets best_params_ to None when every trial
                    # was pruned; there is no best value to compare.
                    logger.warning(
                        "Skipping algorithm %s: no trial completed.", algorithm
                    )
                    continue
                score = optimizer.study_.best_value
            except Exception as e:
                # Best effort: one failing algorithm must not stop the search.
                logger.error("Error for algorithm %s: %s", algorithm, e)
                continue

            results.append(
                {
                    "algorithm": algorithm,
                    "mean_test_score": score,
                    "params": optimizer.best_params_,
                    "model": optimizer,
                }
            )

        if not results:
            raise ValueError("No algorithms produced valid results.")

        self.cv_results_ = {
            key: [res[key] for res in results]
            for key in ("algorithm", "mean_test_score", "params", "model")
        }

        scores = self.cv_results_["mean_test_score"]
        # Davies-Bouldin is minimised; the other metrics are maximised.
        if self.scoring != "davies_bouldin_score":
            best_idx = np.argmax(scores)
        else:
            best_idx = np.argmin(scores)
        self.best_index_ = best_idx
        self.best_score_ = scores[best_idx]
        self.best_params_ = self.cv_results_["params"][best_idx]
        self.best_estimator_ = self.cv_results_["model"][best_idx]
        return self

    def predict(self, X):
        """Delegate prediction on new data to the best fitted ``Optimizer``."""
        check_is_fitted(self)
        return self.best_estimator_.predict(X)

    def fit_predict(self, X, y=None):
        """Fit on ``X`` and return the best estimator's training labels."""
        self.fit(X, y)
        return self.best_estimator_.labels_

    @property
    def labels_(self):
        """Training labels of the best estimator."""
        check_is_fitted(self)
        return self.best_estimator_.labels_

    @property
    def cluster_centers_(self):
        """``cluster_centers_`` of the best estimator."""
        check_is_fitted(self)
        return self.best_estimator_.cluster_centers_

    @property
    def centroids_(self):
        """``centroids_`` of the best estimator."""
        check_is_fitted(self)
        return self.best_estimator_.centroids_

    @property
    def medoids_(self):
        """``medoids_`` of the best estimator."""
        check_is_fitted(self)
        return self.best_estimator_.medoids_

    @property
    def modes_(self):
        """``modes_`` of the best estimator."""
        check_is_fitted(self)
        return self.best_estimator_.modes_
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.4
2
+ Name: optuclust
3
+ Version: 0.0.2
4
+ Summary: Hyperparameter optimization for multiple clustering algorithms using Optuna, with Scikit-learn API
5
+ Home-page: https://github.com/filipsPL/optuclust
6
+ Author: Filip S.
7
+ Author-email: filip.ursynow@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.8
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: numpy>=1.21.0
15
+ Requires-Dist: scikit-learn>=1.1
16
+ Requires-Dist: hdbscan>=0.8.29
17
+ Requires-Dist: optuna>=3.0
18
+ Requires-Dist: kmedoids>=0.3.0
19
+ Requires-Dist: matplotlib>=3.4
20
+ Requires-Dist: pandas>=1.3
21
+ Requires-Dist: sklearn-som
22
+ Dynamic: author
23
+ Dynamic: author-email
24
+ Dynamic: classifier
25
+ Dynamic: description
26
+ Dynamic: description-content-type
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: license-file
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ # optuclust
35
+
36
+ **optuclust** is a Python module for optimizing clustering algorithms using the [Optuna](https://optuna.org/) framework. It provides a scikit-learn compatible API with support for a variety of clustering methods and offers additional capabilities such as the calculation of centroids, medoids, and modes for clusters.
37
+
38
+ [![Python manual install](https://github.com/filipsPL/optuclust/actions/workflows/python-package.yml/badge.svg)](https://github.com/filipsPL/optuclust/actions/workflows/python-package.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.18608559.svg)](https://doi.org/10.5281/zenodo.18608559)
39
+
40
+ ## Features
41
+
42
+ - **Parameter Optimization:** Optimize clustering parameters for various algorithms using **Optuna**.
43
+ - **Supported Clustering Methods:**
44
+ - Algorithms from scikit-learn, such as KMeans, DBSCAN, and Agglomerative Clustering.
45
+ - Advanced methods like HDBSCAN, Self-Organizing Maps (SOM), and kMedoids.
46
+ - **Metrics and Scoring:**
47
+ - `silhouette_score`
48
+ - `calinski_harabasz_score`
49
+ - `davies_bouldin_score` (automatically minimized)
50
+ - Noise points (label=-1) are filtered out before score computation for density-based algorithms.
51
+ - **Clustering Insights:** Provides centroids (arithmetic mean), medoids (Euclidean), and modes (KDE with Scott's bandwidth) for clusters, even if the algorithm does not natively support these features. All descriptors are computed eagerly during `fit()` and work in any number of dimensions.
52
+ - **Scikit-learn Compatible:** Inherits from `BaseEstimator` and `ClusterMixin`. Works with `clone()`, `check_is_fitted()`, and scikit-learn pipelines.
53
+ - **ClustGridSearch Class:** A utility to test all clustering algorithms and identify the best one.
54
+ - **Timeout Management:** Separate timeouts for optimization runs (`timeout`) and individual trials (`trial_timeout`).
55
+ - **Storage and Resume:** Store optimization results in a SQLite database for future analysis, and resume the optimization process later.
56
+
57
+ ## Installation
58
+
59
+ 1. Clone this repository:
60
+
61
+ ```bash
62
+ git clone git@github.com:filipsPL/optuclust.git
63
+ ```
64
+
65
+ 2. Navigate to the cloned directory and install the required dependencies:
66
+
67
+ ```bash
68
+ cd optuclust
69
+ pip install -r requirements.txt
70
+ ```
71
+
72
+ 3. Install **optuclust**:
73
+
74
+ ```bash
75
+ python setup.py install
76
+ ```
77
+
78
+ **Requires:** Python >= 3.8, scikit-learn >= 1.1
79
+
80
+ ## Usage
81
+
82
+ ### 1. Optimizing a Clustering Algorithm
83
+
84
+ ```python
85
+ from optuclust import Optimizer
86
+ from sklearn.datasets import make_blobs
87
+
88
+ # Generate synthetic data
89
+ X, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=42)
90
+
91
+ # Instantiate and fit the optimizer for KMeans
92
+ optimizer = Optimizer(algorithm="kmeans", n_trials=50, scoring="silhouette_score", verbose=True)
93
+ optimizer.fit(X)
94
+
95
+ # Access cluster details
96
+ print("Cluster Labels:", optimizer.labels_)
97
+ print("Centroids:", optimizer.centroids_)
98
+ print("Medoids:", optimizer.medoids_)
99
+ print("Modes:", optimizer.modes_)
100
+ print("Cluster Centers (native):", optimizer.cluster_centers_)
101
+ ```
102
+
103
+ ### 2. ClustGridSearch
104
+
105
+ ```python
106
+ from optuclust import ClustGridSearch
107
+ from sklearn.datasets import make_blobs
108
+
109
+ # Generate synthetic data
110
+ X, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=42)
111
+
112
+ # Initialize ClustGridSearch to test all algorithms
113
+ grid_search = ClustGridSearch(mode="full", scoring="silhouette_score", verbose=True)
114
+
115
+ # Fit and get the best method
116
+ grid_search.fit(X)
117
+ print("Best Algorithm:", grid_search.best_estimator_.algorithm)
118
+ print("Best Score:", grid_search.best_score_)
119
+ print("Best Parameters:", grid_search.best_params_)
120
+ ```
121
+
122
+ ### 3. Benchmark Example
123
+
124
+ To benchmark different clustering algorithms, you can use the provided example script:
125
+
126
+ ```bash
127
+ python example-loop.py
128
+ ```
129
+
130
+ The benchmark will evaluate different clustering methods on various datasets and save the performance metrics and plots.
131
+
132
+ ## Supported Algorithms
133
+
134
+ ```python
135
+ algorithms = [
136
+ 'kmeans', 'kmedoids', 'minibatchkmeans', 'dbscan', 'agglomerativeclustering',
137
+ 'meanshift', 'spectralclustering', 'gaussianmixture', 'hdbscan',
138
+ 'affinitypropagation', 'birch', 'optics', 'som'
139
+ ]
140
+ ```
141
+
142
+ **Note:** Not all algorithms support `predict()` on new data. Algorithms with inductive prediction: `kmeans`, `minibatchkmeans`, `meanshift`, `birch`, `gaussianmixture`, `kmedoids`, `som`. Calling `predict()` on other algorithms (e.g. `dbscan`, `hdbscan`) will raise a `TypeError`.
143
+
144
+ ## Parameters
145
+
146
+ ### Optimizer Class
147
+
148
+ - **algorithm:** The clustering algorithm to optimize. Options include those listed in Supported Algorithms.
149
+ - **n_trials:** Number of Optuna trials for optimization. Default is 50.
150
+ - **scoring:** The metric to optimize. Options are `silhouette_score`, `calinski_harabasz_score`, and `davies_bouldin_score`.
151
+ - **verbose:** Enable additional logging if set to `True`. Can also be an `int` to set Optuna's verbosity level directly.
152
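+ - **show_progress_bar:** Display a progress bar during optimization. Default is `True`. The bar is suppressed when `verbose` is `True`, so that it does not interleave with the log output.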
153
+ - **timeout:** Maximum duration (in seconds) for all trials in the optimization process.
154
+ - **trial_timeout:** Maximum duration (in seconds) for each individual trial (Unix only, uses `SIGALRM`).
155
+ - **storage:** Optuna storage URI, e.g. `sqlite:///optimization.db`. When provided, enables resuming a previous optimization run.
156
+ - **logfile:** Reserved for future use.
157
+
158
+ ### Fitted Attributes
159
+
160
+ After calling `fit(X)`:
161
+
162
+ - **labels\_:** Cluster labels for each sample.
163
+ - **best\_params\_:** Dictionary of the best hyperparameters found.
164
+ - **model\_:** The fitted clustering model with the best parameters.
165
+ - **study\_:** The Optuna `Study` object with full trial history.
166
+ - **centroids\_:** Arithmetic mean of each cluster (excludes noise points).
167
+ - **medoids\_:** Most central data point in each cluster, i.e. the member with the smallest total squared Euclidean distance to the other members (excludes noise points).
168
+ - **modes\_:** Highest density point in each cluster (KDE with Scott's rule bandwidth).
169
+ - **cluster\_centers\_:** Native cluster centers from the model (if available), otherwise `None`.
170
+
171
+ ### ClustGridSearch Class
172
+
173
+ - **mode:**
174
+ - `full`: Test every supported algorithm except `som`.
175
+ - `fast`: Test a subset of algorithms (`kmeans` and `hdbscan`).
176
+ - **n_trials:** Number of Optuna trials for each algorithm. Default is 20.
177
+ - **scoring:** Metric to select the best clustering algorithm. Options are `silhouette_score`, `calinski_harabasz_score`, and `davies_bouldin_score`.
178
+ - **verbose:** Enable detailed logging if set to `True`.
179
+ - **show_progress_bar:** Display a progress bar for each algorithm.
180
+
181
+ ## Running Tests
182
+
183
+ We use **pytest** for testing. To run tests, simply run:
184
+
185
+ ```bash
186
+ pytest -v
187
+ ```
@@ -0,0 +1,9 @@
1
+ optuclust/__init__.py,sha256=1bwbNEMdHyRItYBEHlSyYyfre4-RnwnG9wLngLa3Eu4,71
2
+ optuclust/optuclust.py,sha256=nwdsr1av5bc3bsNb5l3iusYIiZCfsqurnaxl7beGP4g,21922
3
+ optuclust-0.0.2.dist-info/licenses/LICENSE,sha256=F7Zfn3KEW4YveQcP-f7E04EIEmbVXXrvwiS2W-1IPVI,1063
4
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ tests/test_optuclust.py,sha256=u0nIGgYPO6NaNaU90wvjwFJd1-3xr5nXdqRTnsDUYVU,5606
6
+ optuclust-0.0.2.dist-info/METADATA,sha256=zMnA0Na9xqCIpd2X-X9PgNCNmQFCU0oLlT9dtrcTZvQ,7478
7
+ optuclust-0.0.2.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
8
+ optuclust-0.0.2.dist-info/top_level.txt,sha256=_3ikUBQNQsleaezLn_of9BtjDTACey5e15hGW9lSeZ8,16
9
+ optuclust-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 filips
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ optuclust
2
+ tests
tests/__init__.py ADDED
File without changes
@@ -0,0 +1,172 @@
1
+ import pytest
2
+ import numpy as np
3
+ from sklearn.exceptions import NotFittedError
4
+ from sklearn.datasets import make_blobs
5
+
6
+ from optuclust import Optimizer
7
+
8
+
9
+ # Fixture to generate a synthetic dataset
10
@pytest.fixture
def data():
    """Provide a seeded synthetic feature matrix for the clustering tests.

    The samples come from ``make_blobs`` configured for 100 points,
    3 centers and 2 features; the generated labels are discarded.
    """
    features, _true_labels = make_blobs(
        n_samples=100,
        centers=3,
        n_features=2,
        random_state=42,
    )
    return features
14
+
15
+
16
+ def _assert_cluster_descriptors(optimizer, data, expect_cluster_centers=True):
17
+ """Helper to verify shape and type of labels, centroids, medoids, modes."""
18
+ n_samples, n_features = data.shape
19
+
20
+ assert optimizer.labels_ is not None
21
+ assert optimizer.labels_.shape == (n_samples,)
22
+
23
+ # Count non-noise clusters
24
+ non_noise = set(optimizer.labels_) - {-1}
25
+ n_clusters = len(non_noise)
26
+ assert n_clusters >= 1
27
+
28
+ if expect_cluster_centers:
29
+ assert optimizer.cluster_centers_ is not None
30
+ assert optimizer.cluster_centers_.shape[1] == n_features
31
+ else:
32
+ assert optimizer.cluster_centers_ is None
33
+
34
+ assert optimizer.centroids_ is not None
35
+ assert optimizer.centroids_.shape == (n_clusters, n_features)
36
+
37
+ assert optimizer.medoids_ is not None
38
+ assert optimizer.medoids_.shape == (n_clusters, n_features)
39
+
40
+ assert optimizer.modes_ is not None
41
+ assert optimizer.modes_.shape == (n_clusters, n_features)
42
+
43
+
44
+ # Test for KMeans algorithm
45
def test_kmeans(data):
    """Fit with the "kmeans" algorithm and verify descriptors, centers included."""
    settings = {"algorithm": "kmeans", "n_trials": 10, "verbose": False}
    kmeans_optimizer = Optimizer(**settings)
    kmeans_optimizer.fit(data)
    _assert_cluster_descriptors(kmeans_optimizer, data, expect_cluster_centers=True)
49
+
50
+
51
+ # Test for KMedoids algorithm
52
def test_kmedoids(data):
    """Fit with the "kmedoids" algorithm and verify descriptors, centers included."""
    settings = {"algorithm": "kmedoids", "n_trials": 10, "verbose": False}
    kmedoids_optimizer = Optimizer(**settings)
    kmedoids_optimizer.fit(data)
    _assert_cluster_descriptors(kmedoids_optimizer, data, expect_cluster_centers=True)
56
+
57
+
58
+ # Test for MiniBatchKMeans algorithm
59
def test_minibatchkmeans(data):
    """Fit with "minibatchkmeans" and verify descriptors, centers included."""
    settings = {"algorithm": "minibatchkmeans", "n_trials": 10, "verbose": False}
    minibatch_optimizer = Optimizer(**settings)
    minibatch_optimizer.fit(data)
    _assert_cluster_descriptors(minibatch_optimizer, data, expect_cluster_centers=True)
63
+
64
+
65
+ # Test for DBSCAN algorithm (should not provide cluster centers)
66
def test_dbscan(data):
    """Fit with "dbscan" and verify descriptors; ``cluster_centers_`` must be None."""
    settings = {"algorithm": "dbscan", "n_trials": 10, "verbose": False}
    dbscan_optimizer = Optimizer(**settings)
    dbscan_optimizer.fit(data)
    _assert_cluster_descriptors(dbscan_optimizer, data, expect_cluster_centers=False)
70
+
71
+
72
+ # Test for MeanShift algorithm
73
def test_meanshift(data):
    """Fit with the "meanshift" algorithm and verify descriptors, centers included."""
    settings = {"algorithm": "meanshift", "n_trials": 10, "verbose": False}
    meanshift_optimizer = Optimizer(**settings)
    meanshift_optimizer.fit(data)
    _assert_cluster_descriptors(meanshift_optimizer, data, expect_cluster_centers=True)
77
+
78
+
79
+ # Test for HDBSCAN algorithm
80
def test_hdbscan(data):
    """Fit with "hdbscan" and verify descriptors; ``cluster_centers_`` must be None."""
    settings = {"algorithm": "hdbscan", "n_trials": 10, "verbose": False}
    hdbscan_optimizer = Optimizer(**settings)
    hdbscan_optimizer.fit(data)
    _assert_cluster_descriptors(hdbscan_optimizer, data, expect_cluster_centers=False)
84
+
85
+
86
+ # Test for SOM algorithm
87
def test_som(data):
    """Fit with the "som" algorithm and check labels and descriptors exist.

    Only the label vector's shape is checked; centroids, medoids and modes
    are merely required to be present (not ``None``).
    """
    som_optimizer = Optimizer(algorithm="som", n_trials=10, verbose=False)
    som_optimizer.fit(data)

    n_samples, _n_features = data.shape
    assert som_optimizer.labels_ is not None
    assert som_optimizer.labels_.shape == (n_samples,)
    for attribute in ("centroids_", "medoids_", "modes_"):
        assert getattr(som_optimizer, attribute) is not None
97
+
98
+
99
+ # Test overall optimization timeout
100
def test_kmeans_timeout(data):
    """Check that the overall ``timeout`` bounds how long ``fit`` runs.

    The optimizer is given a 3 s timeout together with a large trial budget
    (1000 trials); the assertion tolerates up to 5 s of elapsed time so that
    the last in-flight trial and setup overhead have some slack.
    """
    # perf_counter() is monotonic, so the measurement cannot be distorted by
    # system clock adjustments the way a time.time() difference can.
    from time import perf_counter

    optimizer = Optimizer(algorithm="kmeans", timeout=3, n_trials=1000, verbose=False)
    started = perf_counter()
    optimizer.fit(data)
    elapsed_time = perf_counter() - started

    assert elapsed_time <= 5.0, "Optimizer did not respect the timeout"
109
+
110
+
111
+ # Test per-trial timeout (should prune the sleeping trial)
112
def test_trial_timeout1(data):
    """Run the "sleep" algorithm with a 1 s per-trial timeout.

    Every trial is expected to be pruned by the timeout, which should leave
    the optimizer without a fitted model.
    """
    settings = {"algorithm": "sleep", "trial_timeout": 1, "n_trials": 2}
    sleepy_optimizer = Optimizer(**settings)
    sleepy_optimizer.fit(data)
    # With no surviving trial there is nothing to build a model from.
    assert sleepy_optimizer.model_ is None
117
+
118
+
119
+ # Test per-trial timeout (timeout > sleep, so trials succeed)
120
def test_trial_timeout10(data):
    """Run the "sleep" algorithm with a 10 s per-trial timeout.

    The timeout is expected to exceed the trial's sleep, so trials should
    complete and the usual descriptors (centers included) must be available.
    """
    settings = {"algorithm": "sleep", "trial_timeout": 10, "n_trials": 2}
    sleepy_optimizer = Optimizer(**settings)
    sleepy_optimizer.fit(data)
    _assert_cluster_descriptors(sleepy_optimizer, data, expect_cluster_centers=True)
124
+
125
+
126
+ # Test for storage and resume
127
def test_storage_and_resume(data):
    """Fit twice against the same SQLite storage to exercise resumption.

    The database lives in a fresh temporary directory instead of a fixed
    file in the current working directory. That way a stale file left by an
    interrupted run can never be picked up as an existing study, concurrent
    test runs cannot collide, and cleanup happens even if an assertion fails.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        storage_path = os.path.join(workdir, "test-storage+resume.db")
        storage_uri = f"sqlite:///{storage_path}"

        # First pass creates the storage; second pass runs against the
        # already-populated storage to test resumption.
        for _attempt in range(2):
            optimizer = Optimizer(
                algorithm="kmeans", n_trials=10, verbose=False, storage=storage_uri
            )
            optimizer.fit(data)
            _assert_cluster_descriptors(optimizer, data, expect_cluster_centers=True)
150
+
151
+
152
def test_invalid_algorithm(data):
    """Constructing an Optimizer with an unknown algorithm raises ValueError."""
    bad_settings = {"algorithm": "kmeans_dupa", "n_trials": 10, "verbose": False}
    with pytest.raises(ValueError, match="Algorithm must be one of"):
        Optimizer(**bad_settings)
155
+
156
+
157
def test_invalid_scoring(data):
    """Constructing an Optimizer with an unknown scoring name raises ValueError."""
    bad_settings = {
        "algorithm": "kmeans",
        "scoring": "filips_score",
        "n_trials": 10,
        "verbose": False,
    }
    with pytest.raises(ValueError, match="Scoring must be one of"):
        Optimizer(**bad_settings)
160
+
161
+
162
def test_not_fitted_error():
    """Reading ``cluster_centers_`` before ``fit`` raises NotFittedError."""
    unfitted = Optimizer(algorithm="kmeans", n_trials=10, verbose=False)
    with pytest.raises(NotFittedError):
        # The attribute access itself is what must raise.
        getattr(unfitted, "cluster_centers_")
166
+
167
+
168
def test_predict_unsupported_algorithm(data):
    """Calling ``predict`` after fitting with "dbscan" raises TypeError."""
    settings = {"algorithm": "dbscan", "n_trials": 10, "verbose": False}
    dbscan_optimizer = Optimizer(**settings)
    dbscan_optimizer.fit(data)
    with pytest.raises(TypeError, match="does not support predict"):
        dbscan_optimizer.predict(data)