scikit-clarans 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. scikit_clarans-0.1.1/LICENSE +21 -0
  2. scikit_clarans-0.1.1/MANIFEST.in +5 -0
  3. scikit_clarans-0.1.1/PKG-INFO +139 -0
  4. scikit_clarans-0.1.1/README.md +85 -0
  5. scikit_clarans-0.1.1/clarans/__init__.py +4 -0
  6. scikit_clarans-0.1.1/clarans/clarans.py +393 -0
  7. scikit_clarans-0.1.1/clarans/fast_clarans.py +325 -0
  8. scikit_clarans-0.1.1/clarans/initialization.py +183 -0
  9. scikit_clarans-0.1.1/clarans/tests/__init__.py +0 -0
  10. scikit_clarans-0.1.1/clarans/tests/test_algorithm_logic.py +433 -0
  11. scikit_clarans-0.1.1/clarans/tests/test_clarans.py +91 -0
  12. scikit_clarans-0.1.1/clarans/tests/test_common.py +11 -0
  13. scikit_clarans-0.1.1/clarans/tests/test_fast_clarans.py +71 -0
  14. scikit_clarans-0.1.1/clarans/tests/test_logic_verification.py +505 -0
  15. scikit_clarans-0.1.1/clarans/utils.py +27 -0
  16. scikit_clarans-0.1.1/docs/source/api.rst +33 -0
  17. scikit_clarans-0.1.1/docs/source/contributing.rst +22 -0
  18. scikit_clarans-0.1.1/docs/source/examples.rst +49 -0
  19. scikit_clarans-0.1.1/docs/source/gallery/2d_clustering.rst +134 -0
  20. scikit_clarans-0.1.1/docs/source/gallery/comparison.rst +60 -0
  21. scikit_clarans-0.1.1/docs/source/gallery/performance.rst +125 -0
  22. scikit_clarans-0.1.1/docs/source/gallery/quality_vs_k.rst +64 -0
  23. scikit_clarans-0.1.1/docs/source/index.rst +53 -0
  24. scikit_clarans-0.1.1/docs/source/installation.rst +36 -0
  25. scikit_clarans-0.1.1/docs/source/license.rst +8 -0
  26. scikit_clarans-0.1.1/docs/source/project.rst +17 -0
  27. scikit_clarans-0.1.1/docs/source/usage.rst +75 -0
  28. scikit_clarans-0.1.1/examples/01_quick_start.py +47 -0
  29. scikit_clarans-0.1.1/examples/02_compare_initializations.py +53 -0
  30. scikit_clarans-0.1.1/examples/03_metrics_demo.py +50 -0
  31. scikit_clarans-0.1.1/examples/04_sparse_input.py +31 -0
  32. scikit_clarans-0.1.1/examples/05_pipeline_gridsearch.py +69 -0
  33. scikit_clarans-0.1.1/examples/06_predict_new_data.py +33 -0
  34. scikit_clarans-0.1.1/examples/07_custom_init_centers.py +30 -0
  35. scikit_clarans-0.1.1/examples/08_performance_tuning.py +49 -0
  36. scikit_clarans-0.1.1/examples/09_compare_fastclarans_clarans.py +113 -0
  37. scikit_clarans-0.1.1/examples/10_transform_data.py +59 -0
  38. scikit_clarans-0.1.1/examples/archive/gallery_anisotropic.py +35 -0
  39. scikit_clarans-0.1.1/examples/archive/gallery_blobs.py +29 -0
  40. scikit_clarans-0.1.1/examples/archive/gallery_comparison.py +42 -0
  41. scikit_clarans-0.1.1/examples/archive/gallery_moons.py +31 -0
  42. scikit_clarans-0.1.1/examples/archive/gallery_parameter_sensitivity.py +57 -0
  43. scikit_clarans-0.1.1/examples/archive/gallery_runtime.py +51 -0
  44. scikit_clarans-0.1.1/examples/archive/gallery_silhouette.py +49 -0
  45. scikit_clarans-0.1.1/examples/clarans_examples.ipynb +963 -0
  46. scikit_clarans-0.1.1/scikit_clarans.egg-info/PKG-INFO +139 -0
  47. scikit_clarans-0.1.1/scikit_clarans.egg-info/SOURCES.txt +50 -0
  48. scikit_clarans-0.1.1/scikit_clarans.egg-info/dependency_links.txt +1 -0
  49. scikit_clarans-0.1.1/scikit_clarans.egg-info/requires.txt +23 -0
  50. scikit_clarans-0.1.1/scikit_clarans.egg-info/top_level.txt +1 -0
  51. scikit_clarans-0.1.1/setup.cfg +4 -0
  52. scikit_clarans-0.1.1/setup.py +55 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nguyễn Ngọc Thiện
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include clarans *.py
4
+ recursive-include examples *.py *.ipynb
5
+ recursive-include docs/source *.rst
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: scikit-clarans
3
+ Version: 0.1.1
4
+ Summary: A scikit-learn compatible implementation of CLARANS clustering algorithm
5
+ Home-page: https://github.com/ThienNguyen3001/scikit-clarans
6
+ Author: ThienNguyen3001
7
+ Author-email: thiennguyen03001@gmail.com
8
+ License: MIT
9
+ Keywords: clustering sklearn scikit-learn clarans k-medoids
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Topic :: Scientific/Engineering
18
+ Requires-Python: >=3.8
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: numpy
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: scipy
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest; extra == "dev"
26
+ Requires-Dist: pytest-cov; extra == "dev"
27
+ Requires-Dist: flake8; extra == "dev"
28
+ Requires-Dist: sphinx>=5.0; extra == "dev"
29
+ Requires-Dist: sphinx-rtd-theme; extra == "dev"
30
+ Requires-Dist: sphinx-copybutton; extra == "dev"
31
+ Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
32
+ Provides-Extra: test
33
+ Requires-Dist: pytest; extra == "test"
34
+ Requires-Dist: pytest-cov; extra == "test"
35
+ Requires-Dist: flake8; extra == "test"
36
+ Provides-Extra: docs
37
+ Requires-Dist: sphinx>=5.0; extra == "docs"
38
+ Requires-Dist: sphinx-rtd-theme; extra == "docs"
39
+ Requires-Dist: sphinx-copybutton; extra == "docs"
40
+ Requires-Dist: sphinx-autodoc-typehints; extra == "docs"
41
+ Dynamic: author
42
+ Dynamic: author-email
43
+ Dynamic: classifier
44
+ Dynamic: description
45
+ Dynamic: description-content-type
46
+ Dynamic: home-page
47
+ Dynamic: keywords
48
+ Dynamic: license
49
+ Dynamic: license-file
50
+ Dynamic: provides-extra
51
+ Dynamic: requires-dist
52
+ Dynamic: requires-python
53
+ Dynamic: summary
54
+
55
+ # scikit-clarans
56
+
57
+ > A scikit-learn compatible implementation of the **CLARANS** (Clustering Large Applications based on RANdomized Search) algorithm.
58
+
59
+ [![License](https://img.shields.io/github/license/ThienNguyen3001/scikit-clarans)](LICENSE)
60
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.18366802.svg)](https://doi.org/10.5281/zenodo.18366802)
61
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
62
+ [![Docs Build](https://img.shields.io/github/actions/workflow/status/ThienNguyen3001/scikit-clarans/docs-build.yml?branch=main&label=Docs%20Build)](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/docs-build.yml)
63
+ [![Test Suite](https://img.shields.io/github/actions/workflow/status/ThienNguyen3001/scikit-clarans/test_suite.yml?branch=main&label=Test%20Suite)](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/test_suite.yml)
64
+ [![Quality Check](https://img.shields.io/github/actions/workflow/status/ThienNguyen3001/scikit-clarans/lint_cov_check.yml?branch=main&label=Quality%20Check)](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/lint_cov_check.yml)
65
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JdgVaZcbS1uwY7kPQZM8DtX97R9ga31d?usp=sharing)
66
+
67
+ **CLARANS** acts as a bridge between the high quality of **PAM (Partition Around Medoids)** and the speed required for large datasets. By using randomized search instead of exhaustive search, it finds high-quality medoids efficiently without exploring the entire graph of solutions.
68
+
69
+ ---
70
+
71
+ ## Features
72
+
73
+ * **Scikit-Learn Native**: Use it just like `KMeans` or `DBSCAN`. Drop-in compatibility for pipelines and cross-validation.
74
+ * **Scalable**: Designed to handle datasets where standard PAM/k-medoids is too slow.
75
+ * **Flexible**: Choose from multiple initialization strategies (`k-medoids++`, `build`, etc.) and distance metrics (`euclidean`, `manhattan`, `cosine`, etc.).
76
+
77
+ ## Installation
78
+
79
+ Install simply via pip:
80
+
81
+ ```bash
82
+ pip install .
83
+ ```
84
+
85
+ For development:
86
+ ```bash
87
+ pip install -e .[dev]
88
+ ```
89
+
90
+ ## Quick Start
91
+ ### CLARANS
92
+ ```python
93
+ from clarans import CLARANS
94
+ from sklearn.datasets import make_blobs
95
+
96
+ # 1. Create dummy data
97
+ X, _ = make_blobs(n_samples=1000, centers=5, random_state=42)
98
+
99
+ # 2. Initialize CLARANS
100
+ # - n_clusters: 5 clusters
101
+ # - numlocal: 3 restarts for better quality
102
+ # - init: 'k-medoids++' for smart starting points
103
+ clarans = CLARANS(n_clusters=5, numlocal=3, init='k-medoids++', random_state=42)
104
+
105
+ # 3. Fit
106
+ clarans.fit(X)
107
+
108
+ # 4. Results
109
+ print("Medoid Indices:", clarans.medoid_indices_)
110
+ print("Labels:", clarans.labels_)
111
+ ```
112
+ ### FastCLARANS
113
+
114
+ For datasets that fit in memory, **FastCLARANS** can provide significant speedups by caching pairwise distances:
115
+
116
+ ```python
117
+ from clarans import FastCLARANS
118
+
119
+ fast_model = FastCLARANS(n_clusters=5, numlocal=3, random_state=42)
120
+ fast_model.fit(X)
121
+ ```
122
+
123
+ ## Examples
124
+
125
+ This repository includes a number of runnable examples in the `examples/` folder showing common usage patterns, integrations and a Jupyter notebook (`examples/clarans_examples.ipynb`) with many interactive recipes. Run any example with:
126
+
127
+ python examples/01_quick_start.py
128
+
129
+ ## Documentation
130
+
131
+ For full API reference and usage guides, please see the [Documentation](https://scikit-clarans.readthedocs.io/en/latest/index.html).
132
+
133
+ ## Contributing
134
+
135
+ Contributions are welcome! Please check out [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
136
+
137
+ ## License
138
+
139
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,85 @@
1
+ # scikit-clarans
2
+
3
+ > A scikit-learn compatible implementation of the **CLARANS** (Clustering Large Applications based on RANdomized Search) algorithm.
4
+
5
+ [![License](https://img.shields.io/github/license/ThienNguyen3001/scikit-clarans)](LICENSE)
6
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.18366802.svg)](https://doi.org/10.5281/zenodo.18366802)
7
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
8
+ [![Docs Build](https://img.shields.io/github/actions/workflow/status/ThienNguyen3001/scikit-clarans/docs-build.yml?branch=main&label=Docs%20Build)](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/docs-build.yml)
9
+ [![Test Suite](https://img.shields.io/github/actions/workflow/status/ThienNguyen3001/scikit-clarans/test_suite.yml?branch=main&label=Test%20Suite)](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/test_suite.yml)
10
+ [![Quality Check](https://img.shields.io/github/actions/workflow/status/ThienNguyen3001/scikit-clarans/lint_cov_check.yml?branch=main&label=Quality%20Check)](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/lint_cov_check.yml)
11
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JdgVaZcbS1uwY7kPQZM8DtX97R9ga31d?usp=sharing)
12
+
13
+ **CLARANS** acts as a bridge between the high quality of **PAM (Partition Around Medoids)** and the speed required for large datasets. By using randomized search instead of exhaustive search, it finds high-quality medoids efficiently without exploring the entire graph of solutions.
14
+
15
+ ---
16
+
17
+ ## Features
18
+
19
+ * **Scikit-Learn Native**: Use it just like `KMeans` or `DBSCAN`. Drop-in compatibility for pipelines and cross-validation.
20
+ * **Scalable**: Designed to handle datasets where standard PAM/k-medoids is too slow.
21
+ * **Flexible**: Choose from multiple initialization strategies (`k-medoids++`, `build`, etc.) and distance metrics (`euclidean`, `manhattan`, `cosine`, etc.).
22
+
23
+ ## Installation
24
+
25
+ Install simply via pip:
26
+
27
+ ```bash
28
+ pip install .
29
+ ```
30
+
31
+ For development
32
+ ```bash
33
+ pip install -e .[dev]
34
+ ```
35
+
36
+ ## Quick Start
37
+ ### CLARANS
38
+ ```python
39
+ from clarans import CLARANS
40
+ from sklearn.datasets import make_blobs
41
+
42
+ # 1. Create dummy data
43
+ X, _ = make_blobs(n_samples=1000, centers=5, random_state=42)
44
+
45
+ # 2. Initialize CLARANS
46
+ # - n_clusters: 5 clusters
47
+ # - numlocal: 3 restarts for better quality
48
+ # - init: 'k-medoids++' for smart starting points
49
+ clarans = CLARANS(n_clusters=5, numlocal=3, init='k-medoids++', random_state=42)
50
+
51
+ # 3. Fit
52
+ clarans.fit(X)
53
+
54
+ # 4. Results
55
+ print("Medoid Indices:", clarans.medoid_indices_)
56
+ print("Labels:", clarans.labels_)
57
+ ```
58
+ ### FastCLARANS
59
+
60
+ For datasets that fit in memory, **FastCLARANS** can provide significant speedups by caching pairwise distances:
61
+
62
+ ```python
63
+ from clarans import FastCLARANS
64
+
65
+ fast_model = FastCLARANS(n_clusters=5, numlocal=3, random_state=42)
66
+ fast_model.fit(X)
67
+ ```
68
+
69
+ ## Examples
70
+
71
+ This repository includes a number of runnable examples in the `examples/` folder showing common usage patterns, integrations and a Jupyter notebook (`examples/clarans_examples.ipynb`) with many interactive recipes. Run any example with::
72
+
73
+ python examples/01_quick_start.py
74
+
75
+ ## Documentation
76
+
77
+ For full API reference and usage guides, please see the [Documentation](https://scikit-clarans.readthedocs.io/en/latest/index.html).
78
+
79
+ ## Contributing
80
+
81
+ Contributions are welcome! Please check out [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
82
+
83
+ ## License
84
+
85
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,4 @@
1
+ from .clarans import CLARANS
2
+ from .fast_clarans import FastCLARANS
3
+
4
+ __all__ = ["CLARANS", "FastCLARANS"]
@@ -0,0 +1,393 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ import numpy as np
7
+ from numpy.typing import ArrayLike
8
+ from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
9
+ from sklearn.metrics import pairwise_distances_argmin_min, pairwise_distances
10
+ from sklearn.utils.validation import check_array, check_is_fitted, check_random_state
11
+
12
+ from .initialization import (
13
+ initialize_build,
14
+ initialize_heuristic,
15
+ initialize_k_medoids_plus_plus,
16
+ )
17
+ from .utils import calculate_cost
18
+
19
+ if TYPE_CHECKING:
20
+ from scipy.sparse import spmatrix
21
+
22
+
23
class CLARANS(ClusterMixin, TransformerMixin, BaseEstimator):
    """CLARANS (Clustering Large Applications based on RANdomized Search).

    A k-medoids-style estimator that performs ``numlocal`` independent
    local searches; each search repeatedly tries a random single-medoid
    swap and accepts it only if it lowers the total clustering cost.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form (also the number of medoids to
        generate).

    numlocal : int, default=2
        The number of local searches to perform.
        CLARANS runs the search process ``numlocal`` times starting from
        different random nodes to reduce the chance of getting stuck in
        poor local minima. Increasing this improves solution quality but
        increases runtime.

    maxneighbor : int, default=None
        The maximum number of neighbors (random swaps) to examine during
        each step. If ``None``, it defaults to ``max(250, 1.25% of k*(n-k))``.
        Higher values make the algorithm behave more like PAM (checking
        more neighbors); lower values make it faster but more random.

    max_iter : int, default=300
        The maximum number of successful swaps (improvements) allowed per
        local search. This acts as a safeguard against infinite loops.

    init : {'random', 'heuristic', 'k-medoids++', 'build', array-like}, default='random'
        Strategy for selecting initial medoids:

        - ``'random'``: Selects ``n_clusters`` random points. Fast but can
          result in poor starting points.
        - ``'heuristic'``: Selects points that are "central" to the data
          (minimizing distance to all others).
        - ``'k-medoids++'``: Optimized probabilistic initialization (similar
          to k-means++) for faster convergence.
        - ``'build'``: The greedy initialization from the original PAM
          algorithm. High quality but slow (O(N^2)).

    metric : str or callable, default='euclidean'
        The distance metric to use. Supports all metrics from
        ``sklearn.metrics.pairwise_distances`` (e.g., 'euclidean',
        'manhattan', 'cosine').

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers (medoids).

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    medoid_indices_ : ndarray of shape (n_clusters,)
        Indices of the medoids in the training set X.

    Notes
    -----
    - Time complexity: each local search evaluates up to ``maxneighbor``
      candidate swaps, and each cost evaluation is O(n * k) (distance
      to medoids), so the worst-case runtime is roughly
      O(numlocal * maxneighbor * n * k).
    - Initialization methods such as ``'heuristic'`` and ``'build'``
      may compute the full pairwise distance matrix and therefore have
      O(n^2) time and memory costs.
    - Compared with ``FastCLARANS``, this implementation avoids
      caching the full distance matrix and is more memory-friendly for
      very large datasets at the cost of repeated distance computations.

    References
    ----------
    Ng, R. T., & Han, J. (2002). CLARANS: A method for clustering objects for spatial data mining.
    IEEE transactions on knowledge and data engineering, 14(5), 1003-1016.

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from clarans import CLARANS
    >>> X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
    >>> model = CLARANS(n_clusters=3, random_state=0).fit(X)
    """

    def __init__(
        self,
        n_clusters=8,
        numlocal=2,
        maxneighbor=None,
        max_iter=300,
        init="random",
        metric="euclidean",
        random_state=None,
    ):
        # Per scikit-learn convention, __init__ only stores parameters;
        # all validation happens in fit().
        self.n_clusters = n_clusters
        self.numlocal = numlocal
        self.maxneighbor = maxneighbor
        self.max_iter = max_iter
        self.init = init
        self.metric = metric
        self.random_state = random_state

    def __sklearn_tags__(self):
        """Declare estimator capabilities for scikit-learn's check_estimator.

        CLARANS can accept CSR/CSC sparse matrices as input.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        return tags

    def fit(self, X: ArrayLike | "spmatrix", y: Any = None) -> "CLARANS":
        """
        Compute CLARANS clustering.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            Training instances to cluster. Accepts CSR/CSC sparse matrices.

        y : Ignored
            Not used, present here for API consistency.

        Returns
        -------
        self : CLARANS
            Fitted estimator. Attributes set on the estimator include
            ``medoid_indices_``, ``cluster_centers_``, and ``labels_``.

        Raises
        ------
        ValueError
            If ``n_clusters >= n_samples``, or if an explicit ``init`` array
            has an incompatible shape, or if not enough unique points exist
            to initialize the requested number of clusters.

        Notes
        -----
        - Time complexity: each local search evaluates up to ``maxneighbor``
          candidate swaps, and each cost evaluation is O(n * k) (distance
          to medoids), so the worst-case runtime is roughly
          O(numlocal * maxneighbor * n * k).
        - Initialization methods such as ``'heuristic'`` and ``'build'``
          may compute the full pairwise distance matrix and therefore have
          O(n^2) time and memory costs.
        - Compared with ``FastCLARANS``, this implementation avoids
          caching the full distance matrix and is more memory-friendly for
          very large datasets at the cost of repeated distance computations.

        Examples
        --------
        >>> from sklearn.datasets import make_blobs
        >>> from clarans import CLARANS
        >>> X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
        >>> model = CLARANS(n_clusters=3, random_state=0).fit(X)
        """
        # Input validation across sklearn versions: prefer the modern
        # validate_data helper (sklearn >= 1.6), then the private
        # _validate_data method, then plain check_array (in which case
        # n_features_in_ must be set manually).
        try:
            from sklearn.utils.validation import validate_data

            X = validate_data(
                self, X=X, ensure_min_samples=2, accept_sparse=["csr", "csc"]
            )
        except ImportError:
            if hasattr(self, "_validate_data"):
                X = self._validate_data(
                    X, ensure_min_samples=2, accept_sparse=["csr", "csc"]
                )
            else:
                X = check_array(X, ensure_min_samples=2, accept_sparse=["csr", "csc"])
                self.n_features_in_ = X.shape[1]

        random_state = check_random_state(self.random_state)
        n_samples, n_features = X.shape

        if self.n_clusters >= n_samples:
            raise ValueError("n_clusters must be less than n_samples")

        # Default from the CLARANS paper: examine at least 250 neighbors,
        # or 1.25% of the k*(n-k) possible single-medoid swaps.
        if self.maxneighbor is None:
            self.maxneighbor_ = max(
                250, int(0.0125 * self.n_clusters * (n_samples - self.n_clusters))
            )
        else:
            self.maxneighbor_ = self.maxneighbor

        best_cost = np.inf
        best_medoids = np.empty(self.n_clusters, dtype=int)
        self.n_iter_ = 0

        all_indices = np.arange(n_samples)

        # One iteration per local search (restart from a fresh node).
        for loc_idx in range(self.numlocal):
            # --- Select the starting medoid set for this local search ---
            if isinstance(self.init, str) and self.init == "random":
                current_medoids_indices = random_state.choice(
                    n_samples, self.n_clusters, replace=False
                )
            elif isinstance(self.init, str) and self.init == "k-medoids++":
                current_medoids_indices = initialize_k_medoids_plus_plus(
                    X, self.n_clusters, random_state, self.metric
                )
            elif isinstance(self.init, str) and self.init == "heuristic":
                current_medoids_indices = initialize_heuristic(
                    X, self.n_clusters, self.metric
                )
            elif isinstance(self.init, str) and self.init == "build":
                current_medoids_indices = initialize_build(
                    X, self.n_clusters, self.metric
                )
            elif hasattr(self.init, "__array__") or isinstance(self.init, list):
                # Explicit centers: map each given center to its nearest
                # point in X, since medoids must be actual data points.
                init_centers = check_array(self.init)
                if init_centers.shape != (self.n_clusters, n_features):
                    raise ValueError(
                        f"init array must be of shape ({self.n_clusters}, {n_features})"
                    )

                current_medoids_indices, _ = pairwise_distances_argmin_min(
                    init_centers, X, metric=self.metric
                )

                current_medoids_indices = np.array(current_medoids_indices, dtype=int)

                # Distinct centers may snap to the same data point; dedupe
                # and top up with random non-medoid points if needed.
                current_medoids_indices = np.unique(current_medoids_indices)

                if len(current_medoids_indices) < self.n_clusters:
                    warnings.warn(
                        "Provided init centers map to duplicate points in X. "
                        "Filling duplicates with random points."
                    )
                    remaining = self.n_clusters - len(current_medoids_indices)
                    available = np.setdiff1d(
                        all_indices, current_medoids_indices, assume_unique=True
                    )

                    if len(available) < remaining:
                        raise ValueError(
                            "Not enough unique points to fill up to n_clusters."
                        )

                    fillers = random_state.choice(available, remaining, replace=False)
                    current_medoids_indices = np.concatenate(
                        [current_medoids_indices, fillers]
                    )
            else:
                raise ValueError(f"Unknown init method: {self.init}")

            current_medoids_indices = np.array(current_medoids_indices, dtype=int)

            current_cost = calculate_cost(X, current_medoids_indices, self.metric)

            # i counts consecutive non-improving swap attempts;
            # iter_count counts accepted (improving) swaps.
            i = 0
            iter_count = 0

            while i < self.maxneighbor_:
                if self.max_iter is not None and iter_count >= self.max_iter:
                    break

                # Propose a random neighbor: replace one random medoid
                # with one random non-medoid point.
                random_medoid_pos = random_state.randint(0, self.n_clusters)

                mask = np.ones(n_samples, dtype=bool)
                mask[current_medoids_indices] = False
                available_candidates = np.flatnonzero(mask)

                if available_candidates.size == 0:
                    break

                random_non_medoid_candidate = random_state.choice(available_candidates)

                neighbor_medoids_indices = current_medoids_indices.copy()
                neighbor_medoids_indices[random_medoid_pos] = (
                    random_non_medoid_candidate
                )

                neighbor_cost = calculate_cost(X, neighbor_medoids_indices, self.metric)

                if neighbor_cost < current_cost:
                    # Improvement found: move to the neighbor and reset the
                    # failed-attempt counter, per the CLARANS algorithm.
                    current_medoids_indices = neighbor_medoids_indices
                    current_cost = neighbor_cost
                    i = 0
                    iter_count += 1
                else:
                    i += 1

            # Count at least one iteration even if no swap was accepted.
            self.n_iter_ += max(1, iter_count)

            if current_cost < best_cost:
                best_cost = current_cost
                best_medoids = current_medoids_indices

        self.medoid_indices_ = best_medoids
        self.cluster_centers_ = X[self.medoid_indices_]

        # Final assignment: label each sample with its nearest medoid.
        self.labels_, _ = pairwise_distances_argmin_min(
            X, self.cluster_centers_, metric=self.metric
        )

        return self

    def predict(self, X: ArrayLike | "spmatrix") -> np.ndarray:
        """
        Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            New data to predict. Accepts CSR/CSC sparse matrices.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.

        Raises
        ------
        ValueError
            If the number of features in ``X`` does not match the number of
            features seen during fitting.

        Notes
        -----
        This method uses ``pairwise_distances_argmin_min`` from scikit-learn
        to assign each sample to the nearest medoid.
        """
        check_is_fitted(self)

        # Same version-compatibility cascade as fit(); reset=False keeps
        # the n_features_in_ recorded at fit time.
        try:
            from sklearn.utils.validation import validate_data

            X = validate_data(self, X=X, reset=False, accept_sparse=["csr", "csc"])
        except ImportError:
            if hasattr(self, "_validate_data"):
                X = self._validate_data(X, reset=False, accept_sparse=["csr", "csc"])
            else:
                X = check_array(X, accept_sparse=["csr", "csc"])
                # check_array alone does not verify feature count.
                if (
                    hasattr(self, "n_features_in_")
                    and X.shape[1] != self.n_features_in_
                ):
                    raise ValueError(
                        f"X has {X.shape[1]} features, but CLARANS is expecting "
                        f"{self.n_features_in_} features as input"
                    )

        labels, _ = pairwise_distances_argmin_min(
            X, self.cluster_centers_, metric=self.metric
        )
        return labels

    def transform(self, X: ArrayLike | "spmatrix") -> np.ndarray:
        """
        Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster centers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to transform.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_clusters)
            X transformed in the new space.
        """
        check_is_fitted(self)

        # Same version-compatibility cascade as fit().
        try:
            from sklearn.utils.validation import validate_data
            X = validate_data(self, X=X, reset=False, accept_sparse=["csr", "csc"])
        except ImportError:
            if hasattr(self, "_validate_data"):
                X = self._validate_data(X, reset=False, accept_sparse=["csr", "csc"])
            else:
                X = check_array(X, accept_sparse=["csr", "csc"])

        # One column per medoid: distance from each sample to that medoid.
        return pairwise_distances(X, self.cluster_centers_, metric=self.metric)