scikit-clarans 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_clarans-0.1.1/LICENSE +21 -0
- scikit_clarans-0.1.1/MANIFEST.in +5 -0
- scikit_clarans-0.1.1/PKG-INFO +139 -0
- scikit_clarans-0.1.1/README.md +85 -0
- scikit_clarans-0.1.1/clarans/__init__.py +4 -0
- scikit_clarans-0.1.1/clarans/clarans.py +393 -0
- scikit_clarans-0.1.1/clarans/fast_clarans.py +325 -0
- scikit_clarans-0.1.1/clarans/initialization.py +183 -0
- scikit_clarans-0.1.1/clarans/tests/__init__.py +0 -0
- scikit_clarans-0.1.1/clarans/tests/test_algorithm_logic.py +433 -0
- scikit_clarans-0.1.1/clarans/tests/test_clarans.py +91 -0
- scikit_clarans-0.1.1/clarans/tests/test_common.py +11 -0
- scikit_clarans-0.1.1/clarans/tests/test_fast_clarans.py +71 -0
- scikit_clarans-0.1.1/clarans/tests/test_logic_verification.py +505 -0
- scikit_clarans-0.1.1/clarans/utils.py +27 -0
- scikit_clarans-0.1.1/docs/source/api.rst +33 -0
- scikit_clarans-0.1.1/docs/source/contributing.rst +22 -0
- scikit_clarans-0.1.1/docs/source/examples.rst +49 -0
- scikit_clarans-0.1.1/docs/source/gallery/2d_clustering.rst +134 -0
- scikit_clarans-0.1.1/docs/source/gallery/comparison.rst +60 -0
- scikit_clarans-0.1.1/docs/source/gallery/performance.rst +125 -0
- scikit_clarans-0.1.1/docs/source/gallery/quality_vs_k.rst +64 -0
- scikit_clarans-0.1.1/docs/source/index.rst +53 -0
- scikit_clarans-0.1.1/docs/source/installation.rst +36 -0
- scikit_clarans-0.1.1/docs/source/license.rst +8 -0
- scikit_clarans-0.1.1/docs/source/project.rst +17 -0
- scikit_clarans-0.1.1/docs/source/usage.rst +75 -0
- scikit_clarans-0.1.1/examples/01_quick_start.py +47 -0
- scikit_clarans-0.1.1/examples/02_compare_initializations.py +53 -0
- scikit_clarans-0.1.1/examples/03_metrics_demo.py +50 -0
- scikit_clarans-0.1.1/examples/04_sparse_input.py +31 -0
- scikit_clarans-0.1.1/examples/05_pipeline_gridsearch.py +69 -0
- scikit_clarans-0.1.1/examples/06_predict_new_data.py +33 -0
- scikit_clarans-0.1.1/examples/07_custom_init_centers.py +30 -0
- scikit_clarans-0.1.1/examples/08_performance_tuning.py +49 -0
- scikit_clarans-0.1.1/examples/09_compare_fastclarans_clarans.py +113 -0
- scikit_clarans-0.1.1/examples/10_transform_data.py +59 -0
- scikit_clarans-0.1.1/examples/archive/gallery_anisotropic.py +35 -0
- scikit_clarans-0.1.1/examples/archive/gallery_blobs.py +29 -0
- scikit_clarans-0.1.1/examples/archive/gallery_comparison.py +42 -0
- scikit_clarans-0.1.1/examples/archive/gallery_moons.py +31 -0
- scikit_clarans-0.1.1/examples/archive/gallery_parameter_sensitivity.py +57 -0
- scikit_clarans-0.1.1/examples/archive/gallery_runtime.py +51 -0
- scikit_clarans-0.1.1/examples/archive/gallery_silhouette.py +49 -0
- scikit_clarans-0.1.1/examples/clarans_examples.ipynb +963 -0
- scikit_clarans-0.1.1/scikit_clarans.egg-info/PKG-INFO +139 -0
- scikit_clarans-0.1.1/scikit_clarans.egg-info/SOURCES.txt +50 -0
- scikit_clarans-0.1.1/scikit_clarans.egg-info/dependency_links.txt +1 -0
- scikit_clarans-0.1.1/scikit_clarans.egg-info/requires.txt +23 -0
- scikit_clarans-0.1.1/scikit_clarans.egg-info/top_level.txt +1 -0
- scikit_clarans-0.1.1/setup.cfg +4 -0
- scikit_clarans-0.1.1/setup.py +55 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nguyễn Ngọc Thiện
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scikit-clarans
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A scikit-learn compatible implementation of CLARANS clustering algorithm
|
|
5
|
+
Home-page: https://github.com/ThienNguyen3001/scikit-clarans
|
|
6
|
+
Author: ThienNguyen3001
|
|
7
|
+
Author-email: thiennguyen03001@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: clustering sklearn scikit-learn clarans k-medoids
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: scikit-learn
|
|
23
|
+
Requires-Dist: scipy
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
27
|
+
Requires-Dist: flake8; extra == "dev"
|
|
28
|
+
Requires-Dist: sphinx>=5.0; extra == "dev"
|
|
29
|
+
Requires-Dist: sphinx-rtd-theme; extra == "dev"
|
|
30
|
+
Requires-Dist: sphinx-copybutton; extra == "dev"
|
|
31
|
+
Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
|
|
32
|
+
Provides-Extra: test
|
|
33
|
+
Requires-Dist: pytest; extra == "test"
|
|
34
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
35
|
+
Requires-Dist: flake8; extra == "test"
|
|
36
|
+
Provides-Extra: docs
|
|
37
|
+
Requires-Dist: sphinx>=5.0; extra == "docs"
|
|
38
|
+
Requires-Dist: sphinx-rtd-theme; extra == "docs"
|
|
39
|
+
Requires-Dist: sphinx-copybutton; extra == "docs"
|
|
40
|
+
Requires-Dist: sphinx-autodoc-typehints; extra == "docs"
|
|
41
|
+
Dynamic: author
|
|
42
|
+
Dynamic: author-email
|
|
43
|
+
Dynamic: classifier
|
|
44
|
+
Dynamic: description
|
|
45
|
+
Dynamic: description-content-type
|
|
46
|
+
Dynamic: home-page
|
|
47
|
+
Dynamic: keywords
|
|
48
|
+
Dynamic: license
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
Dynamic: provides-extra
|
|
51
|
+
Dynamic: requires-dist
|
|
52
|
+
Dynamic: requires-python
|
|
53
|
+
Dynamic: summary
|
|
54
|
+
|
|
55
|
+
# scikit-clarans
|
|
56
|
+
|
|
57
|
+
> A scikit-learn compatible implementation of the **CLARANS** (Clustering Large Applications based on RANdomized Search) algorithm.
|
|
58
|
+
|
|
59
|
+
[](LICENSE)
|
|
60
|
+
[](https://doi.org/10.5281/zenodo.18366802)
|
|
61
|
+
[](https://www.python.org/downloads/)
|
|
62
|
+
[](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/docs-build.yml)
|
|
63
|
+
[](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/test_suite.yml)
|
|
64
|
+
[](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/lint_cov_check.yml)
|
|
65
|
+
[](https://colab.research.google.com/drive/1JdgVaZcbS1uwY7kPQZM8DtX97R9ga31d?usp=sharing)
|
|
66
|
+
|
|
67
|
+
**CLARANS** acts as a bridge between the high quality of **PAM (Partition Around Medoids)** and the speed required for large datasets. By using randomized search instead of exhaustive search, it finds high-quality medoids efficiently without exploring the entire graph of solutions.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Features
|
|
72
|
+
|
|
73
|
+
* **Scikit-Learn Native**: Use it just like `KMeans` or `DBSCAN`. Drop-in compatibility for pipelines and cross-validation.
|
|
74
|
+
* **Scalable**: Designed to handle datasets where standard PAM/k-medoids is too slow.
|
|
75
|
+
* **Flexible**: Choose from multiple initialization strategies (`k-medoids++`, `build`, etc.) and distance metrics (`euclidean`, `manhattan`, `cosine`, etc.).
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
Install from a local checkout of the repository via pip:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install .
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
For development:
|
|
86
|
+
```bash
|
|
87
|
+
pip install -e .[dev]
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Quick Start
|
|
91
|
+
### CLARANS
|
|
92
|
+
```python
|
|
93
|
+
from clarans import CLARANS
|
|
94
|
+
from sklearn.datasets import make_blobs
|
|
95
|
+
|
|
96
|
+
# 1. Create dummy data
|
|
97
|
+
X, _ = make_blobs(n_samples=1000, centers=5, random_state=42)
|
|
98
|
+
|
|
99
|
+
# 2. Initialize CLARANS
|
|
100
|
+
# - n_clusters: 5 clusters
|
|
101
|
+
# - numlocal: 3 restarts for better quality
|
|
102
|
+
# - init: 'k-medoids++' for smart starting points
|
|
103
|
+
clarans = CLARANS(n_clusters=5, numlocal=3, init='k-medoids++', random_state=42)
|
|
104
|
+
|
|
105
|
+
# 3. Fit
|
|
106
|
+
clarans.fit(X)
|
|
107
|
+
|
|
108
|
+
# 4. Results
|
|
109
|
+
print("Medoid Indices:", clarans.medoid_indices_)
|
|
110
|
+
print("Labels:", clarans.labels_)
|
|
111
|
+
```
|
|
112
|
+
### FastCLARANS
|
|
113
|
+
|
|
114
|
+
For datasets that fit in memory, **FastCLARANS** can provide significant speedups by caching pairwise distances:
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from clarans import FastCLARANS
|
|
118
|
+
|
|
119
|
+
fast_model = FastCLARANS(n_clusters=5, numlocal=3, random_state=42)
|
|
120
|
+
fast_model.fit(X)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Examples
|
|
124
|
+
|
|
125
|
+
This repository includes a number of runnable examples in the `examples/` folder showing common usage patterns, integrations, and a Jupyter notebook (`examples/clarans_examples.ipynb`) with many interactive recipes. Run any example with:
|
|
126
|
+
|
|
127
|
+
python examples/01_quick_start.py
|
|
128
|
+
|
|
129
|
+
## Documentation
|
|
130
|
+
|
|
131
|
+
For full API reference and usage guides, please see the [Documentation](https://scikit-clarans.readthedocs.io/en/latest/index.html).
|
|
132
|
+
|
|
133
|
+
## Contributing
|
|
134
|
+
|
|
135
|
+
Contributions are welcome! Please check out [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# scikit-clarans
|
|
2
|
+
|
|
3
|
+
> A scikit-learn compatible implementation of the **CLARANS** (Clustering Large Applications based on RANdomized Search) algorithm.
|
|
4
|
+
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
[](https://doi.org/10.5281/zenodo.18366802)
|
|
7
|
+
[](https://www.python.org/downloads/)
|
|
8
|
+
[](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/docs-build.yml)
|
|
9
|
+
[](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/test_suite.yml)
|
|
10
|
+
[](https://github.com/ThienNguyen3001/scikit-clarans/actions/workflows/lint_cov_check.yml)
|
|
11
|
+
[](https://colab.research.google.com/drive/1JdgVaZcbS1uwY7kPQZM8DtX97R9ga31d?usp=sharing)
|
|
12
|
+
|
|
13
|
+
**CLARANS** acts as a bridge between the high quality of **PAM (Partition Around Medoids)** and the speed required for large datasets. By using randomized search instead of exhaustive search, it finds high-quality medoids efficiently without exploring the entire graph of solutions.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
* **Scikit-Learn Native**: Use it just like `KMeans` or `DBSCAN`. Drop-in compatibility for pipelines and cross-validation.
|
|
20
|
+
* **Scalable**: Designed to handle datasets where standard PAM/k-medoids is too slow.
|
|
21
|
+
* **Flexible**: Choose from multiple initialization strategies (`k-medoids++`, `build`, etc.) and distance metrics (`euclidean`, `manhattan`, `cosine`, etc.).
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
Install from a local checkout of the repository via pip:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
For development:
|
|
32
|
+
```bash
|
|
33
|
+
pip install -e .[dev]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
### CLARANS
|
|
38
|
+
```python
|
|
39
|
+
from clarans import CLARANS
|
|
40
|
+
from sklearn.datasets import make_blobs
|
|
41
|
+
|
|
42
|
+
# 1. Create dummy data
|
|
43
|
+
X, _ = make_blobs(n_samples=1000, centers=5, random_state=42)
|
|
44
|
+
|
|
45
|
+
# 2. Initialize CLARANS
|
|
46
|
+
# - n_clusters: 5 clusters
|
|
47
|
+
# - numlocal: 3 restarts for better quality
|
|
48
|
+
# - init: 'k-medoids++' for smart starting points
|
|
49
|
+
clarans = CLARANS(n_clusters=5, numlocal=3, init='k-medoids++', random_state=42)
|
|
50
|
+
|
|
51
|
+
# 3. Fit
|
|
52
|
+
clarans.fit(X)
|
|
53
|
+
|
|
54
|
+
# 4. Results
|
|
55
|
+
print("Medoid Indices:", clarans.medoid_indices_)
|
|
56
|
+
print("Labels:", clarans.labels_)
|
|
57
|
+
```
|
|
58
|
+
### FastCLARANS
|
|
59
|
+
|
|
60
|
+
For datasets that fit in memory, **FastCLARANS** can provide significant speedups by caching pairwise distances:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from clarans import FastCLARANS
|
|
64
|
+
|
|
65
|
+
fast_model = FastCLARANS(n_clusters=5, numlocal=3, random_state=42)
|
|
66
|
+
fast_model.fit(X)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Examples
|
|
70
|
+
|
|
71
|
+
This repository includes a number of runnable examples in the `examples/` folder showing common usage patterns, integrations, and a Jupyter notebook (`examples/clarans_examples.ipynb`) with many interactive recipes. Run any example with:
|
|
72
|
+
|
|
73
|
+
python examples/01_quick_start.py
|
|
74
|
+
|
|
75
|
+
## Documentation
|
|
76
|
+
|
|
77
|
+
For full API reference and usage guides, please see the [Documentation](https://scikit-clarans.readthedocs.io/en/latest/index.html).
|
|
78
|
+
|
|
79
|
+
## Contributing
|
|
80
|
+
|
|
81
|
+
Contributions are welcome! Please check out [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
82
|
+
|
|
83
|
+
## License
|
|
84
|
+
|
|
85
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from numpy.typing import ArrayLike
|
|
8
|
+
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
|
|
9
|
+
from sklearn.metrics import pairwise_distances_argmin_min, pairwise_distances
|
|
10
|
+
from sklearn.utils.validation import check_array, check_is_fitted, check_random_state
|
|
11
|
+
|
|
12
|
+
from .initialization import (
|
|
13
|
+
initialize_build,
|
|
14
|
+
initialize_heuristic,
|
|
15
|
+
initialize_k_medoids_plus_plus,
|
|
16
|
+
)
|
|
17
|
+
from .utils import calculate_cost
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from scipy.sparse import spmatrix
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CLARANS(ClusterMixin, TransformerMixin, BaseEstimator):
    """CLARANS clustering (Clustering Large Applications based on RANdomized Search).

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form (also the number of medoids to
        generate).

    numlocal : int, default=2
        The number of local searches to perform.
        CLARANS runs the search process ``numlocal`` times starting from
        different random nodes to reduce the chance of getting stuck in
        poor local minima. Increasing this improves solution quality but
        increases runtime.

    maxneighbor : int, default=None
        The maximum number of neighbors (random swaps) to examine during
        each step. If ``None``, it defaults to ``max(250, 1.25% of k*(n-k))``.
        Higher values make the algorithm behave more like PAM (checking
        more neighbors); lower values make it faster but more random.

    max_iter : int, default=300
        The maximum number of successful swaps (improvements) allowed per
        local search. This acts as a safeguard against infinite loops.

    init : {'random', 'heuristic', 'k-medoids++', 'build', array-like}, default='random'
        Strategy for selecting initial medoids:

        - ``'random'``: Selects ``n_clusters`` random points. Fast but can
          result in poor starting points.
        - ``'heuristic'``: Selects points that are "central" to the data
          (minimizing distance to all others).
        - ``'k-medoids++'``: Optimized probabilistic initialization (similar
          to k-means++) for faster convergence.
        - ``'build'``: The greedy initialization from the original PAM
          algorithm. High quality but slow (O(N^2)).

    metric : str or callable, default='euclidean'
        The distance metric to use. Supports all metrics from
        ``sklearn.metrics.pairwise_distances`` (e.g., 'euclidean',
        'manhattan', 'cosine').

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers (medoids).

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    medoid_indices_ : ndarray of shape (n_clusters,)
        Indices of the medoids in the training set X.

    Notes
    -----
    - Time complexity: each local search evaluates up to ``maxneighbor``
      candidate swaps, and each cost evaluation is O(n * k) (distance
      to medoids), so the worst-case runtime is roughly
      O(numlocal * maxneighbor * n * k).
    - Initialization methods such as ``'heuristic'`` and ``'build'``
      may compute the full pairwise distance matrix and therefore have
      O(n^2) time and memory costs.
    - Compared with ``FastCLARANS``, this implementation avoids
      caching the full distance matrix and is more memory-friendly for
      very large datasets at the cost of repeated distance computations.

    References
    ----------
    Ng, R. T., & Han, J. (2002). CLARANS: A method for clustering objects for spatial data mining.
    IEEE transactions on knowledge and data engineering, 14(5), 1003-1016.

    Examples
    --------
    >>> import numpy as np
    >>> from clarans import CLARANS
    >>> X = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0],
    ...               [5.0, 6.0], [10.0, 0.0], [10.0, 1.0]])
    >>> model = CLARANS(n_clusters=3, random_state=0).fit(X)
    """

    def __init__(
        self,
        n_clusters=8,
        numlocal=2,
        maxneighbor=None,
        max_iter=300,
        init="random",
        metric="euclidean",
        random_state=None,
    ):
        self.n_clusters = n_clusters
        self.numlocal = numlocal
        self.maxneighbor = maxneighbor
        self.max_iter = max_iter
        self.init = init
        self.metric = metric
        self.random_state = random_state

    def __sklearn_tags__(self):
        """Declare estimator capabilities for scikit-learn's check_estimator.

        CLARANS can accept CSR/CSC sparse matrices as input.
        """
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        return tags

    def _checked_array(self, X, reset, **check_params):
        """Validate ``X`` consistently across scikit-learn versions.

        Tries ``validate_data`` (sklearn >= 1.6), then the estimator's
        ``_validate_data`` (older sklearn), and finally a plain
        ``check_array``. In every path ``n_features_in_`` is set when
        ``reset`` is True and checked against the fitted value otherwise,
        so ``predict`` and ``transform`` reject wrong-width input uniformly.

        Parameters
        ----------
        X : array-like or sparse matrix
            Input data to validate. CSR/CSC sparse matrices are accepted.
        reset : bool
            Whether to (re)set ``n_features_in_`` (True during ``fit``)
            or verify it (False during ``predict``/``transform``).
        **check_params : dict
            Extra keyword arguments forwarded to the validation function
            (e.g. ``ensure_min_samples``).

        Returns
        -------
        X : validated array or sparse matrix.
        """
        try:
            from sklearn.utils.validation import validate_data

            return validate_data(
                self, X=X, reset=reset, accept_sparse=["csr", "csc"], **check_params
            )
        except ImportError:
            if hasattr(self, "_validate_data"):
                return self._validate_data(
                    X, reset=reset, accept_sparse=["csr", "csc"], **check_params
                )

            X = check_array(X, accept_sparse=["csr", "csc"], **check_params)
            if reset:
                self.n_features_in_ = X.shape[1]
            elif (
                hasattr(self, "n_features_in_")
                and X.shape[1] != self.n_features_in_
            ):
                raise ValueError(
                    f"X has {X.shape[1]} features, but CLARANS is expecting "
                    f"{self.n_features_in_} features as input"
                )
            return X

    def _initial_medoids(self, X, n_samples, n_features, random_state, all_indices):
        """Select the starting medoid indices for one local search.

        Dispatches on ``self.init`` (string strategy or explicit array of
        centers). For an explicit array, each center is mapped to its
        nearest point in ``X``; duplicate mappings are topped up with
        random unique points, with a warning.

        Returns
        -------
        medoids : ndarray of shape (n_clusters,), dtype=int
            Indices into ``X`` of the initial medoids.

        Raises
        ------
        ValueError
            If the init array shape is wrong, there are not enough unique
            points, or the init method is unknown.
        """
        if isinstance(self.init, str) and self.init == "random":
            medoids = random_state.choice(n_samples, self.n_clusters, replace=False)
        elif isinstance(self.init, str) and self.init == "k-medoids++":
            medoids = initialize_k_medoids_plus_plus(
                X, self.n_clusters, random_state, self.metric
            )
        elif isinstance(self.init, str) and self.init == "heuristic":
            medoids = initialize_heuristic(X, self.n_clusters, self.metric)
        elif isinstance(self.init, str) and self.init == "build":
            medoids = initialize_build(X, self.n_clusters, self.metric)
        elif hasattr(self.init, "__array__") or isinstance(self.init, list):
            init_centers = check_array(self.init)
            if init_centers.shape != (self.n_clusters, n_features):
                raise ValueError(
                    f"init array must be of shape ({self.n_clusters}, {n_features})"
                )

            # Snap each user-provided center onto its nearest point in X:
            # medoids must be actual data points.
            medoids, _ = pairwise_distances_argmin_min(
                init_centers, X, metric=self.metric
            )
            medoids = np.unique(np.array(medoids, dtype=int))

            if len(medoids) < self.n_clusters:
                warnings.warn(
                    "Provided init centers map to duplicate points in X. "
                    "Filling duplicates with random points."
                )
                remaining = self.n_clusters - len(medoids)
                available = np.setdiff1d(all_indices, medoids, assume_unique=True)

                if len(available) < remaining:
                    raise ValueError(
                        "Not enough unique points to fill up to n_clusters."
                    )

                fillers = random_state.choice(available, remaining, replace=False)
                medoids = np.concatenate([medoids, fillers])
        else:
            raise ValueError(f"Unknown init method: {self.init}")

        return np.array(medoids, dtype=int)

    def fit(self, X: ArrayLike | "spmatrix", y: Any = None) -> "CLARANS":
        """
        Compute CLARANS clustering.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            Training instances to cluster. Accepts CSR/CSC sparse matrices.

        y : Ignored
            Not used, present here for API consistency.

        Returns
        -------
        self : CLARANS
            Fitted estimator. Attributes set on the estimator include
            ``medoid_indices_``, ``cluster_centers_``, and ``labels_``.

        Raises
        ------
        ValueError
            If ``n_clusters >= n_samples``, or if an explicit ``init`` array
            has an incompatible shape, or if not enough unique points exist
            to initialize the requested number of clusters.

        Notes
        -----
        - Time complexity: each local search evaluates up to ``maxneighbor``
          candidate swaps, and each cost evaluation is O(n * k) (distance
          to medoids), so the worst-case runtime is roughly
          O(numlocal * maxneighbor * n * k).
        - Initialization methods such as ``'heuristic'`` and ``'build'``
          may compute the full pairwise distance matrix and therefore have
          O(n^2) time and memory costs.
        - Compared with ``FastCLARANS``, this implementation avoids
          caching the full distance matrix and is more memory-friendly for
          very large datasets at the cost of repeated distance computations.
        """
        X = self._checked_array(X, reset=True, ensure_min_samples=2)

        random_state = check_random_state(self.random_state)
        n_samples, n_features = X.shape

        if self.n_clusters >= n_samples:
            raise ValueError("n_clusters must be less than n_samples")

        # Default neighbor budget from the CLARANS paper: 1.25% of the
        # k*(n-k) possible swaps, floored at 250.
        if self.maxneighbor is None:
            self.maxneighbor_ = max(
                250, int(0.0125 * self.n_clusters * (n_samples - self.n_clusters))
            )
        else:
            self.maxneighbor_ = self.maxneighbor

        best_cost = np.inf
        best_medoids = np.empty(self.n_clusters, dtype=int)
        self.n_iter_ = 0

        all_indices = np.arange(n_samples)

        for _ in range(self.numlocal):
            current_medoids = self._initial_medoids(
                X, n_samples, n_features, random_state, all_indices
            )
            current_cost = calculate_cost(X, current_medoids, self.metric)

            rejected = 0  # consecutive non-improving neighbors examined
            accepted = 0  # improving swaps taken in this local search

            # Local search: repeatedly try a random (medoid, non-medoid)
            # swap; accept it if it lowers the total cost. Stop after
            # maxneighbor_ consecutive rejections or max_iter acceptances.
            while rejected < self.maxneighbor_:
                if self.max_iter is not None and accepted >= self.max_iter:
                    break

                swap_pos = random_state.randint(0, self.n_clusters)

                mask = np.ones(n_samples, dtype=bool)
                mask[current_medoids] = False
                candidates = np.flatnonzero(mask)

                if candidates.size == 0:
                    break

                candidate = random_state.choice(candidates)

                neighbor = current_medoids.copy()
                neighbor[swap_pos] = candidate

                neighbor_cost = calculate_cost(X, neighbor, self.metric)

                if neighbor_cost < current_cost:
                    current_medoids = neighbor
                    current_cost = neighbor_cost
                    rejected = 0
                    accepted += 1
                else:
                    rejected += 1

            # Count at least one iteration per local search even if no
            # swap was ever accepted.
            self.n_iter_ += max(1, accepted)

            if current_cost < best_cost:
                best_cost = current_cost
                best_medoids = current_medoids

        self.medoid_indices_ = best_medoids
        self.cluster_centers_ = X[self.medoid_indices_]

        self.labels_, _ = pairwise_distances_argmin_min(
            X, self.cluster_centers_, metric=self.metric
        )

        return self

    def predict(self, X: ArrayLike | "spmatrix") -> np.ndarray:
        """
        Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            New data to predict. Accepts CSR/CSC sparse matrices.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.

        Raises
        ------
        ValueError
            If the number of features in ``X`` does not match the number of
            features seen during fitting.

        Notes
        -----
        This method uses ``pairwise_distances_argmin_min`` from scikit-learn
        to assign each sample to the nearest medoid.
        """
        check_is_fitted(self)
        X = self._checked_array(X, reset=False)

        labels, _ = pairwise_distances_argmin_min(
            X, self.cluster_centers_, metric=self.metric
        )
        return labels

    def transform(self, X: ArrayLike | "spmatrix") -> np.ndarray:
        """
        Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster centers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to transform.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_clusters)
            X transformed in the new space.

        Raises
        ------
        ValueError
            If the number of features in ``X`` does not match the number of
            features seen during fitting.
        """
        check_is_fitted(self)
        X = self._checked_array(X, reset=False)

        return pairwise_distances(X, self.cluster_centers_, metric=self.metric)
|