sbcluster 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sbcluster-0.1.0/.gitlab-ci.yml +39 -0
- sbcluster-0.1.0/LICENSE +21 -0
- sbcluster-0.1.0/PKG-INFO +12 -0
- sbcluster-0.1.0/README.md +78 -0
- sbcluster-0.1.0/docs/Makefile +20 -0
- sbcluster-0.1.0/docs/make.bat +35 -0
- sbcluster-0.1.0/docs/source/_static/.gitkeep +0 -0
- sbcluster-0.1.0/docs/source/_templates/autosummary/class.rst +8 -0
- sbcluster-0.1.0/docs/source/conf.py +46 -0
- sbcluster-0.1.0/docs/source/index.rst +71 -0
- sbcluster-0.1.0/pyproject.toml +41 -0
- sbcluster-0.1.0/sbcluster/__init__.py +4 -0
- sbcluster-0.1.0/sbcluster/_bridges.py +269 -0
- sbcluster-0.1.0/sbcluster/_defs.py +95 -0
- sbcluster-0.1.0/sbcluster/_kmeans.py +159 -0
- sbcluster-0.1.0/sbcluster/_spectral.py +84 -0
- sbcluster-0.1.0/sbcluster.egg-info/PKG-INFO +12 -0
- sbcluster-0.1.0/sbcluster.egg-info/SOURCES.txt +20 -0
- sbcluster-0.1.0/sbcluster.egg-info/dependency_links.txt +1 -0
- sbcluster-0.1.0/sbcluster.egg-info/requires.txt +4 -0
- sbcluster-0.1.0/sbcluster.egg-info/top_level.txt +1 -0
- sbcluster-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
stages:
|
|
2
|
+
- lint
|
|
3
|
+
- deploy
|
|
4
|
+
- publish
|
|
5
|
+
|
|
6
|
+
lint:
|
|
7
|
+
image: python:latest
|
|
8
|
+
stage: lint
|
|
9
|
+
script:
|
|
10
|
+
- pip install ruff
|
|
11
|
+
- ruff format sbcluster
|
|
12
|
+
- ruff check sbcluster
|
|
13
|
+
rules:
|
|
14
|
+
- if: $CI_COMMIT_BRANCH
|
|
15
|
+
|
|
16
|
+
pages:
|
|
17
|
+
image: python:latest
|
|
18
|
+
stage: deploy
|
|
19
|
+
script:
|
|
20
|
+
- apt-get update && apt-get install -y git
|
|
21
|
+
- pip install sphinx furo
|
|
22
|
+
- pip install . --extra-index-url https://download.pytorch.org/whl/cpu
|
|
23
|
+
- sphinx-build -b html docs/source public
|
|
24
|
+
artifacts:
|
|
25
|
+
paths:
|
|
26
|
+
- public
|
|
27
|
+
rules:
|
|
28
|
+
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
|
29
|
+
|
|
30
|
+
publish-to-pypi:
|
|
31
|
+
image: python:latest
|
|
32
|
+
stage: publish
|
|
33
|
+
script:
|
|
34
|
+
- python -m pip install --upgrade pip build twine setuptools-scm
|
|
35
|
+
- python -m build
|
|
36
|
+
- TWINE_PASSWORD=${PYPI_TOKEN} TWINE_USERNAME=__token__ python -m twine upload --verbose dist/*
|
|
37
|
+
rules:
|
|
38
|
+
- if: '$CI_COMMIT_TAG =~ /^v.*$/'
|
|
39
|
+
|
sbcluster-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Felix Laplante <felixlaplante0@proton.me>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
sbcluster-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sbcluster
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Spectral Bridges clustering algorithm
|
|
5
|
+
Author-email: Félix Laplante <felixlaplante0@proton.me>
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: scipy
|
|
10
|
+
Requires-Dist: pydantic
|
|
11
|
+
Requires-Dist: faiss-cpu
|
|
12
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# 📊 Spectral Bridges
|
|
2
|
+
|
|
3
|
+
**sbcluster** is a Python package that implements a novel clustering algorithm combining k-means and spectral clustering techniques, called **Spectral Bridges**. It leverages efficient affinity matrix computation and merges clusters based on a connectivity measure inspired by SVM's margin concept. This package is designed to provide robust clustering solutions, particularly suited for large datasets.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## ✨ Features
|
|
8
|
+
|
|
9
|
+
- **Spectral Bridges Algorithm**: Integrates k-means and spectral clustering with efficient affinity matrix calculation for improved clustering results.
|
|
10
|
+
- **Scalability**: Designed to handle large datasets by optimizing cluster formation through advanced affinity matrix computations.
|
|
11
|
+
- **Customizable**: Parameters such as number of clusters, iterations, and random state allow flexibility in clustering configurations.
|
|
12
|
+
- **Model selection**: Automatic model selection for number of nodes (m) according to a normalized eigengap metric.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## ⚡ Speed
|
|
17
|
+
|
|
18
|
+
Spectral Bridges not only utilizes FAISS's efficient k-means implementation but also uses a scikit-learn method clone for centroid initialization, which is much faster than using scikit-learn's implementation (over 2x improvement).
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## 🚀 Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install sbcluster
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## 🔧 Usage
|
|
29
|
+
|
|
30
|
+
### Example
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import numpy as np
|
|
34
|
+
|
|
35
|
+
from sbcluster import SpectralBridges
|
|
36
|
+
|
|
37
|
+
# Generate sample data
|
|
38
|
+
np.random.seed(0)
|
|
39
|
+
X = np.random.rand(100, 10) # Replace with your dataset
|
|
40
|
+
|
|
41
|
+
# Initialize Spectral Bridges with a random seed (a number of nodes may also be specified here)
|
|
42
|
+
model = SpectralBridges(n_clusters=5, random_state=42)
|
|
43
|
+
|
|
44
|
+
# Define range of nodes to evaluate, should be an iterable of integers, or None if n_nodes is already set.
|
|
45
|
+
n_nodes_range = [10, 15, 20]
|
|
46
|
+
|
|
47
|
+
# Find the optimal number of nodes for a given value of clusters
|
|
48
|
+
# Modifies the instance attributes, returns a dict
|
|
49
|
+
# If n_nodes_range is None, then the model selects using self.n_nodes if not None
|
|
50
|
+
mean_ngaps = model.fit_select(X, n_nodes_range)
|
|
51
|
+
|
|
52
|
+
print("Optimal number of nodes:", model.n_nodes)
|
|
53
|
+
print("Dict of mean normalized eigengaps:", mean_ngaps)
|
|
54
|
+
|
|
55
|
+
# Predict clusters for new data points
|
|
56
|
+
new_data = np.random.rand(20, 10) # Replace with new data
|
|
57
|
+
predicted_clusters = model.predict(new_data)
|
|
58
|
+
|
|
59
|
+
print("Predicted clusters:", predicted_clusters)
|
|
60
|
+
|
|
61
|
+
# With a custom number of nodes
|
|
62
|
+
custom_model = SpectralBridges(n_clusters=5, n_nodes=12, p=1) # And a p-bridge affinity
|
|
63
|
+
|
|
64
|
+
# Fit the model
|
|
65
|
+
custom_model.fit(X)
|
|
66
|
+
|
|
67
|
+
# Predict the same way...
|
|
68
|
+
custom_predicted_clusters = custom_model.predict(new_data)
|
|
69
|
+
|
|
70
|
+
print("Predicted clusters:", custom_predicted_clusters)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## 📖 Learn More
|
|
76
|
+
|
|
77
|
+
For tutorials and the API reference, visit the official site:
|
|
78
|
+
👉 [sbcluster Documentation](https://felixlaplante0.gitlab.io/sbcluster)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Minimal makefile for Sphinx documentation
|
|
2
|
+
#
|
|
3
|
+
|
|
4
|
+
# You can set these variables from the command line, and also
|
|
5
|
+
# from the environment for the first two.
|
|
6
|
+
SPHINXOPTS ?=
|
|
7
|
+
SPHINXBUILD ?= sphinx-build
|
|
8
|
+
SOURCEDIR = source
|
|
9
|
+
BUILDDIR = build
|
|
10
|
+
|
|
11
|
+
# Put it first so that "make" without argument is like "make help".
|
|
12
|
+
help:
|
|
13
|
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
14
|
+
|
|
15
|
+
.PHONY: help Makefile
|
|
16
|
+
|
|
17
|
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
|
18
|
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
|
19
|
+
%: Makefile
|
|
20
|
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
@ECHO OFF
|
|
2
|
+
|
|
3
|
+
pushd %~dp0
|
|
4
|
+
|
|
5
|
+
REM Command file for Sphinx documentation
|
|
6
|
+
|
|
7
|
+
if "%SPHINXBUILD%" == "" (
|
|
8
|
+
set SPHINXBUILD=sphinx-build
|
|
9
|
+
)
|
|
10
|
+
set SOURCEDIR=source
|
|
11
|
+
set BUILDDIR=build
|
|
12
|
+
|
|
13
|
+
%SPHINXBUILD% >NUL 2>NUL
|
|
14
|
+
if errorlevel 9009 (
|
|
15
|
+
echo.
|
|
16
|
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
|
17
|
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
|
18
|
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
|
19
|
+
echo.may add the Sphinx directory to PATH.
|
|
20
|
+
echo.
|
|
21
|
+
echo.If you don't have Sphinx installed, grab it from
|
|
22
|
+
echo.https://www.sphinx-doc.org/
|
|
23
|
+
exit /b 1
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if "%1" == "" goto help
|
|
27
|
+
|
|
28
|
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
|
29
|
+
goto end
|
|
30
|
+
|
|
31
|
+
:help
|
|
32
|
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
|
33
|
+
|
|
34
|
+
:end
|
|
35
|
+
popd
|
|
File without changes
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Configuration file for the Sphinx documentation builder.
|
|
2
|
+
#
|
|
3
|
+
# For the full list of built-in configuration values, see the documentation:
|
|
4
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
|
5
|
+
|
|
6
|
+
# -- Project information -----------------------------------------------------
|
|
7
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
release = os.getenv("CI_COMMIT_TAG", "v0.0.0")
|
|
13
|
+
version = release.lstrip("v")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
project = "sbcluster"
|
|
17
|
+
copyright = "2025, Félix Laplante"
|
|
18
|
+
author = "Félix Laplante"
|
|
19
|
+
|
|
20
|
+
# -- General configuration ---------------------------------------------------
|
|
21
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
22
|
+
|
|
23
|
+
extensions = [
|
|
24
|
+
"sphinx.ext.autodoc",
|
|
25
|
+
"sphinx.ext.napoleon",
|
|
26
|
+
"sphinx.ext.viewcode",
|
|
27
|
+
"sphinx.ext.autosummary",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
templates_path = ["_templates"]
|
|
31
|
+
exclude_patterns = []
|
|
32
|
+
|
|
33
|
+
autodoc_member_order = "bysource"
|
|
34
|
+
autodoc_typehints = "description"
|
|
35
|
+
autodoc_typehints_format = "short"
|
|
36
|
+
autodoc_inherit_docstrings = True
|
|
37
|
+
autosummary_generate = True
|
|
38
|
+
add_module_names = False
|
|
39
|
+
napoleon_use_ivar = True
|
|
40
|
+
napoleon_attr_annotations = True
|
|
41
|
+
|
|
42
|
+
# -- Options for HTML output -------------------------------------------------
|
|
43
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
|
44
|
+
|
|
45
|
+
html_theme = "furo"
|
|
46
|
+
html_static_path = ["_static"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Spectral Bridges
|
|
2
|
+
================
|
|
3
|
+
|
|
4
|
+
**sbcluster** is a Python package that implements a novel clustering algorithm combining k-means and spectral clustering techniques, called **Spectral Bridges**. It leverages efficient affinity matrix computation and merges clusters based on a connectivity measure inspired by SVM's margin concept. This package is designed to provide robust clustering solutions, particularly suited for large datasets.
|
|
5
|
+
|
|
6
|
+
Features
|
|
7
|
+
--------
|
|
8
|
+
|
|
9
|
+
- **Spectral Bridges Algorithm**: Integrates k-means and spectral clustering with efficient affinity matrix calculation for improved clustering results.
|
|
10
|
+
- **Scalability**: Designed to handle large datasets by optimizing cluster formation through advanced affinity matrix computations.
|
|
11
|
+
- **Customizable**: Parameters such as number of clusters, iterations, and random state allow flexibility in clustering configurations.
|
|
12
|
+
- **Model selection**: Automatic model selection for number of nodes (m) according to a normalized eigengap metric.
|
|
13
|
+
|
|
14
|
+
Speed
|
|
15
|
+
-----
|
|
16
|
+
|
|
17
|
+
Spectral Bridges not only utilizes FAISS's efficient k-means implementation but also uses a scikit-learn method clone for centroid initialization, which is much faster than using scikit-learn's implementation (over 2x improvement).
|
|
18
|
+
|
|
19
|
+
Installation
|
|
20
|
+
------------
|
|
21
|
+
|
|
22
|
+
You can install the package via pip:
|
|
23
|
+
|
|
24
|
+
.. code-block:: bash
|
|
25
|
+
|
|
26
|
+
pip install sbcluster
|
|
27
|
+
|
|
28
|
+
Usage
|
|
29
|
+
-----
|
|
30
|
+
|
|
31
|
+
Example:
|
|
32
|
+
|
|
33
|
+
.. code-block:: python
|
|
34
|
+
|
|
35
|
+
import numpy as np
|
|
36
|
+
|
|
37
|
+
from sbcluster import SpectralBridges
|
|
38
|
+
|
|
39
|
+
# Generate sample data
|
|
40
|
+
np.random.seed(0)
|
|
41
|
+
X = np.random.rand(100, 10) # Replace with your dataset
|
|
42
|
+
|
|
43
|
+
# Initialize and fit Spectral Bridges (with a specified number of nodes if needed) and random seed
|
|
44
|
+
model = SpectralBridges(n_clusters=5, random_state=42)
|
|
45
|
+
|
|
46
|
+
# Define range of nodes to evaluate, should be an iterable of integers, or None if n_nodes is already set.
|
|
47
|
+
n_nodes_range = [10, 15, 20]
|
|
48
|
+
|
|
49
|
+
# Find the optimal number of nodes for a given value of clusters
|
|
50
|
+
mean_ngaps = model.fit_select(X, n_nodes_range)
|
|
51
|
+
print("Optimal number of nodes:", model.n_nodes)
|
|
52
|
+
print("Dict of mean normalized eigengaps:", mean_ngaps)
|
|
53
|
+
|
|
54
|
+
# Predict clusters for new data points
|
|
55
|
+
new_data = np.random.rand(20, 10) # Replace with new data
|
|
56
|
+
predicted_clusters = model.predict(new_data)
|
|
57
|
+
print("Predicted clusters:", predicted_clusters)
|
|
58
|
+
|
|
59
|
+
# With a custom number of nodes
|
|
60
|
+
custom_model = SpectralBridges(n_clusters=5, n_nodes=12, p=1) # And a p-bridge affinity
|
|
61
|
+
custom_model.fit(X)
|
|
62
|
+
custom_predicted_clusters = custom_model.predict(new_data)
|
|
63
|
+
print("Predicted clusters:", custom_predicted_clusters)
|
|
64
|
+
|
|
65
|
+
API Reference
|
|
66
|
+
-------------
|
|
67
|
+
|
|
68
|
+
.. autoclass:: sbcluster._bridges.SpectralBridges
|
|
69
|
+
:members:
|
|
70
|
+
:undoc-members:
|
|
71
|
+
:show-inheritance:
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "sbcluster"
|
|
3
|
+
description = "Spectral Bridges clustering algorithm"
|
|
4
|
+
authors = [{ name = "Félix Laplante", email = "felixlaplante0@proton.me" }]
|
|
5
|
+
requires-python = ">=3.10"
|
|
6
|
+
dependencies = ["numpy", "scipy", "pydantic", "faiss-cpu"]
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
|
|
9
|
+
[build-system]
|
|
10
|
+
requires = ["setuptools>=42", "setuptools-scm[toml]>=6.0", "wheel"]
|
|
11
|
+
build-backend = "setuptools.build_meta"
|
|
12
|
+
|
|
13
|
+
[tool.setuptools_scm]
|
|
14
|
+
version_scheme = "post-release"
|
|
15
|
+
local_scheme = "no-local-version"
|
|
16
|
+
|
|
17
|
+
[tool.ruff]
|
|
18
|
+
lint.select = [
|
|
19
|
+
"D", # pydocstyle (docstring conventions)
|
|
20
|
+
"E", # pycodestyle errors
|
|
21
|
+
"W", # pycodestyle warnings
|
|
22
|
+
"F", # Pyflakes
|
|
23
|
+
"I", # isort
|
|
24
|
+
"UP", # pyupgrade
|
|
25
|
+
"B", # flake8-bugbear
|
|
26
|
+
"C4", # flake8-comprehensions
|
|
27
|
+
"S", # flake8-bandit (security)
|
|
28
|
+
"T20", # flake8-print
|
|
29
|
+
"PT", # flake8-pytest-style
|
|
30
|
+
"Q", # flake8-quotes
|
|
31
|
+
"RET", # flake8-return
|
|
32
|
+
"SIM", # flake8-simplify
|
|
33
|
+
"ARG", # flake8-unused-arguments
|
|
34
|
+
"ERA", # eradicate (commented code)
|
|
35
|
+
"PL", # Pylint
|
|
36
|
+
"RUF", # Ruff-specific rules
|
|
37
|
+
]
|
|
38
|
+
lint.ignore = ["D417", "PLR0913"]
|
|
39
|
+
|
|
40
|
+
[tool.ruff.lint.pydocstyle]
|
|
41
|
+
convention = "google"
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
from typing import Final, cast
|
|
3
|
+
|
|
4
|
+
import faiss # type: ignore
|
|
5
|
+
import numpy as np
|
|
6
|
+
from numpy.typing import NDArray
|
|
7
|
+
from pydantic import ConfigDict, validate_call
|
|
8
|
+
from scipy.linalg.blas import sgemm # type: ignore
|
|
9
|
+
|
|
10
|
+
from ._defs import (
|
|
11
|
+
AffinityTransform,
|
|
12
|
+
ExpQuantileTransform,
|
|
13
|
+
FloatGtZeroLtHalf,
|
|
14
|
+
IntStrictlyPositive,
|
|
15
|
+
NumStrictlyPositive,
|
|
16
|
+
)
|
|
17
|
+
from ._kmeans import KMeans
|
|
18
|
+
from ._spectral import SpectralClustering
|
|
19
|
+
|
|
20
|
+
# Constants
|
|
21
|
+
DEFAULT_AFFINITY_TRANSFORM: Final = ExpQuantileTransform(0.1, 1e4)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SpectralBridges:
    """Spectral Bridges clustering algorithm.

    Places ``n_nodes`` k-means nodes on the data, computes a pairwise
    "bridge" affinity between nodes, transforms it with
    ``affinity_transform`` and groups the nodes with spectral clustering.

    Attributes:
        n_clusters (int): The number of clusters to form.
        n_nodes (int | None): Number of nodes or initial clusters.
        p (float): Power of the alpha_i.
        alpha (float): Quantile for affinity matrix computation.
        n_iter (int): Number of iterations to run the k-means algorithm.
        n_local_trials (int | None): Number of seeding trials for centroids
            initialization.
        random_state (int | None): Determines random number generation for
            centroid initialization.
        affinity_transform (AffinityTransform): Affinity transform to apply to
            the affinity matrix.
        cluster_centers_ (list[NDArray[np.float32]] | None): Coordinates of
            cluster centers.
        eigvals_ (NDArray[np.float32 | np.float64] | None): The eigenvalues of
            the (normalized) laplacian matrix.
        ngap_ (float | None): The normalized eigengap.
    """

    n_clusters: int
    n_nodes: int | None
    p: float
    n_iter: int
    n_local_trials: IntStrictlyPositive | None
    random_state: int | None
    cluster_centers_: list[NDArray[np.float32]] | None
    eigvals_: NDArray[np.float32 | np.float64] | None
    ngap_: float | None
    affinity_transform: AffinityTransform

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def __init__(
        self,
        n_clusters: IntStrictlyPositive,
        n_nodes: IntStrictlyPositive | None = None,
        *,
        p: NumStrictlyPositive = 2,
        alpha: FloatGtZeroLtHalf = 0.1,
        n_iter: IntStrictlyPositive = 20,
        n_local_trials: IntStrictlyPositive | None = None,
        random_state: int | None = None,
        affinity_transform: AffinityTransform = DEFAULT_AFFINITY_TRANSFORM,
    ):
        """Initialize the Spectral Bridges model.

        Args:
            n_clusters (IntStrictlyPositive): The number of clusters to form.
            n_nodes (IntStrictlyPositive | None): Number of nodes or initial
                clusters. Must be strictly greater than ``n_clusters``.
            p (NumStrictlyPositive, optional): Power of the alpha_i. Defaults to 2.
            alpha (FloatGtZeroLtHalf, optional): Quantile for affinity matrix
                computation. Defaults to 0.1.
            n_iter (int, optional): Number of iterations to run the k-means
                algorithm. Defaults to 20.
            n_local_trials (int or None, optional): Number of seeding trials for
                centroids initialization.
            random_state (int or None, optional): Determines random number
                generation for centroid initialization.
            affinity_transform (AffinityTransform, optional): Affinity transform
                to apply to the affinity matrix. Defaults to
                DEFAULT_AFFINITY_TRANSFORM.

        Raises:
            ValueError: If ``n_nodes`` is provided and is not strictly greater
                than ``n_clusters``.
        """
        self.n_clusters = n_clusters
        self.n_nodes = n_nodes
        self.p = p
        self.alpha = alpha
        self.n_iter = n_iter
        self.n_local_trials = n_local_trials
        self.random_state = random_state
        self.affinity_transform = affinity_transform
        self.cluster_centers_ = None
        self.eigvals_ = None
        self.ngap_ = None

        if self.n_nodes is not None and self.n_nodes <= self.n_clusters:
            # BUG FIX: the second string fragment was missing its f-prefix,
            # so the message printed the literal text "{self.n_clusters}".
            raise ValueError(
                f"n_nodes must be greater than n_clusters, got {self.n_nodes} <= "
                f"{self.n_clusters}"
            )

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def fit(self, X: np.ndarray):
        """Fit the Spectral Bridges model on the input data X.

        Args:
            X (np.ndarray): Input data to cluster.

        Raises:
            ValueError: If ``self.n_nodes`` is None.
        """
        if self.n_nodes is None:
            raise ValueError("n_nodes must be provided")

        kmeans = KMeans(
            self.n_nodes,
            self.n_iter,
            self.n_local_trials,
            self.random_state,
        )
        kmeans.fit(X)
        centers = cast(NDArray[np.float32], kmeans.cluster_centers_)

        affinity: NDArray[np.float64] = np.empty((self.n_nodes, self.n_nodes))

        # Per-node views of X centered on the node's centroid. Fortran order
        # presumably lets sgemm below consume the array without a copy —
        # NOTE(review): confirm against the BLAS wrapper's requirements.
        X_centered = [
            np.array(
                X[kmeans.labels_ == i] - centers[i],
                dtype=np.float32,
                order="F",
            )
            for i in range(self.n_nodes)
        ]

        # counts[i, j] = |cluster i| + |cluster j|; used to normalize the
        # symmetrized affinity below.
        counts = np.array([X_centered[i].shape[0] for i in range(self.n_nodes)])
        counts = counts[None, :] + counts[:, None]

        for i in range(self.n_nodes):
            segments = np.asfortranarray(centers - centers[i])
            dists = np.einsum("ij,ij->i", segments, segments)
            dists[i] = 1  # dummy value: avoids dividing by zero on the diagonal

            # Projection coefficients of centered points onto each segment,
            # clipped at 0, then raised to the p-th power.
            projs = sgemm(1.0, X_centered[i], segments, trans_b=True)
            np.clip(projs / dists, 0, None, out=projs)
            projs = np.power(projs, self.p)

            affinity[i] = projs.sum(axis=0)

        # Symmetrize, normalize by pairwise cluster sizes, take the p-th root.
        affinity = np.power((affinity + affinity.T) / counts, 1 / self.p)

        affinity = cast(NDArray[np.float64], self.affinity_transform(affinity))

        spectralclustering = SpectralClustering(
            self.n_clusters, self.n_iter, self.n_local_trials, self.random_state
        )
        spectralclustering.fit(affinity)

        self.eigvals_ = spectralclustering.eigvals_
        self.ngap_ = spectralclustering.ngap_
        self.cluster_centers_ = [
            centers[spectralclustering.labels_ == i] for i in range(self.n_clusters)
        ]

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def fit_select(
        self,
        X: np.ndarray,
        n_nodes_range: Iterable[int] | None = None,
        n_redo: IntStrictlyPositive = 10,
    ) -> dict[int, float]:
        """Selects and fits the best model from a range of possible node counts.

        It evaluates the mean normalized eigengap (ngap) for each candidate.

        For each `n_nodes` in `n_nodes_range`, multiple models are fit to the data,
        and the one with the highest mean normalized eigengap over `n_redo` runs
        is selected. The method then updates the current instance to use the
        attributes of the best candidate model.

        Args:
            X (np.ndarray): The input data.
            n_nodes_range (Iterable[int] | None): The range of possible node counts.
            n_redo (int): The number of times to run the model.

        Raises:
            ValueError: If both ``n_nodes_range`` and ``self.n_nodes`` are None,
                or if ``n_nodes_range`` is empty.

        Returns:
            dict[int, float]: The mean normalized eigengap for each node count.
        """
        if n_nodes_range is None:
            if self.n_nodes is None:
                raise ValueError("n_nodes_range or self.n_nodes must be provided")
            n_nodes_range = [self.n_nodes]

        rng = np.random.default_rng(self.random_state)
        max_int = np.iinfo(np.int32).max

        best_candidate = None
        best_mean_ngap = -1
        mean_ngaps: dict[int, float] = {}

        for n_nodes in n_nodes_range:
            candidate = None
            cum_ngap = 0

            for _ in range(n_redo):
                model = SpectralBridges(
                    n_clusters=self.n_clusters,
                    n_nodes=n_nodes,
                    p=self.p,
                    # BUG FIX: alpha was previously not forwarded, so the
                    # final __dict__.update silently reset self.alpha to its
                    # default after selection.
                    alpha=self.alpha,
                    n_iter=self.n_iter,
                    n_local_trials=self.n_local_trials,
                    random_state=self.random_state,
                    affinity_transform=self.affinity_transform,
                )
                model.fit(X)

                cum_ngap += cast(float, model.ngap_)

                if candidate is None or cast(float, model.ngap_) > cast(
                    float, candidate.ngap_
                ):
                    candidate = model

                # Draw a fresh (but reproducible) seed for the next redo.
                self.random_state = int(rng.integers(max_int + 1))

            mean_ngap = cum_ngap / n_redo
            mean_ngaps[n_nodes] = mean_ngap

            if mean_ngap > best_mean_ngap:
                best_candidate = candidate
                best_mean_ngap = mean_ngap

        # ROBUSTNESS FIX: an empty n_nodes_range previously crashed with an
        # opaque AttributeError on None; fail with an explicit error instead.
        if best_candidate is None:
            raise ValueError("n_nodes_range must not be empty")

        self.__dict__.update(best_candidate.__dict__)

        return mean_ngaps

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def predict(self, x: np.ndarray) -> np.ndarray:
        """Predict the nearest cluster index for each input data point x.

        Args:
            x (np.ndarray): The input data.

        Raises:
            ValueError: If `x` contains inf or NaN values.

        Returns:
            NDArray[np.int32]: The predicted cluster indices.
        """
        if np.isinf(x).any():
            raise ValueError("x must not contain inf values")
        if np.isnan(x).any():
            raise ValueError("x must not contain NaN values")

        centers = cast(list[NDArray[np.float32]], self.cluster_centers_)

        # Flatten the per-cluster node arrays; cluster_cutoffs[i] is one past
        # the index (in the flat array) of cluster i's last node.
        cluster_centers = np.vstack(centers)
        cluster_cutoffs = np.cumsum([cluster.shape[0] for cluster in centers])

        # Nearest-node search, then map each winning node back to its cluster.
        index = faiss.IndexFlatL2(x.shape[1])
        index.add(cluster_centers.astype(np.float32))  # type: ignore
        winners = index.search(x.astype(np.float32), 1)[1].ravel()  # type: ignore

        return cast(
            NDArray[np.int32],
            np.searchsorted(cluster_cutoffs, winners, side="right"),  # type: ignore
        )
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from typing import Annotated, Protocol, TypeAlias, runtime_checkable
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from numpy.typing import NDArray
|
|
5
|
+
from pydantic import AfterValidator, ConfigDict, validate_call
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Validators
|
|
9
|
+
def is_strict_pos(x: int | float) -> int | float:
|
|
10
|
+
"""Checks if the argument is strictly positive.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
x (int | float): The input number.
|
|
14
|
+
|
|
15
|
+
Raises:
|
|
16
|
+
ValueError: If the number is not strictly positive.
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
int | float | torch.Tensor: The output number or tensor.
|
|
20
|
+
"""
|
|
21
|
+
if x <= 0:
|
|
22
|
+
raise ValueError(f"Expected strictly positive number, got {x}")
|
|
23
|
+
return x
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_gt_zero_lt_half(x: float) -> float:
    """Validate that the argument lies strictly between 0 and 1/2.

    Args:
        x (float): The input number.

    Raises:
        ValueError: If the number is not in the open interval (0, 1/2).

    Returns:
        float: The validated number, unchanged.
    """
    if 0 < x < 0.5:  # noqa: PLR2004
        return x
    raise ValueError(f"Expected number > 0 and < 1/2, got {x}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Validated scalar types shared by the pydantic-checked signatures in this
# package. The Annotated validators run via pydantic's AfterValidator hook.
Num: TypeAlias = int | float  # any real-valued scalar
# Strictly positive integer (validated by is_strict_pos).
IntStrictlyPositive = Annotated[int, AfterValidator(is_strict_pos)]
# Strictly positive int or float (validated by is_strict_pos).
NumStrictlyPositive = Annotated[Num, AfterValidator(is_strict_pos)]
# Float in the open interval (0, 1/2) (validated by is_gt_zero_lt_half).
FloatGtZeroLtHalf = Annotated[float, AfterValidator(is_gt_zero_lt_half)]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Protocols
|
|
50
|
+
@runtime_checkable
class AffinityTransform(Protocol):
    """Protocol for affinity transforms.

    Use this protocol to define custom affinity transforms: any callable
    that maps a float32/float64 affinity matrix to a transformed matrix
    satisfies it structurally, and ``isinstance`` checks work thanks to
    ``@runtime_checkable``. See ``ExpQuantileTransform`` for an example
    implementation.
    """

    # Transform a raw affinity matrix into the matrix actually used for
    # clustering.
    def __call__(
        self, x: NDArray[np.float32 | np.float64]
    ) -> NDArray[np.float32 | np.float64]: ...
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Transformations
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ExpQuantileTransform(AffinityTransform):
    """Exponential quantile transform.

    Exponentially rescales an affinity matrix with a rate chosen so that
    inputs at the ``1 - alpha`` and ``alpha`` quantiles end up a factor of
    ``mult_factor`` apart after the transform.

    Attributes:
        alpha (float): Quantile for affinity matrix computation.
        mult_factor (int | float): Scaling parameter for affinity matrix computation.
    """

    alpha: float
    mult_factor: int | float

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def __init__(
        self, alpha: FloatGtZeroLtHalf = 0.1, mult_factor: NumStrictlyPositive = 1e4
    ):
        """Initialize the Exponential quantile transform.

        Args:
            alpha (FloatGtZeroLtHalf): Quantile for affinity matrix computation.
            mult_factor (NumStrictlyPositive): Scaling parameter for affinity
                matrix computation.
        """
        self.alpha = alpha
        self.mult_factor = mult_factor

    def __call__(
        self, x: NDArray[np.float32 | np.float64]
    ) -> NDArray[np.float32 | np.float64]:
        """Apply the transform elementwise; the output peaks at 1 for x.max()."""
        low, high = np.quantile(x, [self.alpha, 1 - self.alpha])
        rate = np.log(self.mult_factor) / (high - low)
        return np.exp(rate * (x - x.max()))
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from typing import cast
|
|
2
|
+
|
|
3
|
+
import faiss # type: ignore
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
from pydantic import ConfigDict, validate_call
|
|
7
|
+
from scipy.linalg.blas import sgemm # type: ignore
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class KMeans:
    """K-means clustering using FAISS.

    Attributes:
        cluster_centers_ (NDArray[np.float32] | None): Coordinates of cluster
            centers; None until ``fit`` has been called.
        labels_ (NDArray[np.int32] | None): Labels of each point (index) in X;
            None until ``fit`` has been called.

    Methods:
    --------
    fit(X):
        Run k-means clustering on the input data X.
    """

    cluster_centers_: NDArray[np.float32] | None
    labels_: NDArray[np.int32] | None

    def __init__(
        self,
        n_clusters: int,
        n_iter: int,
        n_local_trials: int | None,
        random_state: int | None,
    ):
        """Initializes the KMeans class.

        Args:
            n_clusters (int): The number of clusters to form.
            n_iter (int): The number of iterations to run the k-means
                algorithm.
            n_local_trials (int | None): The number of seeding trials for
                centroids initialization. If None, a heuristic based on
                n_clusters is used at fit time.
            random_state (int | None): Determines random number generation for
                centroid initialization.
        """
        self.n_clusters = n_clusters
        self.n_iter = n_iter
        self.n_local_trials = n_local_trials
        self.random_state = random_state
        self.cluster_centers_ = None
        self.labels_ = None

    @staticmethod
    def _dists(
        X: NDArray[np.float32], y: NDArray[np.float32], XX: NDArray[np.float32]
    ) -> NDArray[np.float32]:
        """Computes pairwise squared distances between a fixed matrix and points.

        Uses the expansion ||x - y||^2 = ||x||^2 - 2<x, y> + ||y||^2, with the
        cross term computed in single precision by BLAS (sgemm).

        Args:
            X (NDArray[np.float32]): The fixed data matrix.
            y (NDArray[np.float32]): The non fixed points.
            XX (NDArray[np.float32]): The fixed matrix squared norms, as a
                column vector (shape (n, 1)).

        Returns:
            NDArray[np.float32]: The computed pairwise squared distances.
        """
        yy = np.einsum("ij,ij->i", y, y)
        dists = XX - sgemm(2.0, X, y, trans_b=True) + yy
        # Floating-point cancellation can yield tiny negatives; clamp to 0.
        np.clip(dists, 0, None, out=dists)
        return dists

    def _init_centroids(self, X: NDArray[np.float32]) -> NDArray[np.float32]:
        """Initializes the centroids in a K-means++ fashion.

        Args:
            X (NDArray[np.float32]): The fixed data matrix.

        Returns:
            NDArray[np.float32]: The initialized centroids.
        """
        rng = np.random.default_rng(self.random_state)

        centroids = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
        # First centroid: a uniformly random data point.
        centroids[0] = X[rng.integers(X.shape[0])]

        # Per-point squared norms, reused by every _dists call below.
        XX = np.einsum("ij,ij->i", X, X)[:, None]

        dists = self._dists(X, centroids[0:1], XX).ravel()
        inertia = dists.sum()

        # Use a local value instead of mutating self.n_local_trials, so the
        # configured None is preserved across repeated fits.
        n_local_trials = self.n_local_trials
        if n_local_trials is None:
            n_local_trials = 2 + int(np.log(self.n_clusters))

        for i in range(1, self.n_clusters):
            # Sample candidates proportionally to their squared distance to
            # the nearest already-chosen centroid (k-means++ seeding).
            candidate_ids = rng.choice(
                X.shape[0], size=n_local_trials, p=dists / inertia
            )
            candidates = np.asfortranarray(X[candidate_ids])

            current_candidates_dists = self._dists(X, candidates, XX)
            candidates_dists = np.minimum(current_candidates_dists, dists[:, None])

            # Keep the candidate minimizing the resulting total inertia.
            inertias = candidates_dists.sum(axis=0)
            best_inertia = inertias.argmin()
            best_candidate = candidate_ids[best_inertia]
            dists = candidates_dists[:, best_inertia]
            inertia = inertias[best_inertia]

            centroids[i] = X[best_candidate]

        return centroids

    @staticmethod
    def _validate_X(X: NDArray[np.float32 | np.float64]) -> NDArray[np.float32]:
        """Validates and converts the data matrix.

        Args:
            X (NDArray[np.float32 | np.float64]): The fixed data matrix.

        Raises:
            ValueError: If `X` contains inf values.
            ValueError: If `X` contains NaN values.

        Returns:
            NDArray[np.float32]: The validated data matrix, converted to
                float32 in Fortran (column-major) order.
        """
        if np.isinf(X).any():
            raise ValueError("X must not contain inf values")
        if np.isnan(X).any():
            raise ValueError("X must not contain NaN values")

        return np.array(X, dtype=np.float32, order="F")

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def fit(self, X: np.ndarray):
        """Run k-means clustering on the input data X.

        Sets ``cluster_centers_`` and ``labels_``.

        Args:
            X (np.ndarray): Input data matrix to cluster.
        """
        X_f32 = self._validate_X(X)

        index = faiss.IndexFlatL2(X.shape[1])
        kmeans = faiss.Clustering(X.shape[1], self.n_clusters)

        # Seed FAISS with k-means++ centroids rather than its default init.
        init_centroids = self._init_centroids(X_f32)

        kmeans.centroids.resize(init_centroids.size)
        faiss.copy_array_to_vector(init_centroids.ravel(), kmeans.centroids)  # type: ignore
        kmeans.niter = self.n_iter
        # NOTE(review): presumably disables FAISS's per-centroid point-count
        # heuristics (subsampling/warnings) — confirm against the FAISS docs.
        kmeans.min_points_per_centroid = 0
        kmeans.max_points_per_centroid = -1
        kmeans.train(X_f32, index)  # type: ignore

        self.cluster_centers_ = cast(
            NDArray[np.float32],
            faiss.vector_to_array(kmeans.centroids).reshape(  # type: ignore
                self.n_clusters, X.shape[1]
            ),
        )
        # Assign each point to its nearest trained centroid.
        self.labels_ = cast(NDArray[np.int32], index.search(X_f32, 1)[1].ravel())  # type: ignore
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from typing import cast
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from numpy.typing import NDArray
|
|
5
|
+
from scipy.sparse.csgraph import laplacian # type: ignore
|
|
6
|
+
|
|
7
|
+
from ._kmeans import KMeans
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SpectralClustering:
|
|
11
|
+
"""Spectral clustering based on Laplacian matrix.
|
|
12
|
+
|
|
13
|
+
Attributes:
|
|
14
|
+
n_local_trials (int | None): The number of seeding trials for
|
|
15
|
+
centroids initialization.
|
|
16
|
+
random_state (int | None) Determines random number generation for
|
|
17
|
+
centroid initialization.
|
|
18
|
+
labels_ (NDArray[np.int32] | None): Labels of each point (index) in the affinity
|
|
19
|
+
matrix.
|
|
20
|
+
eigvals_ (NDArray[np.float32 | np.float64] | None): The eigenvalues of the
|
|
21
|
+
(normalized) laplacian matrix.
|
|
22
|
+
ngap_ (float): The normalized eigengap.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
n_iter: int
|
|
26
|
+
n_local_trials: int | None
|
|
27
|
+
random_state: int | None
|
|
28
|
+
labels_: NDArray[np.int32] | None
|
|
29
|
+
eigvals_: NDArray[np.float32 | np.float64] | None
|
|
30
|
+
ngap_: float | None
|
|
31
|
+
|
|
32
|
+
def __init__(
|
|
33
|
+
self,
|
|
34
|
+
n_clusters: int,
|
|
35
|
+
n_iter: int,
|
|
36
|
+
n_local_trials: int | None,
|
|
37
|
+
random_state: int | None,
|
|
38
|
+
):
|
|
39
|
+
"""Initializes the class.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
n_clusters (int): The number of clusters to form.
|
|
43
|
+
n_iter (int): The number of iterations to run the k-means
|
|
44
|
+
algorithm.
|
|
45
|
+
n_local_trials (int | None): The number of seeding trials for
|
|
46
|
+
centroids initialization.
|
|
47
|
+
random_state (int | None) Determines random number generation for
|
|
48
|
+
centroid initialization.
|
|
49
|
+
random_state (int | None): Determines random number generation for centroid
|
|
50
|
+
initialization.
|
|
51
|
+
"""
|
|
52
|
+
self.n_clusters = n_clusters
|
|
53
|
+
self.random_state = random_state
|
|
54
|
+
self.n_iter = n_iter
|
|
55
|
+
self.n_local_trials = n_local_trials
|
|
56
|
+
self.labels_ = None
|
|
57
|
+
self.eigvals_ = None
|
|
58
|
+
self.ngap_ = None
|
|
59
|
+
|
|
60
|
+
def fit(self, affinity: NDArray[np.float32 | np.float64]):
|
|
61
|
+
"""Fit the spectral clustering model on the affinity matrix.
|
|
62
|
+
|
|
63
|
+
Parameters:
|
|
64
|
+
-----------
|
|
65
|
+
affinity (NDArray[np.float32]): Affinity matrix representing pairwise similarity
|
|
66
|
+
between points.
|
|
67
|
+
"""
|
|
68
|
+
L = cast(NDArray[np.float32 | np.float64], laplacian(affinity, normed=True))
|
|
69
|
+
|
|
70
|
+
self.eigvals_, eigvecs = cast(
|
|
71
|
+
tuple[NDArray[np.float32 | np.float64], ...],
|
|
72
|
+
np.linalg.eigh(L), # type: ignore
|
|
73
|
+
)
|
|
74
|
+
eigvecs = eigvecs[:, : self.n_clusters]
|
|
75
|
+
eigvecs /= np.linalg.norm(eigvecs, axis=1)[:, None]
|
|
76
|
+
kmeans = KMeans(
|
|
77
|
+
self.n_clusters, self.n_iter, self.n_local_trials, self.random_state
|
|
78
|
+
)
|
|
79
|
+
kmeans.fit(eigvecs)
|
|
80
|
+
|
|
81
|
+
self.ngap_ = (
|
|
82
|
+
self.eigvals_[self.n_clusters] - self.eigvals_[self.n_clusters - 1]
|
|
83
|
+
) / self.eigvals_[self.n_clusters - 1]
|
|
84
|
+
self.labels_ = kmeans.labels_
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sbcluster
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Spectral Bridges clustering algorithm
|
|
5
|
+
Author-email: Félix Laplante <felixlaplante0@proton.me>
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: scipy
|
|
10
|
+
Requires-Dist: pydantic
|
|
11
|
+
Requires-Dist: faiss-cpu
|
|
12
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
.gitlab-ci.yml
|
|
2
|
+
LICENSE
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
docs/Makefile
|
|
6
|
+
docs/make.bat
|
|
7
|
+
docs/source/conf.py
|
|
8
|
+
docs/source/index.rst
|
|
9
|
+
docs/source/_static/.gitkeep
|
|
10
|
+
docs/source/_templates/autosummary/class.rst
|
|
11
|
+
sbcluster/__init__.py
|
|
12
|
+
sbcluster/_bridges.py
|
|
13
|
+
sbcluster/_defs.py
|
|
14
|
+
sbcluster/_kmeans.py
|
|
15
|
+
sbcluster/_spectral.py
|
|
16
|
+
sbcluster.egg-info/PKG-INFO
|
|
17
|
+
sbcluster.egg-info/SOURCES.txt
|
|
18
|
+
sbcluster.egg-info/dependency_links.txt
|
|
19
|
+
sbcluster.egg-info/requires.txt
|
|
20
|
+
sbcluster.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sbcluster
|