sbcluster 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ stages:
2
+ - lint
3
+ - deploy
4
+ - publish
5
+
6
lint:
  image: python:latest
  stage: lint
  script:
    - pip install ruff
    # --check makes the formatter report violations and fail the job;
    # without it, ruff silently rewrites files inside the throwaway CI
    # container and the job always passes.
    - ruff format --check sbcluster
    - ruff check sbcluster
  rules:
    - if: $CI_COMMIT_BRANCH
15
+
16
+ pages:
17
+ image: python:latest
18
+ stage: deploy
19
+ script:
20
+ - apt-get update && apt-get install -y git
21
+ - pip install sphinx furo
22
+ - pip install . --extra-index-url https://download.pytorch.org/whl/cpu
23
+ - sphinx-build -b html docs/source public
24
+ artifacts:
25
+ paths:
26
+ - public
27
+ rules:
28
+ - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
29
+
30
+ publish-to-pypi:
31
+ image: python:latest
32
+ stage: publish
33
+ script:
34
+ - python -m pip install --upgrade pip build twine setuptools-scm
35
+ - python -m build
36
+ - TWINE_PASSWORD=${PYPI_TOKEN} TWINE_USERNAME=__token__ python -m twine upload --verbose dist/*
37
+ rules:
38
+ - if: '$CI_COMMIT_TAG =~ /^v.*$/'
39
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Felix Laplante <felixlaplante0@proton.me>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: sbcluster
3
+ Version: 0.1.0
4
+ Summary: Spectral Bridges clustering algorithm
5
+ Author-email: Félix Laplante <felixlaplante0@proton.me>
6
+ Requires-Python: >=3.10
7
+ License-File: LICENSE
8
+ Requires-Dist: numpy
9
+ Requires-Dist: scipy
10
+ Requires-Dist: pydantic
11
+ Requires-Dist: faiss-cpu
12
+ Dynamic: license-file
@@ -0,0 +1,78 @@
1
+ # 📊 Spectral Bridges
2
+
3
+ **sbcluster** is a Python package that implements a novel clustering algorithm combining k-means and spectral clustering techniques, called **Spectral Bridges**. It leverages efficient affinity matrix computation and merges clusters based on a connectivity measure inspired by SVM's margin concept. This package is designed to provide robust clustering solutions, particularly suited for large datasets.
4
+
5
+ ---
6
+
7
+ ## ✨ Features
8
+
9
+ - **Spectral Bridges Algorithm**: Integrates k-means and spectral clustering with efficient affinity matrix calculation for improved clustering results.
10
+ - **Scalability**: Designed to handle large datasets by optimizing cluster formation through advanced affinity matrix computations.
11
+ - **Customizable**: Parameters such as number of clusters, iterations, and random state allow flexibility in clustering configurations.
12
+ - **Model selection**: Automatic model selection for number of nodes (m) according to a normalized eigengap metric.
13
+
14
+ ---
15
+
16
+ ## ⚡ Speed
17
+
18
+ Spectral Bridges not only utilizes FAISS's efficient k-means implementation but also uses a scikit-learn method clone for centroid initialization, which is much faster than using scikit-learn's implementation (over 2x improvement).
19
+
20
+ ---
21
+
22
+ ## 🚀 Installation
23
+
24
+ ```bash
25
+ pip install sbcluster
26
+ ```
27
+
28
+ ## 🔧 Usage
29
+
30
+ ### Example
31
+
32
+ ```python
33
+ import numpy as np
34
+
35
+ from sbcluster import SpectralBridges
36
+
37
+ # Generate sample data
38
+ np.random.seed(0)
39
+ X = np.random.rand(100, 10) # Replace with your dataset
40
+
41
+ # Initialize and fit Spectral Bridges (with a specified number of nodes if needed) and random seed
42
+ model = SpectralBridges(n_clusters=5, random_state=42)
43
+
44
+ # Define range of nodes to evaluate, should be an iterable of integers, or None if n_nodes is already set.
45
+ n_nodes_range = [10, 15, 20]
46
+
47
+ # Find the optimal number of nodes for a given value of clusters
48
+ # Modifies the instance attributes, returns a dict
49
+ # If n_nodes_range is None, then the model selects using self.n_nodes if not None
50
+ mean_ngaps = model.fit_select(X, n_nodes_range)
51
+
52
+ print("Optimal number of nodes:", model.n_nodes)
53
+ print("Dict of mean normalized eigengaps:", mean_ngaps)
54
+
55
+ # Predict clusters for new data points
56
+ new_data = np.random.rand(20, 10) # Replace with new data
57
+ predicted_clusters = model.predict(new_data)
58
+
59
+ print("Predicted clusters:", predicted_clusters)
60
+
61
+ # With a custom number of nodes
62
+ custom_model = SpectralBridges(n_clusters=5, n_nodes=12, p=1) # And a p-bridge affinity
63
+
64
+ # Fit the model
65
+ custom_model.fit(X)
66
+
67
+ # Predict the same way...
68
+ custom_predicted_clusters = custom_model.predict(new_data)
69
+
70
+ print("Predicted clusters:", custom_predicted_clusters)
71
+ ```
72
+
73
+ ---
74
+
75
+ ## 📖 Learn More
76
+
77
+ For tutorials and the API reference, visit the official site:
78
+ 👉 [sbcluster Documentation](https://felixlaplante0.gitlab.io/sbcluster)
@@ -0,0 +1,20 @@
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line, and also
5
+ # from the environment for the first two.
6
+ SPHINXOPTS ?=
7
+ SPHINXBUILD ?= sphinx-build
8
+ SOURCEDIR = source
9
+ BUILDDIR = build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=sphinx-build
9
+ )
10
+ set SOURCEDIR=source
11
+ set BUILDDIR=build
12
+
13
+ %SPHINXBUILD% >NUL 2>NUL
14
+ if errorlevel 9009 (
15
+ echo.
16
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17
+ echo.installed, then set the SPHINXBUILD environment variable to point
18
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
19
+ echo.may add the Sphinx directory to PATH.
20
+ echo.
21
+ echo.If you don't have Sphinx installed, grab it from
22
+ echo.https://www.sphinx-doc.org/
23
+ exit /b 1
24
+ )
25
+
26
+ if "%1" == "" goto help
27
+
28
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
+ goto end
30
+
31
+ :help
32
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
+
34
+ :end
35
+ popd
File without changes
@@ -0,0 +1,8 @@
1
+ {{ fullname | escape | underline}}
2
+
3
+ .. currentmodule:: {{ module }}
4
+
5
+ .. autoclass:: {{ objname }}
6
+ :members:
7
+
8
+ .. automethod:: __init__
@@ -0,0 +1,46 @@
1
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information


import os

# Version the docs after the CI tag (e.g. "v1.2.3"); fall back to a
# placeholder when building locally, outside the GitLab pipeline.
release = os.getenv("CI_COMMIT_TAG", "v0.0.0")
version = release.lstrip("v")


project = "sbcluster"
copyright = "2025, Félix Laplante"
author = "Félix Laplante"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "sphinx.ext.viewcode",
    "sphinx.ext.autosummary",
]

templates_path = ["_templates"]
exclude_patterns = []

# autodoc/autosummary tuning: keep source order, move type hints into the
# description, use short type names, and generate autosummary stub pages.
autodoc_member_order = "bysource"
autodoc_typehints = "description"
autodoc_typehints_format = "short"
autodoc_inherit_docstrings = True
autosummary_generate = True
add_module_names = False
# Napoleon: render Google-style "Attributes:" sections as instance variables.
napoleon_use_ivar = True
napoleon_attr_annotations = True

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "furo"
html_static_path = ["_static"]
@@ -0,0 +1,71 @@
1
+ Spectral Bridges
2
+ ================
3
+
4
+ **sbcluster** is a Python package that implements a novel clustering algorithm combining k-means and spectral clustering techniques, called **Spectral Bridges**. It leverages efficient affinity matrix computation and merges clusters based on a connectivity measure inspired by SVM's margin concept. This package is designed to provide robust clustering solutions, particularly suited for large datasets.
5
+
6
+ Features
7
+ --------
8
+
9
+ - **Spectral Bridges Algorithm**: Integrates k-means and spectral clustering with efficient affinity matrix calculation for improved clustering results.
10
+ - **Scalability**: Designed to handle large datasets by optimizing cluster formation through advanced affinity matrix computations.
11
+ - **Customizable**: Parameters such as number of clusters, iterations, and random state allow flexibility in clustering configurations.
12
+ - **Model selection**: Automatic model selection for number of nodes (m) according to a normalized eigengap metric.
13
+
14
+ Speed
15
+ -----
16
+
17
+ Spectral Bridges not only utilizes FAISS's efficient k-means implementation but also uses a scikit-learn method clone for centroid initialization, which is much faster than using scikit-learn's implementation (over 2x improvement).
18
+
19
+ Installation
20
+ ------------
21
+
22
+ You can install the package via pip:
23
+
24
+ .. code-block:: bash
25
+
26
+ pip install sbcluster
27
+
28
+ Usage
29
+ -----
30
+
31
+ Example:
32
+
33
+ .. code-block:: python
34
+
35
+ import numpy as np
36
+
37
+ from sbcluster import SpectralBridges
38
+
39
+ # Generate sample data
40
+ np.random.seed(0)
41
+ X = np.random.rand(100, 10) # Replace with your dataset
42
+
43
+ # Initialize and fit Spectral Bridges (with a specified number of nodes if needed) and random seed
44
+ model = SpectralBridges(n_clusters=5, random_state=42)
45
+
46
+ # Define range of nodes to evaluate, should be an iterable of integers, or None if n_nodes is already set.
47
+ n_nodes_range = [10, 15, 20]
48
+
49
+ # Find the optimal number of nodes for a given value of clusters
50
+ mean_ngaps = model.fit_select(X, n_nodes_range)
51
+ print("Optimal number of nodes:", model.n_nodes)
52
+ print("Dict of mean normalized eigengaps:", mean_ngaps)
53
+
54
+ # Predict clusters for new data points
55
+ new_data = np.random.rand(20, 10) # Replace with new data
56
+ predicted_clusters = model.predict(new_data)
57
+ print("Predicted clusters:", predicted_clusters)
58
+
59
+ # With a custom number of nodes
60
+ custom_model = SpectralBridges(n_clusters=5, n_nodes=12, p=1) # And a p-bridge affinity
61
+ custom_model.fit(X)
62
+ custom_predicted_clusters = custom_model.predict(new_data)
63
+ print("Predicted clusters:", custom_predicted_clusters)
64
+
65
+ API Reference
66
+ -------------
67
+
68
+ .. autoclass:: sbcluster._bridges.SpectralBridges
69
+ :members:
70
+ :undoc-members:
71
+ :show-inheritance:
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "sbcluster"
3
+ description = "Spectral Bridges clustering algorithm"
4
+ authors = [{ name = "Félix Laplante", email = "felixlaplante0@proton.me" }]
5
+ requires-python = ">=3.10"
6
+ dependencies = ["numpy", "scipy", "pydantic", "faiss-cpu"]
7
+ dynamic = ["version"]
8
+
9
+ [build-system]
10
+ requires = ["setuptools>=42", "setuptools-scm[toml]>=6.0", "wheel"]
11
+ build-backend = "setuptools.build_meta"
12
+
13
+ [tool.setuptools_scm]
14
+ version_scheme = "post-release"
15
+ local_scheme = "no-local-version"
16
+
17
+ [tool.ruff]
18
+ lint.select = [
19
+ "D", # pydocstyle (docstring conventions)
20
+ "E", # pycodestyle errors
21
+ "W", # pycodestyle warnings
22
+ "F", # Pyflakes
23
+ "I", # isort
24
+ "UP", # pyupgrade
25
+ "B", # flake8-bugbear
26
+ "C4", # flake8-comprehensions
27
+ "S", # flake8-bandit (security)
28
+ "T20", # flake8-print
29
+ "PT", # flake8-pytest-style
30
+ "Q", # flake8-quotes
31
+ "RET", # flake8-return
32
+ "SIM", # flake8-simplify
33
+ "ARG", # flake8-unused-arguments
34
+ "ERA", # eradicate (commented code)
35
+ "PL", # Pylint
36
+ "RUF", # Ruff-specific rules
37
+ ]
38
+ lint.ignore = ["D417", "PLR0913"]
39
+
40
+ [tool.ruff.lint.pydocstyle]
41
+ convention = "google"
@@ -0,0 +1,4 @@
1
+ from ._bridges import SpectralBridges # noqa: D104
2
+ from ._defs import ExpQuantileTransform
3
+
4
+ __all__ = ["ExpQuantileTransform", "SpectralBridges"]
@@ -0,0 +1,269 @@
1
+ from collections.abc import Iterable
2
+ from typing import Final, cast
3
+
4
+ import faiss # type: ignore
5
+ import numpy as np
6
+ from numpy.typing import NDArray
7
+ from pydantic import ConfigDict, validate_call
8
+ from scipy.linalg.blas import sgemm # type: ignore
9
+
10
+ from ._defs import (
11
+ AffinityTransform,
12
+ ExpQuantileTransform,
13
+ FloatGtZeroLtHalf,
14
+ IntStrictlyPositive,
15
+ NumStrictlyPositive,
16
+ )
17
+ from ._kmeans import KMeans
18
+ from ._spectral import SpectralClustering
19
+
20
+ # Constants
21
+ DEFAULT_AFFINITY_TRANSFORM: Final = ExpQuantileTransform(0.1, 1e4)
22
+
23
+
24
class SpectralBridges:
    """Spectral Bridges clustering algorithm.

    Vector-quantizes the data into nodes with k-means, connects the nodes
    with an SVM-margin-inspired affinity, then merges them into clusters
    via spectral clustering.

    Attributes:
        n_clusters (int): The number of clusters to form.
        n_nodes (int | None): Number of nodes or initial clusters.
        p (float): Power of the alpha_i.
        alpha (float): Quantile for affinity matrix computation.
        n_iter (int): Number of iterations to run the k-means algorithm.
        n_local_trials (int | None): Number of seeding trials for centroids
            initialization.
        random_state (int | None): Determines random number generation for
            centroid initialization.
        affinity_transform (AffinityTransform): Affinity transform to apply to
            the affinity matrix.
        cluster_centers_ (list[NDArray[np.float32]] | None): Coordinates of
            cluster centers, one array of node centroids per cluster.
        eigvals_ (NDArray[np.float32 | np.float64] | None): The eigenvalues of
            the (normalized) laplacian matrix.
        ngap_ (float | None): The normalized eigengap.
    """

    n_clusters: int
    n_nodes: int | None
    p: float
    alpha: float
    n_iter: int
    n_local_trials: int | None
    random_state: int | None
    cluster_centers_: list[NDArray[np.float32]] | None
    eigvals_: NDArray[np.float32 | np.float64] | None
    ngap_: float | None
    affinity_transform: AffinityTransform

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def __init__(
        self,
        n_clusters: IntStrictlyPositive,
        n_nodes: IntStrictlyPositive | None = None,
        *,
        p: NumStrictlyPositive = 2,
        alpha: FloatGtZeroLtHalf = 0.1,
        n_iter: IntStrictlyPositive = 20,
        n_local_trials: IntStrictlyPositive | None = None,
        random_state: int | None = None,
        affinity_transform: AffinityTransform = DEFAULT_AFFINITY_TRANSFORM,
    ):
        """Initialize the Spectral Bridges model.

        Args:
            n_clusters (IntStrictlyPositive): The number of clusters to form.
            n_nodes (IntStrictlyPositive | None): Number of nodes or initial
                clusters.
            p (NumStrictlyPositive, optional): Power of the alpha_i.
                Defaults to 2.
            alpha (FloatGtZeroLtHalf, optional): Quantile for affinity matrix
                computation. Defaults to 0.1.
            n_iter (IntStrictlyPositive, optional): Number of iterations to run
                the k-means algorithm. Defaults to 20.
            n_local_trials (IntStrictlyPositive | None, optional): Number of
                seeding trials for centroids initialization.
            random_state (int | None, optional): Determines random number
                generation for centroid initialization.
            affinity_transform (AffinityTransform, optional): Affinity
                transform to apply to the affinity matrix. Defaults to
                DEFAULT_AFFINITY_TRANSFORM.

        Raises:
            ValueError: If `n_nodes` is provided and not strictly greater than
                `n_clusters`.
        """
        self.n_clusters = n_clusters
        self.n_nodes = n_nodes
        self.p = p
        self.alpha = alpha
        self.n_iter = n_iter
        self.n_local_trials = n_local_trials
        self.random_state = random_state
        self.affinity_transform = affinity_transform
        self.cluster_centers_ = None
        self.eigvals_ = None
        self.ngap_ = None

        if self.n_nodes is not None and self.n_nodes <= self.n_clusters:
            # Both halves are f-strings: the original second literal lacked
            # the f prefix and printed "{self.n_clusters}" verbatim.
            raise ValueError(
                f"n_nodes must be greater than n_clusters, got {self.n_nodes} <= "
                f"{self.n_clusters}"
            )

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def fit(self, X: np.ndarray):
        """Fit the Spectral Bridges model on the input data X.

        Args:
            X (np.ndarray): Input data to cluster.

        Raises:
            ValueError: If `n_nodes` is not set.
        """
        if self.n_nodes is None:
            raise ValueError("n_nodes must be provided")

        # Step 1: vector-quantize X into n_nodes Voronoi cells.
        kmeans = KMeans(
            self.n_nodes,
            self.n_iter,
            self.n_local_trials,
            self.random_state,
        )
        kmeans.fit(X)
        centers = cast(NDArray[np.float32], kmeans.cluster_centers_)

        affinity: NDArray[np.float64] = np.empty((self.n_nodes, self.n_nodes))

        # Points of each cell, centered on their own centroid; Fortran order
        # so the BLAS sgemm below works on contiguous columns without copies.
        X_centered = [
            np.array(
                X[kmeans.labels_ == i]
                - cast(NDArray[np.float32], kmeans.cluster_centers_)[i],
                dtype=np.float32,
                order="F",
            )
            for i in range(self.n_nodes)
        ]

        # counts[i, j] = |cell i| + |cell j|, the normalizer of the bridge.
        counts = np.array([X_centered[i].shape[0] for i in range(self.n_nodes)])
        counts = counts[None, :] + counts[:, None]

        # Step 2: bridge affinity — project each cell's points onto the
        # segments joining its centroid to every other centroid.
        for i in range(self.n_nodes):
            segments = np.asfortranarray(centers - centers[i])
            dists = np.einsum("ij,ij->i", segments, segments)
            dists[i] = 1  # avoid 0/0 on the degenerate self-segment

            projs = sgemm(1.0, X_centered[i], segments, trans_b=True)
            np.clip(projs / dists, 0, None, out=projs)
            projs = np.power(projs, self.p)

            affinity[i] = projs.sum(axis=0)

        # Symmetrize, normalize by cell sizes, undo the power p.
        affinity = np.power((affinity + affinity.T) / counts, 1 / self.p)

        affinity = cast(NDArray[np.float64], self.affinity_transform(affinity))

        # Step 3: merge the nodes into n_clusters via spectral clustering.
        spectralclustering = SpectralClustering(
            self.n_clusters, self.n_iter, self.n_local_trials, self.random_state
        )
        spectralclustering.fit(affinity)

        self.eigvals_ = spectralclustering.eigvals_
        self.ngap_ = spectralclustering.ngap_
        self.cluster_centers_ = [
            centers[spectralclustering.labels_ == i] for i in range(self.n_clusters)
        ]

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def fit_select(
        self,
        X: np.ndarray,
        n_nodes_range: Iterable[int] | None = None,
        n_redo: IntStrictlyPositive = 10,
    ) -> dict[int, float]:
        """Selects and fits the best model from a range of possible node counts.

        It evaluates the mean normalized eigengap (ngap) for each candidate.

        For each `n_nodes` in `n_nodes_range`, multiple models are fit to the
        data, and the one with the highest normalized eigengap over `n_redo`
        runs is kept; the node count with the highest mean eigengap wins. The
        method then updates the current instance to use the attributes of the
        best candidate model. Note that `self.random_state` is reseeded as a
        side effect so that every redo uses a fresh initialization.

        Args:
            X (np.ndarray): The input data.
            n_nodes_range (Iterable[int] | None): The range of possible node
                counts.
            n_redo (IntStrictlyPositive): The number of times to run the model.

        Raises:
            ValueError: If neither `n_nodes_range` nor `self.n_nodes` is
                provided, or if `n_nodes_range` is empty.

        Returns:
            dict[int, float]: The mean normalized eigengap for each node count.
        """
        if n_nodes_range is None:
            if self.n_nodes is None:
                raise ValueError("n_nodes_range or self.n_nodes must be provided")
            n_nodes_range = [self.n_nodes]

        rng = np.random.default_rng(self.random_state)
        max_int = np.iinfo(np.int32).max

        best_candidate: SpectralBridges | None = None
        best_mean_ngap = -np.inf
        mean_ngaps: dict[int, float] = {}

        for n_nodes in n_nodes_range:
            candidate = None
            cum_ngap = 0.0

            for _ in range(n_redo):
                model = SpectralBridges(
                    n_clusters=self.n_clusters,
                    n_nodes=n_nodes,
                    p=self.p,
                    alpha=self.alpha,  # was silently dropped before
                    n_iter=self.n_iter,
                    n_local_trials=self.n_local_trials,
                    random_state=self.random_state,
                    affinity_transform=self.affinity_transform,
                )
                model.fit(X)

                cum_ngap += cast(float, model.ngap_)

                # Keep the single best run for this node count.
                if candidate is None or cast(float, model.ngap_) > cast(
                    float, candidate.ngap_
                ):
                    candidate = model

                # Reseed so the next redo explores a new initialization.
                self.random_state = int(rng.integers(max_int + 1))

            mean_ngap = cum_ngap / n_redo
            mean_ngaps[n_nodes] = mean_ngap

            if mean_ngap > best_mean_ngap:
                best_candidate = candidate
                best_mean_ngap = mean_ngap

        # Explicit guard: with an empty n_nodes_range the original code died
        # below with an opaque AttributeError on None.
        if best_candidate is None:
            raise ValueError("n_nodes_range must contain at least one value")

        # Adopt every attribute of the winning model (fitted state included).
        self.__dict__.update(best_candidate.__dict__)

        return mean_ngaps

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def predict(self, x: np.ndarray) -> np.ndarray:
        """Predict the nearest cluster index for each input data point x.

        Args:
            x (np.ndarray): The input data.

        Raises:
            ValueError: If `x` contains inf or NaN values.

        Returns:
            NDArray[np.int32]: The predicted cluster indices.
        """
        if np.isinf(x).any():
            raise ValueError("x must not contain inf values")
        if np.isnan(x).any():
            raise ValueError("x must not contain NaN values")

        centers = cast(list[NDArray[np.float32]], self.cluster_centers_)

        # Flatten the per-cluster node centroids into one matrix; cutoffs
        # mark where each cluster's nodes end in the flattened ordering.
        cluster_centers = np.vstack(centers)
        cluster_cutoffs = np.cumsum([cluster.shape[0] for cluster in centers])

        # Nearest node via FAISS, then map node index -> owning cluster.
        index = faiss.IndexFlatL2(x.shape[1])
        index.add(cluster_centers.astype(np.float32))  # type: ignore
        winners = index.search(x.astype(np.float32), 1)[1].ravel()  # type: ignore

        return cast(
            NDArray[np.int32],
            np.searchsorted(cluster_cutoffs, winners, side="right"),  # type: ignore
        )
@@ -0,0 +1,95 @@
1
+ from typing import Annotated, Protocol, TypeAlias, runtime_checkable
2
+
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+ from pydantic import AfterValidator, ConfigDict, validate_call
6
+
7
+
8
+ # Validators
9
+ def is_strict_pos(x: int | float) -> int | float:
10
+ """Checks if the argument is strictly positive.
11
+
12
+ Args:
13
+ x (int | float): The input number.
14
+
15
+ Raises:
16
+ ValueError: If the number is not strictly positive.
17
+
18
+ Returns:
19
+ int | float | torch.Tensor: The output number or tensor.
20
+ """
21
+ if x <= 0:
22
+ raise ValueError(f"Expected strictly positive number, got {x}")
23
+ return x
24
+
25
+
26
def is_gt_zero_lt_half(x: float) -> float:
    """Validate that the argument lies strictly between 0 and 1/2.

    Args:
        x (float): The input number.

    Raises:
        ValueError: If the number is not strictly between 0 and 1/2.

    Returns:
        float: The validated input number, returned unchanged.
    """
    in_open_interval = 0 < x < 0.5  # noqa: PLR2004
    if not in_open_interval:
        raise ValueError(f"Expected number > 0 and < 1/2, got {x}")
    return x
41
+
42
+
43
+ Num: TypeAlias = int | float
44
+ IntStrictlyPositive = Annotated[int, AfterValidator(is_strict_pos)]
45
+ NumStrictlyPositive = Annotated[Num, AfterValidator(is_strict_pos)]
46
+ FloatGtZeroLtHalf = Annotated[float, AfterValidator(is_gt_zero_lt_half)]
47
+
48
+
49
+ # Protocols
50
@runtime_checkable
class AffinityTransform(Protocol):
    """Protocol for affinity transforms.

    Use this protocol to define custom affinity transforms: any callable
    that maps an affinity matrix to a transformed matrix satisfies it
    (checked structurally; `runtime_checkable` also enables isinstance
    checks against the protocol).
    """

    def __call__(
        self, x: NDArray[np.float32 | np.float64]
    ) -> NDArray[np.float32 | np.float64]:
        """Transform the affinity matrix `x` and return the result."""
        ...
60
+
61
+
62
+ # Transformations
63
+
64
+
65
class ExpQuantileTransform(AffinityTransform):
    """Exponential quantile transform.

    Rescales an affinity matrix with an exponential kernel whose decay rate
    is data-driven: it is chosen so that values spanning the inner
    (alpha, 1 - alpha) quantile range differ by a factor of `mult_factor`
    after the transform.

    Attributes:
        alpha (float): Quantile for affinity matrix computation.
        mult_factor (int | float): Scaling parameter for affinity matrix
            computation.
    """

    alpha: float
    mult_factor: int | float

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def __init__(
        self, alpha: FloatGtZeroLtHalf = 0.1, mult_factor: NumStrictlyPositive = 1e4
    ):
        """Initialize the Exponential quantile transform.

        Args:
            alpha (FloatGtZeroLtHalf): Quantile for affinity matrix computation.
            mult_factor (NumStrictlyPositive): Scaling parameter for affinity
                matrix computation.
        """
        self.alpha = alpha
        self.mult_factor = mult_factor

    def __call__(
        self, x: NDArray[np.float32 | np.float64]
    ) -> NDArray[np.float32 | np.float64]:
        """Apply the exponential quantile transform to `x`."""
        # Spread of the central (1 - 2*alpha) mass of the affinity values.
        lo_q, hi_q = np.quantile(x, [self.alpha, 1.0 - self.alpha])
        decay = np.log(self.mult_factor) / (hi_q - lo_q)
        # Shift by the max so the largest affinity maps to exp(0) = 1.
        shifted = x - x.max()
        return np.exp(decay * shifted)
@@ -0,0 +1,159 @@
1
+ from typing import cast
2
+
3
+ import faiss # type: ignore
4
+ import numpy as np
5
+ from numpy.typing import NDArray
6
+ from pydantic import ConfigDict, validate_call
7
+ from scipy.linalg.blas import sgemm # type: ignore
8
+
9
+
10
class KMeans:
    """K-means clustering using FAISS.

    Centroids are seeded with a k-means++-style initialization implemented
    locally (see `_init_centroids`), then refined by FAISS's Lloyd
    iterations in `fit`.

    Attributes:
        cluster_centers_ (NDArray[np.float32] | None): Coordinates of cluster
            centers; None until `fit` is called.
        labels_ (NDArray[np.int32] | None): Labels of each point (index) in X;
            None until `fit` is called.
    """

    cluster_centers_: NDArray[np.float32] | None
    labels_: NDArray[np.int32] | None

    def __init__(
        self,
        n_clusters: int,
        n_iter: int,
        n_local_trials: int | None,
        random_state: int | None,
    ):
        """Initializes the KMeans class.

        Args:
            n_clusters (int): The number of clusters to form.
            n_iter (int): The number of iterations to run the k-means
                algorithm.
            n_local_trials (int | None): The number of seeding trials for
                centroids initialization; None derives a default from
                n_clusters on first fit.
            random_state (int | None): Determines random number generation for
                centroid initialization.
        """
        self.n_clusters = n_clusters
        self.n_iter = n_iter
        self.n_local_trials = n_local_trials
        self.random_state = random_state
        self.cluster_centers_ = None
        self.labels_ = None

    @staticmethod
    def _dists(
        X: NDArray[np.float32], y: NDArray[np.float32], XX: NDArray[np.float32]
    ) -> NDArray[np.float32]:
        """Computes the pairwise distances between a fixed data matrix and some points.

        Uses the expansion ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2 with a
        single BLAS sgemm for the cross term.

        Args:
            X (NDArray[np.float32]): The fixed data matrix.
            y (NDArray[np.float32]): The non fixed points.
            XX (NDArray[np.float32]): The fixed matrix squared norm
                (column vector, precomputed once by the caller).

        Returns:
            NDArray[np.float32]: The computed pairwise squared distances,
            clipped at 0 to absorb floating-point cancellation.
        """
        yy = np.einsum("ij,ij->i", y, y)
        dists = XX - sgemm(2.0, X, y, trans_b=True) + yy
        np.clip(dists, 0, None, out=dists)
        return dists

    def _init_centroids(self, X: NDArray[np.float32]) -> NDArray[np.float32]:
        """Initializes the centroids in a K-means++ fashion.

        Each new centroid is sampled with probability proportional to the
        squared distance to the nearest existing centroid (D^2 sampling),
        keeping the best of `n_local_trials` candidates per round.

        Args:
            X (NDArray[np.float32]): The fixed data matrix.

        Returns:
            NDArray[np.float32]: The initialized centroids.
        """
        rng = np.random.default_rng(self.random_state)

        centroids = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
        # First centroid: a uniformly random data point.
        centroids[0] = X[rng.integers(X.shape[0])]

        # Squared norms of X, reused by every _dists call below.
        XX = np.einsum("ij,ij->i", X, X)[:, None]

        dists = self._dists(X, centroids[0:1], XX).ravel()
        inertia = dists.sum()

        # NOTE: lazily fills in the default and persists it on the instance
        # (same heuristic as scikit-learn: 2 + log(k) trials).
        if self.n_local_trials is None:
            self.n_local_trials = 2 + int(np.log(self.n_clusters))

        for i in range(1, self.n_clusters):
            # D^2 sampling of candidate centroids.
            candidate_ids = rng.choice(
                X.shape[0], size=self.n_local_trials, p=dists / inertia
            )
            candidates = np.asfortranarray(X[candidate_ids])

            current_candidates_dists = self._dists(X, candidates, XX)
            # Distance to nearest centroid if the candidate were accepted.
            candidates_dists = np.minimum(current_candidates_dists, dists[:, None])

            # Keep the candidate that minimizes total inertia.
            inertias = candidates_dists.sum(axis=0)
            best_inertia = inertias.argmin()
            best_candidate = candidate_ids[best_inertia]
            dists = candidates_dists[:, best_inertia]
            inertia = inertias[best_inertia]

            centroids[i] = X[best_candidate]

        return centroids

    @staticmethod
    def _validate_X(X: NDArray[np.float32 | np.float64]) -> NDArray[np.float32]:
        """Validates and converts the data matrix.

        Args:
            X (NDArray[np.float32 | np.float64]): The fixed data matrix.

        Raises:
            ValueError: If `X` contains inf values.
            ValueError: If `X` contains NaN values.

        Returns:
            NDArray[np.float32]: The validated data matrix, converted to
            float32 in Fortran order (as required by the BLAS calls).
        """
        if np.isinf(X).any():
            raise ValueError("X must not contain inf values")
        if np.isnan(X).any():
            raise ValueError("X must not contain NaN values")

        return np.array(X, dtype=np.float32, order="F")

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def fit(self, X: np.ndarray):
        """Run k-means clustering on the input data X.

        Args:
            X (np.ndarray): Input data matrix to cluster.
        """
        X_f32 = self._validate_X(X)

        index = faiss.IndexFlatL2(X.shape[1])
        kmeans = faiss.Clustering(X.shape[1], self.n_clusters)

        init_centroids = self._init_centroids(X_f32)

        # Hand our k-means++ seeds to FAISS (it stores centroids flat).
        kmeans.centroids.resize(init_centroids.size)
        faiss.copy_array_to_vector(init_centroids.ravel(), kmeans.centroids)  # type: ignore
        kmeans.niter = self.n_iter
        # Disable FAISS's subsampling/rebalancing heuristics: train on all
        # points and allow clusters of any size.
        kmeans.min_points_per_centroid = 0
        kmeans.max_points_per_centroid = -1
        kmeans.train(X_f32, index)  # type: ignore

        self.cluster_centers_ = cast(
            NDArray[np.float32],
            faiss.vector_to_array(kmeans.centroids).reshape(  # type: ignore
                self.n_clusters, X.shape[1]
            ),
        )
        # Final assignment: nearest trained centroid for every point.
        self.labels_ = cast(NDArray[np.int32], index.search(X_f32, 1)[1].ravel())  # type: ignore
@@ -0,0 +1,84 @@
1
+ from typing import cast
2
+
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+ from scipy.sparse.csgraph import laplacian # type: ignore
6
+
7
+ from ._kmeans import KMeans
8
+
9
+
10
class SpectralClustering:
    """Spectral clustering based on Laplacian matrix.

    Attributes:
        n_iter (int): Number of iterations to run the k-means algorithm on
            the spectral embedding.
        n_local_trials (int | None): The number of seeding trials for
            centroids initialization.
        random_state (int | None): Determines random number generation for
            centroid initialization.
        labels_ (NDArray[np.int32] | None): Labels of each point (index) in the
            affinity matrix.
        eigvals_ (NDArray[np.float32 | np.float64] | None): The eigenvalues of
            the (normalized) laplacian matrix.
        ngap_ (float | None): The normalized eigengap.
    """

    n_iter: int
    n_local_trials: int | None
    random_state: int | None
    labels_: NDArray[np.int32] | None
    eigvals_: NDArray[np.float32 | np.float64] | None
    ngap_: float | None

    def __init__(
        self,
        n_clusters: int,
        n_iter: int,
        n_local_trials: int | None,
        random_state: int | None,
    ):
        """Initializes the class.

        Args:
            n_clusters (int): The number of clusters to form.
            n_iter (int): The number of iterations to run the k-means
                algorithm.
            n_local_trials (int | None): The number of seeding trials for
                centroids initialization.
            random_state (int | None): Determines random number generation for
                centroid initialization.
        """
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.n_iter = n_iter
        self.n_local_trials = n_local_trials
        self.labels_ = None
        self.eigvals_ = None
        self.ngap_ = None

    def fit(self, affinity: NDArray[np.float32 | np.float64]):
        """Fit the spectral clustering model on the affinity matrix.

        Args:
            affinity (NDArray[np.float32 | np.float64]): Affinity matrix
                representing pairwise similarity between points. Must have
                more rows than `n_clusters` (the normalized eigengap below
                reads eigenvalue `n_clusters`).
        """
        # Symmetric normalized graph Laplacian of the affinity matrix.
        L = cast(NDArray[np.float32 | np.float64], laplacian(affinity, normed=True))

        # eigh returns eigenvalues in ascending order; the eigenvectors of
        # the n_clusters smallest eigenvalues form the spectral embedding.
        self.eigvals_, eigvecs = cast(
            tuple[NDArray[np.float32 | np.float64], ...],
            np.linalg.eigh(L),  # type: ignore
        )
        eigvecs = eigvecs[:, : self.n_clusters]
        # Row-normalize the embedding so points live on the unit sphere.
        eigvecs /= np.linalg.norm(eigvecs, axis=1)[:, None]
        kmeans = KMeans(
            self.n_clusters, self.n_iter, self.n_local_trials, self.random_state
        )
        kmeans.fit(eigvecs)

        # Normalized eigengap: relative jump between eigenvalue n_clusters
        # and n_clusters - 1; larger means a cleaner n_clusters partition.
        self.ngap_ = (
            self.eigvals_[self.n_clusters] - self.eigvals_[self.n_clusters - 1]
        ) / self.eigvals_[self.n_clusters - 1]
        self.labels_ = kmeans.labels_
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: sbcluster
3
+ Version: 0.1.0
4
+ Summary: Spectral Bridges clustering algorithm
5
+ Author-email: Félix Laplante <felixlaplante0@proton.me>
6
+ Requires-Python: >=3.10
7
+ License-File: LICENSE
8
+ Requires-Dist: numpy
9
+ Requires-Dist: scipy
10
+ Requires-Dist: pydantic
11
+ Requires-Dist: faiss-cpu
12
+ Dynamic: license-file
@@ -0,0 +1,20 @@
1
+ .gitlab-ci.yml
2
+ LICENSE
3
+ README.md
4
+ pyproject.toml
5
+ docs/Makefile
6
+ docs/make.bat
7
+ docs/source/conf.py
8
+ docs/source/index.rst
9
+ docs/source/_static/.gitkeep
10
+ docs/source/_templates/autosummary/class.rst
11
+ sbcluster/__init__.py
12
+ sbcluster/_bridges.py
13
+ sbcluster/_defs.py
14
+ sbcluster/_kmeans.py
15
+ sbcluster/_spectral.py
16
+ sbcluster.egg-info/PKG-INFO
17
+ sbcluster.egg-info/SOURCES.txt
18
+ sbcluster.egg-info/dependency_links.txt
19
+ sbcluster.egg-info/requires.txt
20
+ sbcluster.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ numpy
2
+ scipy
3
+ pydantic
4
+ faiss-cpu
@@ -0,0 +1 @@
1
+ sbcluster
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+