umapers 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- umapers-1.0.0/PKG-INFO +83 -0
- umapers-1.0.0/README.md +55 -0
- umapers-1.0.0/pyproject.toml +48 -0
- umapers-1.0.0/python/umapers/__init__.py +4 -0
- umapers-1.0.0/python/umapers/__init__.pyi +5 -0
- umapers-1.0.0/python/umapers/_api.py +450 -0
- umapers-1.0.0/python/umapers/_api.pyi +365 -0
- umapers-1.0.0/python/umapers/py.typed +1 -0
- umapers-1.0.0/rust_umap/Cargo.lock +360 -0
- umapers-1.0.0/rust_umap/Cargo.toml +18 -0
- umapers-1.0.0/rust_umap/README.md +217 -0
- umapers-1.0.0/rust_umap/benchmarks/eval_aligned_umap.py +861 -0
- umapers-1.0.0/rust_umap/benchmarks/eval_euclidean_fit_regression.py +210 -0
- umapers-1.0.0/rust_umap/benchmarks/eval_inverse_quality.py +265 -0
- umapers-1.0.0/rust_umap/benchmarks/eval_parametric_consistency.py +941 -0
- umapers-1.0.0/rust_umap/benchmarks/eval_sparse_csr_vs_umap_learn.py +814 -0
- umapers-1.0.0/rust_umap/examples/aligned_benchmark.rs +288 -0
- umapers-1.0.0/rust_umap/examples/aligned_demo.rs +112 -0
- umapers-1.0.0/rust_umap/examples/inverse_quality.rs +120 -0
- umapers-1.0.0/rust_umap/examples/parametric_eval.rs +219 -0
- umapers-1.0.0/rust_umap/src/aligned.rs +983 -0
- umapers-1.0.0/rust_umap/src/bin/bench_fit_csv.rs +252 -0
- umapers-1.0.0/rust_umap/src/bin/fit_csv.rs +106 -0
- umapers-1.0.0/rust_umap/src/cli_common.rs +583 -0
- umapers-1.0.0/rust_umap/src/lib.rs +5553 -0
- umapers-1.0.0/rust_umap/src/main.rs +64 -0
- umapers-1.0.0/rust_umap/src/parametric.rs +887 -0
- umapers-1.0.0/rust_umap/src/sparse.rs +648 -0
- umapers-1.0.0/rust_umap/tests/cli_e2e.rs +330 -0
- umapers-1.0.0/umap_rs/Cargo.lock +531 -0
- umapers-1.0.0/umap_rs/Cargo.toml +21 -0
- umapers-1.0.0/umap_rs/README.md +55 -0
- umapers-1.0.0/umap_rs/examples/manual_dense_workflow.py +62 -0
- umapers-1.0.0/umap_rs/examples/manual_help_surface.py +50 -0
- umapers-1.0.0/umap_rs/examples/manual_precomputed_knn.py +68 -0
- umapers-1.0.0/umap_rs/examples/manual_transform_inverse.py +57 -0
- umapers-1.0.0/umap_rs/src/lib.rs +913 -0
- umapers-1.0.0/umap_rs/tests/test_binding.py +727 -0
- umapers-1.0.0/umap_rs/uv.lock +223 -0
umapers-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: umapers
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
13
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
14
|
+
Classifier: Typing :: Typed
|
|
15
|
+
Requires-Dist: numpy>=1.26,<3
|
|
16
|
+
Summary: Python bindings for rust_umap
|
|
17
|
+
Keywords: umap,dimensionality-reduction,manifold-learning,rust,pyo3
|
|
18
|
+
Home-Page: https://github.com/wenjiudaijiugui/umapers
|
|
19
|
+
License: BSD-3-Clause
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
22
|
+
Project-URL: Changelog, https://github.com/wenjiudaijiugui/umapers/blob/main/CHANGELOG.md
|
|
23
|
+
Project-URL: Documentation, https://github.com/wenjiudaijiugui/umapers/tree/main/umap_rs
|
|
24
|
+
Project-URL: Homepage, https://github.com/wenjiudaijiugui/umapers
|
|
25
|
+
Project-URL: Issues, https://github.com/wenjiudaijiugui/umapers/issues
|
|
26
|
+
Project-URL: Repository, https://github.com/wenjiudaijiugui/umapers
|
|
27
|
+
|
|
28
|
+
# umapers
|
|
29
|
+
|
|
30
|
+
Python package `umapers`, backed by `rust_umap` and built with PyO3 + maturin.
|
|
31
|
+
|
|
32
|
+
Version `1.0.0` focuses on IDE-help quality for the public Python API:
|
|
33
|
+
|
|
34
|
+
- the exported surface has useful type hints
|
|
35
|
+
- public methods carry docstrings that explain inputs and outputs
|
|
36
|
+
- `help(umapers.Umap)` and editor hover should be informative
|
|
37
|
+
|
|
38
|
+
The binding remains intentionally thin: Python normalizes arrays and CSR inputs,
|
|
39
|
+
while Rust owns validation and compute-heavy paths whenever practical.
|
|
40
|
+
|
|
41
|
+
## Local build
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
PYTHON_BIN="$(command -v python3 || command -v python)"
|
|
45
|
+
uv venv --python "$PYTHON_BIN" .venv
|
|
46
|
+
uv pip install --python .venv/bin/python --upgrade pip maturin
|
|
47
|
+
uv run --python .venv/bin/python maturin develop --manifest-path umap_rs/Cargo.toml
|
|
48
|
+
uv run --python .venv/bin/python python -I -m pytest -q umap_rs/tests/test_binding.py
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Quick usage
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import numpy as np
|
|
55
|
+
from umapers import Umap
|
|
56
|
+
|
|
57
|
+
x = np.random.default_rng(42).normal(size=(200, 16)).astype(np.float32)
|
|
58
|
+
model = Umap(n_neighbors=15, n_components=2, n_epochs=120, random_seed=42, init="random")
|
|
59
|
+
emb = model.fit_transform(x)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## API layers
|
|
63
|
+
|
|
64
|
+
### Main API
|
|
65
|
+
|
|
66
|
+
Most users should start here:
|
|
67
|
+
|
|
68
|
+
- `Umap`
|
|
69
|
+
- `fit_transform`
|
|
70
|
+
- `Umap.fit`
|
|
71
|
+
- `Umap.fit_transform`
|
|
72
|
+
- `Umap.transform`
|
|
73
|
+
- `Umap.inverse_transform`
|
|
74
|
+
|
|
75
|
+
These methods accept NumPy arrays by default and support the documented `out=`
|
|
76
|
+
buffers where available.
|
|
77
|
+
|
|
78
|
+
### Advanced API
|
|
79
|
+
|
|
80
|
+
`Umap.fit_transform_with_knn(...)` is available for callers who already have a
|
|
81
|
+
precomputed exact or shared kNN graph. It is useful for benchmarks and
|
|
82
|
+
parameter sweeps, but it is not the recommended first-stop quickstart.
|
|
83
|
+
|
umapers-1.0.0/README.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# umapers
|
|
2
|
+
|
|
3
|
+
Python package `umapers`, backed by `rust_umap` and built with PyO3 + maturin.
|
|
4
|
+
|
|
5
|
+
Version `1.0.0` focuses on IDE-help quality for the public Python API:
|
|
6
|
+
|
|
7
|
+
- the exported surface has useful type hints
|
|
8
|
+
- public methods carry docstrings that explain inputs and outputs
|
|
9
|
+
- `help(umapers.Umap)` and editor hover should be informative
|
|
10
|
+
|
|
11
|
+
The binding remains intentionally thin: Python normalizes arrays and CSR inputs,
|
|
12
|
+
while Rust owns validation and compute-heavy paths whenever practical.
|
|
13
|
+
|
|
14
|
+
## Local build
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
PYTHON_BIN="$(command -v python3 || command -v python)"
|
|
18
|
+
uv venv --python "$PYTHON_BIN" .venv
|
|
19
|
+
uv pip install --python .venv/bin/python --upgrade pip maturin
|
|
20
|
+
uv run --python .venv/bin/python maturin develop --manifest-path umap_rs/Cargo.toml
|
|
21
|
+
uv run --python .venv/bin/python python -I -m pytest -q umap_rs/tests/test_binding.py
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import numpy as np
|
|
28
|
+
from umapers import Umap
|
|
29
|
+
|
|
30
|
+
x = np.random.default_rng(42).normal(size=(200, 16)).astype(np.float32)
|
|
31
|
+
model = Umap(n_neighbors=15, n_components=2, n_epochs=120, random_seed=42, init="random")
|
|
32
|
+
emb = model.fit_transform(x)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## API layers
|
|
36
|
+
|
|
37
|
+
### Main API
|
|
38
|
+
|
|
39
|
+
Most users should start here:
|
|
40
|
+
|
|
41
|
+
- `Umap`
|
|
42
|
+
- `fit_transform`
|
|
43
|
+
- `Umap.fit`
|
|
44
|
+
- `Umap.fit_transform`
|
|
45
|
+
- `Umap.transform`
|
|
46
|
+
- `Umap.inverse_transform`
|
|
47
|
+
|
|
48
|
+
These methods accept NumPy arrays by default and support the documented `out=`
|
|
49
|
+
buffers where available.
|
|
50
|
+
|
|
51
|
+
### Advanced API
|
|
52
|
+
|
|
53
|
+
`Umap.fit_transform_with_knn(...)` is available for callers who already have a
|
|
54
|
+
precomputed exact or shared kNN graph. It is useful for benchmarks and
|
|
55
|
+
parameter sweeps, but it is not the recommended first-stop quickstart.
|
|
umapers-1.0.0/pyproject.toml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["maturin>=1.6,<2.0"]
|
|
3
|
+
build-backend = "maturin"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "umapers"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Python bindings for rust_umap"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "BSD-3-Clause" }
|
|
12
|
+
keywords = ["umap", "dimensionality-reduction", "manifold-learning", "rust", "pyo3"]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"numpy>=1.26,<3",
|
|
15
|
+
]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Programming Language :: Rust",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
|
26
|
+
"License :: OSI Approved :: BSD License",
|
|
27
|
+
"Typing :: Typed",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/wenjiudaijiugui/umapers"
|
|
32
|
+
Repository = "https://github.com/wenjiudaijiugui/umapers"
|
|
33
|
+
Issues = "https://github.com/wenjiudaijiugui/umapers/issues"
|
|
34
|
+
Changelog = "https://github.com/wenjiudaijiugui/umapers/blob/main/CHANGELOG.md"
|
|
35
|
+
Documentation = "https://github.com/wenjiudaijiugui/umapers/tree/main/umap_rs"
|
|
36
|
+
|
|
37
|
+
[tool.maturin]
|
|
38
|
+
module-name = "umapers._umapers"
|
|
39
|
+
exclude = [
|
|
40
|
+
"python/umapers/__pycache__/*",
|
|
41
|
+
]
|
|
42
|
+
include = [
|
|
43
|
+
"python/umapers/__init__.pyi",
|
|
44
|
+
"python/umapers/_api.pyi",
|
|
45
|
+
"python/umapers/py.typed",
|
|
46
|
+
]
|
|
47
|
+
manifest-path = "umap_rs/Cargo.toml"
|
|
48
|
+
python-source = "python"
|
|
umapers-1.0.0/python/umapers/_api.py
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, TypedDict
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from ._umapers import UmapCore
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UmapKwargs(TypedDict, total=False):
    """Keyword arguments accepted by `Umap(...)` and `fit_transform(..., **kwargs)`.

    The class is declared with ``total=False``, so every key is optional.
    The key names mirror the keyword-only parameters of ``Umap.__init__``.
    """

    # Neighborhood graph size and embedding dimension.
    n_neighbors: int
    n_components: int
    # Optimization epochs; ``None`` is accepted as a value.
    n_epochs: int | None
    # Distance metric name.
    metric: str
    # UMAP hyperparameters.
    learning_rate: float
    min_dist: float
    spread: float
    local_connectivity: float
    set_op_mix_ratio: float
    repulsion_strength: float
    negative_sample_rate: int
    random_seed: int
    init: str
    # Nearest-neighbor search mode; `_normalize_ann_mode` accepts
    # "auto", "exact" and "approximate" (case-insensitive).
    ann_mode: str
    # Approximate-kNN switch and tuning parameters.
    use_approximate_knn: bool
    approx_knn_candidates: int
    approx_knn_iters: int
    approx_knn_threshold: int
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _as_f32_matrix(x: Any, name: str) -> np.ndarray:
    """Coerce ``x`` to a C-ordered ``float32`` array and require rank 2.

    Parameters
    ----------
    x:
        Any array-like accepted by ``numpy.asarray``.
    name:
        Argument name used in the error message.

    Returns
    -------
    numpy.ndarray
        The converted matrix. ``numpy.asarray`` hands back the input object
        itself when it already satisfies the dtype and layout request.

    Raises
    ------
    ValueError
        If the converted array is not two-dimensional.
    """
    matrix = np.asarray(x, dtype=np.float32, order="C")
    rank = matrix.ndim
    if rank == 2:
        return matrix
    raise ValueError(f"{name} must be a 2D array, got ndim={rank}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _as_knn_indices(x: Any, name: str) -> np.ndarray:
    """Coerce ``x`` to a C-ordered ``int64`` array and require rank 2.

    Parameters
    ----------
    x:
        Array-like of neighbor indices accepted by ``numpy.asarray``.
    name:
        Argument name used in the error message.

    Returns
    -------
    numpy.ndarray
        The converted index matrix.

    Raises
    ------
    ValueError
        If the converted array is not two-dimensional.
    """
    index_matrix = np.asarray(x, dtype=np.int64, order="C")
    rank = index_matrix.ndim
    if rank == 2:
        return index_matrix
    raise ValueError(f"{name} must be a 2D array, got ndim={rank}")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _maybe_as_csr_parts(x: Any, name: str) -> tuple[np.ndarray, np.ndarray, np.ndarray, int, int] | None:
    """Split a sparse matrix into raw CSR arrays, or return ``None``.

    Detection is duck-typed so SciPy is not required: an object counts as CSR
    when its ``format`` attribute equals ``"csr"``. Objects that advertise a
    different string ``format`` and expose a callable ``tocsr()`` (for example
    COO or CSC containers) are converted to CSR first. Without this they
    would fall through to the dense path and fail with an unrelated
    conversion error.

    Parameters
    ----------
    x:
        Candidate input. Anything that is not sparse-like yields ``None`` so
        the caller can take the dense path.
    name:
        Argument name used in error messages.

    Returns
    -------
    tuple or None
        ``(indptr, indices, data, n_rows, n_cols)`` with ``indptr`` and
        ``indices`` as C-ordered ``int64`` arrays and ``data`` as a C-ordered
        ``float32`` array, or ``None`` when ``x`` is not sparse-like.

    Raises
    ------
    ValueError
        If the shape is missing or not two-dimensional, if there are no
        columns, if any CSR array is not 1D, or if ``indices`` and ``data``
        differ in length.
    """
    fmt = getattr(x, "format", None)
    if fmt != "csr":
        to_csr = getattr(x, "tocsr", None)
        # Requiring a *string* format keeps unrelated objects out: ``str``
        # instances expose ``format`` as a method, not a layout tag.
        if not isinstance(fmt, str) or not callable(to_csr):
            return None
        x = to_csr()
        if getattr(x, "format", None) != "csr":
            return None

    shape = getattr(x, "shape", None)
    if shape is None or len(shape) != 2:
        raise ValueError(f"{name} must be a 2D CSR matrix")

    n_rows, n_cols = int(shape[0]), int(shape[1])
    if n_cols <= 0:
        raise ValueError(f"{name} must have at least one column")

    indptr = np.asarray(x.indptr, dtype=np.int64, order="C")
    indices = np.asarray(x.indices, dtype=np.int64, order="C")
    data = np.asarray(x.data, dtype=np.float32, order="C")
    if indptr.ndim != 1 or indices.ndim != 1 or data.ndim != 1:
        raise ValueError(f"{name} CSR arrays must be 1D")
    if indices.shape[0] != data.shape[0]:
        raise ValueError(f"{name} CSR indices/data length mismatch")
    return indptr, indices, data, n_rows, n_cols
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _as_out_buffer(out: Any, shape: tuple[int, int]) -> np.ndarray:
    """Validate a caller-supplied output buffer and return it unchanged.

    Parameters
    ----------
    out:
        Candidate buffer; it must be a NumPy ``ndarray``.
    shape:
        Required ``(rows, cols)`` shape.

    Returns
    -------
    numpy.ndarray
        ``out`` itself, once every check has passed.

    Raises
    ------
    TypeError
        If ``out`` is not an ``ndarray`` or its dtype is not ``float32``.
    ValueError
        If ``out`` is not 2D, not C-contiguous, not writeable, or has a
        shape different from ``shape``.
    """
    if not isinstance(out, np.ndarray):
        raise TypeError("out must be a NumPy ndarray")

    actual_dtype = out.dtype
    if actual_dtype != np.float32:
        raise TypeError(f"out dtype must be float32, got {actual_dtype}")

    rank = out.ndim
    if rank != 2:
        raise ValueError(f"out must be 2D, got ndim={rank}")

    layout = out.flags
    if not layout.c_contiguous:
        raise ValueError("out must be C-contiguous")
    if not layout.writeable:
        raise ValueError("out must be writeable")

    actual_shape = tuple(out.shape)
    if actual_shape != tuple(shape):
        raise ValueError(f"output buffer shape mismatch: expected {shape}, got {actual_shape}")
    return out
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _normalize_ann_mode(
    ann_mode: Any,
    use_approximate_knn: bool,
    approx_knn_threshold: int,
) -> tuple[str, bool, int]:
    """Resolve ``ann_mode`` into concrete approximate-kNN settings.

    Parameters
    ----------
    ann_mode:
        Mode selector; it is stringified and lower-cased before matching.
    use_approximate_knn:
        Flag passed through unchanged when the mode is ``auto``.
    approx_knn_threshold:
        Threshold passed through unchanged for ``auto`` and ``exact``.

    Returns
    -------
    tuple[str, bool, int]
        ``(mode, use_approximate_knn, approx_knn_threshold)``. ``exact``
        forces the flag to ``False``; ``approximate`` forces the flag to
        ``True`` and the threshold to ``0``.

    Raises
    ------
    ValueError
        If the mode is not ``auto``, ``exact`` or ``approximate``.
    """
    mode = str(ann_mode).lower()
    settings_by_mode = {
        "auto": (use_approximate_knn, approx_knn_threshold),
        "exact": (False, approx_knn_threshold),
        "approximate": (True, 0),
    }
    settings = settings_by_mode.get(mode)
    if settings is None:
        raise ValueError(f"unsupported ann_mode '{ann_mode}', expected auto|exact|approximate")
    approximate, threshold = settings
    return mode, approximate, threshold
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class Umap:
    """Thin Python front end for the Rust UMAP core.

    This layer only prepares inputs: it converts array-likes to the dtypes
    and layouts the core expects, recognizes CSR sparse input, checks
    optional output buffers, and then delegates fitting and inference to
    ``UmapCore``.

    Example
    -------
    >>> import numpy as np
    >>> from umapers import Umap
    >>> x = np.random.default_rng(42).normal(size=(100, 8)).astype(np.float32)
    >>> emb = Umap(n_neighbors=15, n_components=2).fit_transform(x)
    """

    def __init__(
        self,
        *,
        n_neighbors: int = 15,
        n_components: int = 2,
        n_epochs: int | None = None,
        metric: str = "euclidean",
        learning_rate: float = 1.0,
        min_dist: float = 0.1,
        spread: float = 1.0,
        local_connectivity: float = 1.0,
        set_op_mix_ratio: float = 1.0,
        repulsion_strength: float = 1.0,
        negative_sample_rate: int = 5,
        random_seed: int = 42,
        init: str = "spectral",
        ann_mode: str = "auto",
        use_approximate_knn: bool = True,
        approx_knn_candidates: int = 30,
        approx_knn_iters: int = 10,
        approx_knn_threshold: int = 4096,
    ) -> None:
        """Configure a UMAP model and build the underlying Rust core.

        Parameters
        ----------
        n_neighbors:
            Number of neighbors used to build the neighborhood graph.
        n_components:
            Dimension of the output embedding.
        n_epochs:
            Number of optimization epochs; ``None`` defers to the Rust
            core's internal default.
        metric:
            Distance metric for dense input and query transforms.
        learning_rate, min_dist, spread, local_connectivity,
        set_op_mix_ratio, repulsion_strength, negative_sample_rate,
        random_seed, init:
            Standard UMAP hyperparameters, forwarded to the Rust core.
        ann_mode:
            Python-side shortcut for nearest-neighbor behavior: one of
            ``auto``, ``exact`` or ``approximate``.
        use_approximate_knn:
            Approximate-kNN switch honored when ``ann_mode="auto"``.
        approx_knn_candidates, approx_knn_iters, approx_knn_threshold:
            Approximate-kNN tuning parameters forwarded to the Rust core.

        Raises
        ------
        ValueError
            If ``ann_mode`` is not one of the supported values.

        Examples
        --------
        >>> import numpy as np
        >>> from umapers import Umap
        >>> x = np.random.default_rng(42).normal(size=(200, 16)).astype(np.float32)
        >>> model = Umap(n_neighbors=15, n_components=2, init="random")
        >>> emb = model.fit_transform(x)
        >>> emb.shape
        (200, 2)
        """
        # Integer-coerced copies are kept on the instance; the core receives
        # the values exactly as the caller supplied them.
        self.n_neighbors = int(n_neighbors)
        self.n_components = int(n_components)

        resolved_mode, approx_enabled, approx_threshold = _normalize_ann_mode(
            ann_mode,
            use_approximate_knn,
            approx_knn_threshold,
        )
        self.ann_mode = resolved_mode

        core_options = {
            "n_neighbors": n_neighbors,
            "n_components": n_components,
            "n_epochs": n_epochs,
            "metric": metric,
            "learning_rate": learning_rate,
            "min_dist": min_dist,
            "spread": spread,
            "local_connectivity": local_connectivity,
            "set_op_mix_ratio": set_op_mix_ratio,
            "repulsion_strength": repulsion_strength,
            "negative_sample_rate": negative_sample_rate,
            "random_seed": random_seed,
            "init": init,
            "use_approximate_knn": approx_enabled,
            "approx_knn_candidates": approx_knn_candidates,
            "approx_knn_iters": approx_knn_iters,
            "approx_knn_threshold": approx_threshold,
        }
        self._core = UmapCore(**core_options)

    def fit(self, data: Any) -> "Umap":
        """Fit the model on dense or CSR input and return ``self``.

        Parameters
        ----------
        data:
            Dense input is converted to a C-contiguous ``float32`` matrix of
            shape ``(n_samples, n_features)``. CSR sparse input is split
            into its raw arrays and handed to the core's sparse entry point.

        Returns
        -------
        Umap
            This instance, so calls can be chained.
        """
        sparse_parts = _maybe_as_csr_parts(data, "data")
        if sparse_parts is None:
            dense = _as_f32_matrix(data, "data")
            self._core.fit(dense)
        else:
            indptr, indices, values, _n_rows, n_cols = sparse_parts
            self._core.fit_sparse_csr(indptr, indices, values, n_cols)
        return self

    def fit_transform(self, data: Any, *, out: np.ndarray | None = None) -> np.ndarray:
        """Fit the model and return the embedding of the training data.

        Parameters
        ----------
        data:
            Dense input of shape ``(n_samples, n_features)``, converted to
            ``float32``; or CSR sparse input.
        out:
            Optional writable, C-contiguous ``float32`` buffer of shape
            ``(n_samples, n_components)``. When given, the core writes into
            it and that same array is returned.

        Returns
        -------
        numpy.ndarray
            The fitted embedding with shape ``(n_samples, n_components)``.
        """
        sparse_parts = _maybe_as_csr_parts(data, "data")

        if sparse_parts is None:
            dense = _as_f32_matrix(data, "data")
            target_shape = (dense.shape[0], self.n_components)
            if out is not None:
                buffer = _as_out_buffer(out, target_shape)
                self._core.fit_transform_into(dense, buffer)
                return buffer
            return self._core.fit_transform(dense)

        indptr, indices, values, n_rows, n_cols = sparse_parts
        target_shape = (n_rows, self.n_components)
        if out is not None:
            buffer = _as_out_buffer(out, target_shape)
            self._core.fit_transform_sparse_csr_into(indptr, indices, values, n_cols, buffer)
            return buffer
        return self._core.fit_transform_sparse_csr(indptr, indices, values, n_cols)

    def fit_transform_with_knn(
        self,
        data: Any,
        knn_indices: Any,
        knn_dists: Any,
        *,
        knn_metric: str = "euclidean",
        validate_precomputed: bool = True,
        out: np.ndarray | None = None,
    ) -> np.ndarray:
        """Fit from a precomputed kNN graph and return the embedding.

        This advanced entry point is for callers who already hold an exact
        or shared kNN graph, for instance for benchmark parity or when
        plugging in an external neighbor search. It is not the default
        quickstart path.

        Parameters
        ----------
        data:
            Dense training data of shape ``(n_samples, n_features)``,
            converted to ``float32``.
        knn_indices:
            Neighbor indices of shape ``(n_samples, k)``; converted to
            ``int64``.
        knn_dists:
            Neighbor distances of shape ``(n_samples, k)``; converted to
            ``float32``.
        knn_metric:
            Metric name of the precomputed graph. It must match the model
            metric.
        validate_precomputed:
            Forwarded to the Rust core. When ``True`` the core validates the
            precomputed graph before fitting; this binding only normalizes
            dtypes and layouts.
        out:
            Optional writable ``float32`` buffer of shape
            ``(n_samples, n_components)``.

        Returns
        -------
        numpy.ndarray
            The fitted embedding with shape ``(n_samples, n_components)``.

        Example
        -------
        >>> import numpy as np
        >>> from sklearn.neighbors import NearestNeighbors
        >>> from umapers import Umap
        >>> x = np.random.default_rng(42).normal(size=(64, 8)).astype(np.float32)
        >>> nbrs = NearestNeighbors(n_neighbors=16, algorithm="brute", metric="euclidean")
        >>> nbrs.fit(x)
        >>> dists, idx = nbrs.kneighbors(x)
        >>> emb = Umap(n_neighbors=15, metric="euclidean").fit_transform_with_knn(
        ...     x,
        ...     idx[:, 1:16].astype(np.int64),
        ...     dists[:, 1:16].astype(np.float32),
        ... )
        """
        dense = _as_f32_matrix(data, "data")
        neighbor_ids = _as_knn_indices(knn_indices, "knn_indices")
        neighbor_dists = _as_f32_matrix(knn_dists, "knn_dists")
        target_shape = (dense.shape[0], self.n_components)

        if out is not None:
            buffer = _as_out_buffer(out, target_shape)
            self._core.fit_transform_with_knn_into(
                dense,
                neighbor_ids,
                neighbor_dists,
                buffer,
                knn_metric,
                validate_precomputed,
            )
            return buffer

        return self._core.fit_transform_with_knn(
            dense,
            neighbor_ids,
            neighbor_dists,
            knn_metric,
            validate_precomputed,
        )

    def transform(self, query: Any, *, out: np.ndarray | None = None) -> np.ndarray:
        """Project new dense samples into the learned embedding space.

        Parameters
        ----------
        query:
            Dense input of shape ``(n_samples, n_features)``, converted to a
            C-contiguous ``float32`` matrix.
        out:
            Optional writable ``float32`` buffer of shape
            ``(n_samples, n_components)``.

        Returns
        -------
        numpy.ndarray
            The projected embedding.

        Example
        -------
        >>> import numpy as np
        >>> from umapers import Umap
        >>> x = np.random.default_rng(42).normal(size=(100, 8)).astype(np.float32)
        >>> model = Umap(n_neighbors=15, n_components=2).fit(x)
        >>> query_emb = model.transform(x[:10])
        """
        dense_query = _as_f32_matrix(query, "query")
        target_shape = (dense_query.shape[0], self.n_components)
        if out is not None:
            buffer = _as_out_buffer(out, target_shape)
            self._core.transform_into(dense_query, buffer)
            return buffer
        return self._core.transform(dense_query)

    def inverse_transform(self, embedded_query: Any, *, out: np.ndarray | None = None) -> np.ndarray:
        """Map embedded samples back to the original feature space.

        Parameters
        ----------
        embedded_query:
            Dense embedding of shape ``(n_samples, n_components)``,
            converted to ``float32``.
        out:
            Optional writable ``float32`` buffer of shape
            ``(n_samples, n_features)``. The feature count is read from the
            core, so the model must already be fit when ``out`` is used.

        Returns
        -------
        numpy.ndarray
            Reconstructed samples in the original feature space.

        Raises
        ------
        RuntimeError
            If ``out`` is given while the core reports no feature count.

        Example
        -------
        >>> import numpy as np
        >>> from umapers import Umap
        >>> x = np.random.default_rng(42).normal(size=(100, 8)).astype(np.float32)
        >>> model = Umap(n_neighbors=15, n_components=2).fit(x)
        >>> emb = model.transform(x[:10])
        >>> x_rec = model.inverse_transform(emb)
        """
        embedded = _as_f32_matrix(embedded_query, "embedded_query")
        if out is None:
            return self._core.inverse_transform(embedded)

        feature_count = self._core.n_features
        if feature_count is None:
            raise RuntimeError("model must be fit before inverse_transform(out=...)")

        buffer = _as_out_buffer(out, (embedded.shape[0], feature_count))
        self._core.inverse_transform_into(embedded, buffer)
        return buffer
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def fit_transform(data: Any, **kwargs: Any) -> np.ndarray:
    """Embed a dataset with a throwaway model in a single call.

    A fresh ``Umap`` is built from ``kwargs`` and the input is sent to the
    core's stateless fit-and-transform entry points.

    Parameters
    ----------
    data:
        Dense or CSR input, as accepted by ``Umap.fit_transform``.
    **kwargs:
        Hyperparameters forwarded to ``Umap(...)``. Common keys include
        ``n_neighbors``, ``n_components``, ``metric``, ``init``, and
        ``random_seed``.

    Returns
    -------
    numpy.ndarray
        Embedding with shape ``(n_samples, n_components)`` and dtype
        ``float32``.

    Examples
    --------
    >>> import numpy as np
    >>> from umapers import fit_transform
    >>> x = np.random.default_rng(42).normal(size=(200, 16)).astype(np.float32)
    >>> emb = fit_transform(x, n_neighbors=15, n_components=2, init="random")
    >>> emb.shape
    (200, 2)
    """
    core = Umap(**kwargs)._core
    sparse_parts = _maybe_as_csr_parts(data, "data")
    if sparse_parts is None:
        dense = _as_f32_matrix(data, "data")
        return core.fit_transform_stateless(dense)

    indptr, indices, values, _n_rows, n_cols = sparse_parts
    return core.fit_transform_sparse_csr_stateless(indptr, indices, values, n_cols)
|