simmetry 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. simmetry-1.0.1/.gitignore +4 -0
  2. simmetry-1.0.1/CHANGELOG.md +22 -0
  3. simmetry-1.0.1/LICENSE +21 -0
  4. simmetry-1.0.1/PKG-INFO +213 -0
  5. simmetry-1.0.1/README.md +176 -0
  6. simmetry-1.0.1/bench/run.py +24 -0
  7. simmetry-1.0.1/pyproject.toml +67 -0
  8. simmetry-1.0.1/simmetry/__init__.py +39 -0
  9. simmetry-1.0.1/simmetry/ann/__init__.py +4 -0
  10. simmetry-1.0.1/simmetry/ann/faiss_.py +49 -0
  11. simmetry-1.0.1/simmetry/ann/hnsw.py +51 -0
  12. simmetry-1.0.1/simmetry/api.py +110 -0
  13. simmetry-1.0.1/simmetry/index.py +76 -0
  14. simmetry-1.0.1/simmetry/points/__init__.py +3 -0
  15. simmetry-1.0.1/simmetry/points/core.py +28 -0
  16. simmetry-1.0.1/simmetry/registry.py +38 -0
  17. simmetry-1.0.1/simmetry/sets/__init__.py +3 -0
  18. simmetry-1.0.1/simmetry/sets/core.py +42 -0
  19. simmetry-1.0.1/simmetry/strings/__init__.py +14 -0
  20. simmetry-1.0.1/simmetry/strings/jaro.py +57 -0
  21. simmetry-1.0.1/simmetry/strings/levenshtein.py +30 -0
  22. simmetry-1.0.1/simmetry/strings/ngrams.py +33 -0
  23. simmetry-1.0.1/simmetry/strings/pairwise.py +56 -0
  24. simmetry-1.0.1/simmetry/utils/numpy_utils.py +12 -0
  25. simmetry-1.0.1/simmetry/vectors/__init__.py +3 -0
  26. simmetry-1.0.1/simmetry/vectors/core.py +47 -0
  27. simmetry-1.0.1/simmetry/vectors/pairwise.py +88 -0
  28. simmetry-1.0.1/tests/test_ann_optional.py +24 -0
  29. simmetry-1.0.1/tests/test_auto_and_index.py +54 -0
  30. simmetry-1.0.1/tests/test_points_sets.py +16 -0
  31. simmetry-1.0.1/tests/test_registry_api.py +19 -0
  32. simmetry-1.0.1/tests/test_strings.py +19 -0
  33. simmetry-1.0.1/tests/test_strings_batch.py +17 -0
  34. simmetry-1.0.1/tests/test_vectors.py +38 -0
@@ -0,0 +1,4 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .pytest_cache/
4
+ .venv/
@@ -0,0 +1,22 @@
1
+ # Changelog
2
+
3
+ All notable changes to **simmetry** will be documented in this file.
4
+
5
+ The format is based on **Keep a Changelog**, and this project adheres to **Semantic Versioning**.
6
+
7
+ ## [1.0.1] - 2026-02-21
8
+ ### Added
9
+ - Optional Numba acceleration for `pairwise(..., metric="euclidean_sim" | "manhattan_sim")` when installed via `simmetry[fast]`.
10
+
11
+ ### Changed
12
+ - Improved validation and error messages for vector dimension mismatches.
13
+ - Fixed `similarity([], [], metric="auto")` to route to string similarity batch behavior.
14
+ - Project cleanup for public/PyPI release packaging.
15
+
16
+ ## [1.0.0] - 2026-02-21
17
+ ### Added
18
+ - Auto similarity (`metric="auto"`) across strings/vectors/points/sets.
19
+ - Batch string APIs (`pairwise_strings`, `topk_strings`).
20
+ - Optional ANN module (`hnswlib` / `faiss-cpu`) via extras.
21
+ - Unified `SimIndex` with `exact` / `hnsw` / `faiss` backends.
22
+ - Composite similarity for dict records (field metrics + weights).
simmetry-1.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,213 @@
1
+ Metadata-Version: 2.4
2
+ Name: simmetry
3
+ Version: 1.0.1
4
+ Summary: Blazing-fast similarity scores for strings, vectors, points, and sets.
5
+ Project-URL: Homepage, https://pypi.org/project/simmetry/
6
+ Project-URL: Repository, https://github.com/algumusrende/simmetry
7
+ Author-email: Ali Can Gumusrende <algumusrende@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: cosine,distance,haversine,jaccard,levenshtein,similarity
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Classifier: Topic :: Software Development :: Libraries
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: numpy>=1.23
25
+ Provides-Extra: ann
26
+ Requires-Dist: hnswlib>=0.8.0; extra == 'ann'
27
+ Provides-Extra: ann-faiss
28
+ Requires-Dist: faiss-cpu>=1.7.4; extra == 'ann-faiss'
29
+ Provides-Extra: ann-hnsw
30
+ Requires-Dist: hnswlib>=0.8.0; extra == 'ann-hnsw'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=8.0; extra == 'dev'
33
+ Requires-Dist: ruff>=0.4; extra == 'dev'
34
+ Provides-Extra: fast
35
+ Requires-Dist: numba>=0.58; extra == 'fast'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # simmetry
39
+
40
+ Blazing-fast similarity scores for **strings**, **vectors**, **points**, and **sets** — with a simple API.
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install simmetry
46
+ pip install "simmetry[fast]"
47
+ ```
48
+
49
+ `simmetry[fast]` enables optional Numba acceleration for `pairwise(..., metric="euclidean_sim")` and `pairwise(..., metric="manhattan_sim")`.
50
+
51
+ ## Quickstart
52
+
53
+ ### One function
54
+ ```python
55
+ from simmetry import similarity
56
+
57
+ similarity("kitten", "sitting", metric="levenshtein")
58
+ similarity([1,2,3], [1,2,4], metric="cosine")
59
+ similarity((41.1, 29.0), (41.2, 29.1), metric="haversine_km")
60
+ similarity({1,2,3}, {2,3,4}, metric="jaccard")
61
+ ```
62
+
63
+ ### Pairwise matrices (fast for vectors)
64
+ ```python
65
+ import numpy as np
66
+ from simmetry import pairwise
67
+
68
+ X = np.random.randn(1000, 128)
69
+ S = pairwise(X, metric="cosine")
70
+ ```
71
+
72
+ ### Top-k search
73
+ ```python
74
+ import numpy as np
75
+ from simmetry import topk
76
+
77
+ X = np.random.randn(5000, 64)
78
+ q = np.random.randn(64)
79
+ idx, scores = topk(q, X, k=10, metric="cosine")
80
+ ```
81
+
82
+ ## Available metrics
83
+
84
+ ```python
85
+ from simmetry import available
86
+ available()
87
+ available("vector")
88
+ available("string")
89
+ available("point")
90
+ available("set")
91
+ ```
92
+
93
+ ### Vectors
94
+ - `cosine`, `dot`, `euclidean_sim`, `manhattan_sim`, `pearson`
95
+
96
+ ### Strings
97
+ - `levenshtein` (normalized similarity)
98
+ - `jaro_winkler`
99
+ - `ngram_jaccard` (character n-gram set Jaccard)
100
+ - `token_jaccard` (whitespace token set Jaccard)
101
+
102
+ ### Points / Geo
103
+ - `euclidean_2d`
104
+ - `haversine_km`
105
+
106
+ ### Sets
107
+ - `jaccard`, `dice`, `overlap`
108
+
109
+ ## License
110
+ MIT
111
+ ## Batch string APIs
112
+
113
+ If you need many string-to-string similarities (e.g., deduping names), use:
114
+
115
+ ```python
116
+ from simmetry.strings import pairwise_strings, topk_strings
117
+
118
+ S = pairwise_strings(["item_one", "item_two"], ["item_one", "item_alt"], metric="jaro_winkler")
119
+ idx, scores = topk_strings("samplecorp", ["samplecorp", "examplefinance", "testgroup"], k=2, metric="levenshtein")
120
+ ```
121
+
122
+ ## ANN top-k (optional, does NOT bloat core)
123
+
124
+ For very large vector corpora (100k+), exact `topk()` can be slow. ANN gives fast approximate results.
125
+
126
+ ### hnswlib (recommended)
127
+ ```bash
128
+ pip install "simmetry[ann-hnsw]"
129
+ ```
130
+
131
+ ```python
132
+ import numpy as np
133
+ from simmetry.ann import build_hnsw
134
+
135
+ X = np.random.randn(200_000, 128).astype("float32")
136
+ X /= np.linalg.norm(X, axis=1, keepdims=True)
137
+
138
+ index = build_hnsw(X, space="cosine")
139
+ labels, distances = index.query(X[0], k=10)
140
+ ```
141
+
142
+ ### faiss
143
+ ```bash
144
+ pip install "simmetry[ann-faiss]"
145
+ ```
146
+
147
+ ```python
148
+ import numpy as np
149
+ from simmetry.ann import build_faiss
150
+
151
+ X = np.random.randn(200_000, 128).astype("float32")
152
+ X /= np.linalg.norm(X, axis=1, keepdims=True)
153
+
154
+ index = build_faiss(X, metric="ip")
155
+ labels, scores = index.query(X[0], k=10)
156
+ ```
157
+
158
+
159
+ ## SimIndex (exact or ANN)
160
+
161
+ Exact search (no extras):
162
+
163
+ ```python
164
+ import numpy as np
165
+ from simmetry import SimIndex
166
+
167
+ X = np.random.randn(50_000, 128).astype("float32")
168
+ index = SimIndex(metric="cosine", backend="exact").add(X)
169
+
170
+ idx, scores = index.query(X[0], k=10)
171
+ ```
172
+
173
+ ANN (optional):
174
+
175
+ ```bash
176
+ pip install "simmetry[ann-hnsw]"
177
+ ```
178
+
179
+ ```python
180
+ import numpy as np
181
+ from simmetry import SimIndex
182
+
183
+ X = np.random.randn(200_000, 128).astype("float32")
184
+ X /= np.linalg.norm(X, axis=1, keepdims=True)
185
+
186
+ index = SimIndex(metric="cosine", backend="hnsw").add(X)
187
+ labels, distances = index.query(X[0], k=10)
188
+ ```
189
+
190
+ ## Auto similarity and composite records
191
+
192
+ Auto metric selection:
193
+
194
+ ```python
195
+ from simmetry import similarity
196
+
197
+ similarity("samplecorp", "sample corp")
198
+ similarity((41.0, 29.0), (41.1, 29.1))
199
+ similarity({1,2,3}, {2,3,4})
200
+ ```
201
+
202
+ Composite similarity over dict fields:
203
+
204
+ ```python
205
+ a = {"name": "Entity One", "city": "CityAlpha", "loc": (41.0, 29.0)}
206
+ b = {"name": "Entity One Extended", "city": "CityAlpha", "loc": (41.01, 28.99)}
207
+
208
+ score = similarity(
209
+ a, b,
210
+ metric={"name": "jaro_winkler", "loc": "haversine_km"},
211
+ weights={"name": 0.7, "loc": 0.3},
212
+ )
213
+ ```
@@ -0,0 +1,176 @@
1
+ # simmetry
2
+
3
+ Blazing-fast similarity scores for **strings**, **vectors**, **points**, and **sets** — with a simple API.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install simmetry
9
+ pip install "simmetry[fast]"
10
+ ```
11
+
12
+ `simmetry[fast]` enables optional Numba acceleration for `pairwise(..., metric="euclidean_sim")` and `pairwise(..., metric="manhattan_sim")`.
13
+
14
+ ## Quickstart
15
+
16
+ ### One function
17
+ ```python
18
+ from simmetry import similarity
19
+
20
+ similarity("kitten", "sitting", metric="levenshtein")
21
+ similarity([1,2,3], [1,2,4], metric="cosine")
22
+ similarity((41.1, 29.0), (41.2, 29.1), metric="haversine_km")
23
+ similarity({1,2,3}, {2,3,4}, metric="jaccard")
24
+ ```
25
+
26
+ ### Pairwise matrices (fast for vectors)
27
+ ```python
28
+ import numpy as np
29
+ from simmetry import pairwise
30
+
31
+ X = np.random.randn(1000, 128)
32
+ S = pairwise(X, metric="cosine")
33
+ ```
34
+
35
+ ### Top-k search
36
+ ```python
37
+ import numpy as np
38
+ from simmetry import topk
39
+
40
+ X = np.random.randn(5000, 64)
41
+ q = np.random.randn(64)
42
+ idx, scores = topk(q, X, k=10, metric="cosine")
43
+ ```
44
+
45
+ ## Available metrics
46
+
47
+ ```python
48
+ from simmetry import available
49
+ available()
50
+ available("vector")
51
+ available("string")
52
+ available("point")
53
+ available("set")
54
+ ```
55
+
56
+ ### Vectors
57
+ - `cosine`, `dot`, `euclidean_sim`, `manhattan_sim`, `pearson`
58
+
59
+ ### Strings
60
+ - `levenshtein` (normalized similarity)
61
+ - `jaro_winkler`
62
+ - `ngram_jaccard` (character n-gram set Jaccard)
63
+ - `token_jaccard` (whitespace token set Jaccard)
64
+
65
+ ### Points / Geo
66
+ - `euclidean_2d`
67
+ - `haversine_km`
68
+
69
+ ### Sets
70
+ - `jaccard`, `dice`, `overlap`
71
+
72
+ ## License
73
+ MIT
74
+ ## Batch string APIs
75
+
76
+ If you need many string-to-string similarities (e.g., deduping names), use:
77
+
78
+ ```python
79
+ from simmetry.strings import pairwise_strings, topk_strings
80
+
81
+ S = pairwise_strings(["item_one", "item_two"], ["item_one", "item_alt"], metric="jaro_winkler")
82
+ idx, scores = topk_strings("samplecorp", ["samplecorp", "examplefinance", "testgroup"], k=2, metric="levenshtein")
83
+ ```
84
+
85
+ ## ANN top-k (optional, does NOT bloat core)
86
+
87
+ For very large vector corpora (100k+), exact `topk()` can be slow. ANN gives fast approximate results.
88
+
89
+ ### hnswlib (recommended)
90
+ ```bash
91
+ pip install "simmetry[ann-hnsw]"
92
+ ```
93
+
94
+ ```python
95
+ import numpy as np
96
+ from simmetry.ann import build_hnsw
97
+
98
+ X = np.random.randn(200_000, 128).astype("float32")
99
+ X /= np.linalg.norm(X, axis=1, keepdims=True)
100
+
101
+ index = build_hnsw(X, space="cosine")
102
+ labels, distances = index.query(X[0], k=10)
103
+ ```
104
+
105
+ ### faiss
106
+ ```bash
107
+ pip install "simmetry[ann-faiss]"
108
+ ```
109
+
110
+ ```python
111
+ import numpy as np
112
+ from simmetry.ann import build_faiss
113
+
114
+ X = np.random.randn(200_000, 128).astype("float32")
115
+ X /= np.linalg.norm(X, axis=1, keepdims=True)
116
+
117
+ index = build_faiss(X, metric="ip")
118
+ labels, scores = index.query(X[0], k=10)
119
+ ```
120
+
121
+
122
+ ## SimIndex (exact or ANN)
123
+
124
+ Exact search (no extras):
125
+
126
+ ```python
127
+ import numpy as np
128
+ from simmetry import SimIndex
129
+
130
+ X = np.random.randn(50_000, 128).astype("float32")
131
+ index = SimIndex(metric="cosine", backend="exact").add(X)
132
+
133
+ idx, scores = index.query(X[0], k=10)
134
+ ```
135
+
136
+ ANN (optional):
137
+
138
+ ```bash
139
+ pip install "simmetry[ann-hnsw]"
140
+ ```
141
+
142
+ ```python
143
+ import numpy as np
144
+ from simmetry import SimIndex
145
+
146
+ X = np.random.randn(200_000, 128).astype("float32")
147
+ X /= np.linalg.norm(X, axis=1, keepdims=True)
148
+
149
+ index = SimIndex(metric="cosine", backend="hnsw").add(X)
150
+ labels, distances = index.query(X[0], k=10)
151
+ ```
152
+
153
+ ## Auto similarity and composite records
154
+
155
+ Auto metric selection:
156
+
157
+ ```python
158
+ from simmetry import similarity
159
+
160
+ similarity("samplecorp", "sample corp")
161
+ similarity((41.0, 29.0), (41.1, 29.1))
162
+ similarity({1,2,3}, {2,3,4})
163
+ ```
164
+
165
+ Composite similarity over dict fields:
166
+
167
+ ```python
168
+ a = {"name": "Entity One", "city": "CityAlpha", "loc": (41.0, 29.0)}
169
+ b = {"name": "Entity One Extended", "city": "CityAlpha", "loc": (41.01, 28.99)}
170
+
171
+ score = similarity(
172
+ a, b,
173
+ metric={"name": "jaro_winkler", "loc": "haversine_km"},
174
+ weights={"name": 0.7, "loc": 0.3},
175
+ )
176
+ ```
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ import numpy as np
6
+
7
+ from simmetry import pairwise, topk
8
+
9
+
10
def main():
    """Run a small timing benchmark for ``pairwise`` and ``topk`` and print the results.

    Timings are taken with ``time.perf_counter()``, a monotonic high-resolution
    clock, so that very short runs are still measured meaningfully.
    """
    X = np.random.randn(5000, 128)

    # Pairwise cosine on the first 1000 rows against themselves.
    t0 = time.perf_counter()
    S = pairwise(X[:1000], X[:1000], metric="cosine")
    t1 = time.perf_counter()
    print("pairwise cosine (1000x1000) seconds:", round(t1 - t0, 4), "shape:", S.shape)

    # Top-10 cosine search of one random query against all 5000 rows.
    q = np.random.randn(128)
    t0 = time.perf_counter()
    idx, scores = topk(q, X, k=10, metric="cosine")
    t1 = time.perf_counter()
    print("topk cosine seconds:", round(t1 - t0, 6), "idx:", idx[:3], "scores:", scores[:3])


if __name__ == "__main__":
    main()
@@ -0,0 +1,67 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.24"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "simmetry"
7
+ version = "1.0.1"
8
+ description = "Blazing-fast similarity scores for strings, vectors, points, and sets."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Ali Can Gumusrende", email = "algumusrende@gmail.com" }]
13
+ keywords = ["similarity", "distance", "cosine", "levenshtein", "jaccard", "haversine"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3 :: Only",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Operating System :: OS Independent",
25
+ "Topic :: Software Development :: Libraries",
26
+ "Topic :: Scientific/Engineering :: Information Analysis",
27
+ ]
28
+ dependencies = [
29
+ "numpy>=1.23",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ fast = [
34
+ "numba>=0.58",
35
+ ]
36
+ dev = [
37
+ "pytest>=8.0",
38
+ "ruff>=0.4",
39
+ ]
40
+
41
+ ann-hnsw = [
42
+ "hnswlib>=0.8.0",
43
+ ]
44
+ ann-faiss = [
45
+ "faiss-cpu>=1.7.4",
46
+ ]
47
+ ann = [
48
+ "hnswlib>=0.8.0",
49
+ ]
50
+
51
+ [project.urls]
52
+ Homepage = "https://pypi.org/project/simmetry/"
53
+ Repository = "https://github.com/algumusrende/simmetry"
54
+
55
+ [tool.hatch.build.targets.wheel]
56
+ packages = ["simmetry"]
57
+
58
+ [tool.ruff]
59
+ line-length = 100
60
+ target-version = "py310"
61
+
62
+ [tool.ruff.lint]
63
+ select = ["E", "F", "I", "B", "UP"]
64
+ ignore = ["E501"]
65
+
66
+ [tool.pytest.ini_options]
67
+ testpaths = ["tests"]
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from .api import pairwise, similarity, topk
4
+ from .index import SimIndex
5
+ from .points.core import euclidean_2d, haversine_km
6
+ from .registry import available, get, register
7
+ from .sets.core import dice, jaccard, overlap
8
+ from .strings.jaro import jaro_winkler
9
+ from .strings.levenshtein import levenshtein
10
+ from .strings.ngrams import ngram_jaccard, token_jaccard
11
+ from .vectors.core import cosine, dot, euclidean_sim, manhattan_sim, pearson
12
+
13
# Register every built-in metric under its public name, grouped by kind.
# Registration order matches the order of the rows below.
for _name, _fn, _kind in (
    ("cosine", cosine, "vector"),
    ("dot", dot, "vector"),
    ("euclidean_sim", euclidean_sim, "vector"),
    ("manhattan_sim", manhattan_sim, "vector"),
    ("pearson", pearson, "vector"),
    ("levenshtein", levenshtein, "string"),
    ("jaro_winkler", jaro_winkler, "string"),
    ("ngram_jaccard", ngram_jaccard, "string"),
    ("token_jaccard", token_jaccard, "string"),
    ("euclidean_2d", euclidean_2d, "point"),
    ("haversine_km", haversine_km, "point"),
    ("jaccard", jaccard, "set"),
    ("dice", dice, "set"),
    ("overlap", overlap, "set"),
):
    register(_name, _fn, kind=_kind)
# Do not leave the loop temporaries behind as module attributes.
del _name, _fn, _kind
30
+
31
+ __all__ = [
32
+ "similarity",
33
+ "SimIndex",
34
+ "pairwise",
35
+ "topk",
36
+ "register",
37
+ "get",
38
+ "available",
39
+ ]
@@ -0,0 +1,4 @@
1
+ from .faiss_ import FaissIndex, build_faiss
2
+ from .hnsw import HNSWIndex, build_hnsw
3
+
4
+ __all__ = ["HNSWIndex", "build_hnsw", "FaissIndex", "build_faiss"]
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ import numpy as np
7
+
8
+
9
def _require_faiss():
    """Import the optional ``faiss`` dependency and return the module.

    Raises:
        ImportError: If importing ``faiss`` fails for any reason. The
            underlying exception is chained as the cause.
    """
    try:
        import faiss as faiss_module
    except Exception as exc:
        hint = 'faiss is not installed. Install with: pip install "simmetry[ann-faiss]"'
        raise ImportError(hint) from exc
    else:
        return faiss_module
17
+
18
+
19
@dataclass
class FaissIndex:
    """Thin wrapper pairing a faiss index object with its build metadata.

    Instances are normally produced by ``build_faiss``.
    """

    dim: int  # length every query vector must have
    metric: Literal["l2", "ip"]  # metric label the index was built with
    index: object  # backend object; ``query`` calls its ``search(q, k)`` method
    n_items: int  # number of vectors recorded when the index was built

    def query(self, q, k: int = 10) -> tuple[np.ndarray, np.ndarray]:
        """Search the index for the ``k`` nearest items to a query vector.

        Args:
            q: Query vector of length ``dim``, or a 2D array of shape
                ``(m, dim)``. Only the results for the first row are returned.
            k: Number of neighbours to request; must be at least 1.

        Returns:
            ``(labels, distances)`` for the first query row, as returned by the
            backend's ``search`` call.

        Raises:
            ValueError: If ``q`` is not 1D/2D, is empty, does not match ``dim``,
                or if ``k`` is smaller than 1.
        """
        q = np.asarray(q, dtype=np.float32)
        if q.ndim == 1:
            q = q.reshape(1, -1)
        if q.ndim != 2:
            raise ValueError("q must be a 1D vector or a 2D array of shape (m, dim).")
        if q.shape[0] == 0:
            raise ValueError("q must contain at least one query vector.")
        # Fail early with a clear message instead of an opaque backend error.
        if q.shape[1] != self.dim:
            raise ValueError(
                f"query has dimension {q.shape[1]}, but the index was built with "
                f"dimension {self.dim}."
            )
        if k < 1:
            raise ValueError("k must be a positive integer.")
        distances, labels = self.index.search(q, k)
        return labels[0], distances[0]
32
+
33
+
34
+ def build_faiss(X, metric: Literal["l2", "ip"] = "ip") -> FaissIndex:
35
+ faiss = _require_faiss()
36
+ X = np.asarray(X, dtype=np.float32)
37
+ if X.ndim != 2:
38
+ raise ValueError("X must be a 2D array of shape (n, dim).")
39
+ n, dim = X.shape
40
+
41
+ if metric == "l2":
42
+ index = faiss.IndexFlatL2(dim)
43
+ elif metric == "ip":
44
+ index = faiss.IndexFlatIP(dim)
45
+ else:
46
+ raise ValueError("metric must be 'l2' or 'ip'.")
47
+
48
+ index.add(X)
49
+ return FaissIndex(dim=dim, metric=metric, index=index, n_items=n)
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ import numpy as np
7
+
8
+
9
def _require_hnswlib():
    """Import the optional ``hnswlib`` dependency and return the module.

    Raises:
        ImportError: If importing ``hnswlib`` fails for any reason. The
            underlying exception is chained as the cause.
    """
    try:
        import hnswlib as hnswlib_module
    except Exception as exc:
        hint = 'hnswlib is not installed. Install with: pip install "simmetry[ann-hnsw]"'
        raise ImportError(hint) from exc
    else:
        return hnswlib_module
17
+
18
+
19
@dataclass
class HNSWIndex:
    """Thin wrapper pairing an hnswlib index object with its build metadata.

    Instances are normally produced by ``build_hnsw``.
    """

    dim: int  # length every query vector must have
    space: Literal["cosine", "l2", "ip"]  # space label the index was built with
    index: object  # backend object; ``query`` calls its ``knn_query(q, k=k)`` method
    n_items: int  # number of vectors recorded when the index was built

    def query(self, q, k: int = 10) -> tuple[np.ndarray, np.ndarray]:
        """Search the index for the ``k`` nearest items to a query vector.

        Args:
            q: Query vector of length ``dim``, or a 2D array of shape
                ``(m, dim)``. Only the results for the first row are returned.
            k: Number of neighbours to request; must be at least 1.

        Returns:
            ``(labels, distances)`` for the first query row, as returned by the
            backend's ``knn_query`` call.

        Raises:
            ValueError: If ``q`` is not 1D/2D, is empty, does not match ``dim``,
                or if ``k`` is smaller than 1.
        """
        q = np.asarray(q, dtype=np.float32)
        if q.ndim == 1:
            q = q.reshape(1, -1)
        if q.ndim != 2:
            raise ValueError("q must be a 1D vector or a 2D array of shape (m, dim).")
        if q.shape[0] == 0:
            raise ValueError("q must contain at least one query vector.")
        # Fail early with a clear message instead of an opaque backend error.
        if q.shape[1] != self.dim:
            raise ValueError(
                f"query has dimension {q.shape[1]}, but the index was built with "
                f"dimension {self.dim}."
            )
        if k < 1:
            raise ValueError("k must be a positive integer.")
        labels, distances = self.index.knn_query(q, k=k)
        return labels[0], distances[0]
32
+
33
+
34
def build_hnsw(
    X,
    space: Literal["cosine", "l2", "ip"] = "cosine",
    ef_construction: int = 200,
    M: int = 16,
    ef: int = 50,
) -> HNSWIndex:
    """Build an hnswlib index over the rows of ``X``.

    Args:
        X: Array-like convertible to a 2D float32 array of shape ``(n, dim)``.
        space: Space name passed to ``hnswlib.Index``.
        ef_construction: Passed (as ``int``) to ``init_index``.
        M: Passed (as ``int``) to ``init_index``.
        ef: Passed (as ``int``) to ``set_ef`` once all items are added.

    Returns:
        An ``HNSWIndex`` holding the populated index and its metadata. Items
        are labelled ``0 .. n-1`` in row order.

    Raises:
        ImportError: If hnswlib cannot be imported.
        ValueError: If ``X`` is not 2D.
    """
    hnswlib = _require_hnswlib()

    data = np.asarray(X, dtype=np.float32)
    if data.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n, dim).")
    n_rows, n_cols = data.shape

    graph = hnswlib.Index(space=space, dim=n_cols)
    graph.init_index(
        max_elements=n_rows,
        ef_construction=int(ef_construction),
        M=int(M),
    )
    # Label each row with its position in X.
    row_labels = np.arange(n_rows, dtype=np.int32)
    graph.add_items(data, row_labels)
    graph.set_ef(int(ef))

    return HNSWIndex(dim=n_cols, space=space, index=graph, n_items=n_rows)