simmetry 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- simmetry-1.0.1/.gitignore +4 -0
- simmetry-1.0.1/CHANGELOG.md +22 -0
- simmetry-1.0.1/LICENSE +21 -0
- simmetry-1.0.1/PKG-INFO +213 -0
- simmetry-1.0.1/README.md +176 -0
- simmetry-1.0.1/bench/run.py +24 -0
- simmetry-1.0.1/pyproject.toml +67 -0
- simmetry-1.0.1/simmetry/__init__.py +39 -0
- simmetry-1.0.1/simmetry/ann/__init__.py +4 -0
- simmetry-1.0.1/simmetry/ann/faiss_.py +49 -0
- simmetry-1.0.1/simmetry/ann/hnsw.py +51 -0
- simmetry-1.0.1/simmetry/api.py +110 -0
- simmetry-1.0.1/simmetry/index.py +76 -0
- simmetry-1.0.1/simmetry/points/__init__.py +3 -0
- simmetry-1.0.1/simmetry/points/core.py +28 -0
- simmetry-1.0.1/simmetry/registry.py +38 -0
- simmetry-1.0.1/simmetry/sets/__init__.py +3 -0
- simmetry-1.0.1/simmetry/sets/core.py +42 -0
- simmetry-1.0.1/simmetry/strings/__init__.py +14 -0
- simmetry-1.0.1/simmetry/strings/jaro.py +57 -0
- simmetry-1.0.1/simmetry/strings/levenshtein.py +30 -0
- simmetry-1.0.1/simmetry/strings/ngrams.py +33 -0
- simmetry-1.0.1/simmetry/strings/pairwise.py +56 -0
- simmetry-1.0.1/simmetry/utils/numpy_utils.py +12 -0
- simmetry-1.0.1/simmetry/vectors/__init__.py +3 -0
- simmetry-1.0.1/simmetry/vectors/core.py +47 -0
- simmetry-1.0.1/simmetry/vectors/pairwise.py +88 -0
- simmetry-1.0.1/tests/test_ann_optional.py +24 -0
- simmetry-1.0.1/tests/test_auto_and_index.py +54 -0
- simmetry-1.0.1/tests/test_points_sets.py +16 -0
- simmetry-1.0.1/tests/test_registry_api.py +19 -0
- simmetry-1.0.1/tests/test_strings.py +19 -0
- simmetry-1.0.1/tests/test_strings_batch.py +17 -0
- simmetry-1.0.1/tests/test_vectors.py +38 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to **simmetry** will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on **Keep a Changelog**, and this project adheres to **Semantic Versioning**.
|
|
6
|
+
|
|
7
|
+
## [1.0.1] - 2026-02-21
|
|
8
|
+
### Added
|
|
9
|
+
- Optional Numba acceleration for `pairwise(..., metric="euclidean_sim" | "manhattan_sim")` when installed via `simmetry[fast]`.
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
- Improved validation and error messages for vector dimension mismatches.
|
|
13
|
+
- Fixed `similarity([], [], metric="auto")` so that two empty lists are routed to the batch string-similarity path.
|
|
14
|
+
- Project cleanup for public/PyPI release packaging.
|
|
15
|
+
|
|
16
|
+
## [1.0.0] - 2026-02-21
|
|
17
|
+
### Added
|
|
18
|
+
- Auto similarity (`metric="auto"`) across strings/vectors/points/sets.
|
|
19
|
+
- Batch string APIs (`pairwise_strings`, `topk_strings`).
|
|
20
|
+
- Optional ANN module (`hnswlib` / `faiss-cpu`) via extras.
|
|
21
|
+
- Unified `SimIndex` with `exact` / `hnsw` / `faiss` backends.
|
|
22
|
+
- Composite similarity for dict records (field metrics + weights).
|
simmetry-1.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
simmetry-1.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: simmetry
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Blazing-fast similarity scores for strings, vectors, points, and sets.
|
|
5
|
+
Project-URL: Homepage, https://pypi.org/project/simmetry/
|
|
6
|
+
Project-URL: Repository, https://github.com/algumusrende/simmetry
|
|
7
|
+
Author-email: Ali Can Gumusrende <algumusrende@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: cosine,distance,haversine,jaccard,levenshtein,similarity
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: numpy>=1.23
|
|
25
|
+
Provides-Extra: ann
|
|
26
|
+
Requires-Dist: hnswlib>=0.8.0; extra == 'ann'
|
|
27
|
+
Provides-Extra: ann-faiss
|
|
28
|
+
Requires-Dist: faiss-cpu>=1.7.4; extra == 'ann-faiss'
|
|
29
|
+
Provides-Extra: ann-hnsw
|
|
30
|
+
Requires-Dist: hnswlib>=0.8.0; extra == 'ann-hnsw'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
34
|
+
Provides-Extra: fast
|
|
35
|
+
Requires-Dist: numba>=0.58; extra == 'fast'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# simmetry
|
|
39
|
+
|
|
40
|
+
Blazing-fast similarity scores for **strings**, **vectors**, **points**, and **sets** — with a simple API.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install simmetry
|
|
46
|
+
pip install "simmetry[fast]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
`simmetry[fast]` enables optional Numba acceleration for `pairwise(..., metric="euclidean_sim")` and `pairwise(..., metric="manhattan_sim")`.
|
|
50
|
+
|
|
51
|
+
## Quickstart
|
|
52
|
+
|
|
53
|
+
### One function
|
|
54
|
+
```python
|
|
55
|
+
from simmetry import similarity
|
|
56
|
+
|
|
57
|
+
similarity("kitten", "sitting", metric="levenshtein")
|
|
58
|
+
similarity([1,2,3], [1,2,4], metric="cosine")
|
|
59
|
+
similarity((41.1, 29.0), (41.2, 29.1), metric="haversine_km")
|
|
60
|
+
similarity({1,2,3}, {2,3,4}, metric="jaccard")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Pairwise matrices (fast for vectors)
|
|
64
|
+
```python
|
|
65
|
+
import numpy as np
|
|
66
|
+
from simmetry import pairwise
|
|
67
|
+
|
|
68
|
+
X = np.random.randn(1000, 128)
|
|
69
|
+
S = pairwise(X, metric="cosine")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Top-k search
|
|
73
|
+
```python
|
|
74
|
+
import numpy as np
|
|
75
|
+
from simmetry import topk
|
|
76
|
+
|
|
77
|
+
X = np.random.randn(5000, 64)
|
|
78
|
+
q = np.random.randn(64)
|
|
79
|
+
idx, scores = topk(q, X, k=10, metric="cosine")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Available metrics
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from simmetry import available
|
|
86
|
+
available()
|
|
87
|
+
available("vector")
|
|
88
|
+
available("string")
|
|
89
|
+
available("point")
|
|
90
|
+
available("set")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Vectors
|
|
94
|
+
- `cosine`, `dot`, `euclidean_sim`, `manhattan_sim`, `pearson`
|
|
95
|
+
|
|
96
|
+
### Strings
|
|
97
|
+
- `levenshtein` (normalized similarity)
|
|
98
|
+
- `jaro_winkler`
|
|
99
|
+
- `ngram_jaccard` (character n-gram set Jaccard)
|
|
100
|
+
- `token_jaccard` (whitespace token set Jaccard)
|
|
101
|
+
|
|
102
|
+
### Points / Geo
|
|
103
|
+
- `euclidean_2d`
|
|
104
|
+
- `haversine_km`
|
|
105
|
+
|
|
106
|
+
### Sets
|
|
107
|
+
- `jaccard`, `dice`, `overlap`
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
MIT
|
|
111
|
+
## Batch string APIs
|
|
112
|
+
|
|
113
|
+
If you need many string-to-string similarities (e.g., deduping names), use:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from simmetry.strings import pairwise_strings, topk_strings
|
|
117
|
+
|
|
118
|
+
S = pairwise_strings(["item_one", "item_two"], ["item_one", "item_alt"], metric="jaro_winkler")
|
|
119
|
+
idx, scores = topk_strings("samplecorp", ["samplecorp", "examplefinance", "testgroup"], k=2, metric="levenshtein")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## ANN top-k (optional, does NOT bloat core)
|
|
123
|
+
|
|
124
|
+
For very large vector corpora (100k+), exact `topk()` can be slow. ANN gives fast approximate results.
|
|
125
|
+
|
|
126
|
+
### hnswlib (recommended)
|
|
127
|
+
```bash
|
|
128
|
+
pip install "simmetry[ann-hnsw]"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
import numpy as np
|
|
133
|
+
from simmetry.ann import build_hnsw
|
|
134
|
+
|
|
135
|
+
X = np.random.randn(200_000, 128).astype("float32")
|
|
136
|
+
X /= np.linalg.norm(X, axis=1, keepdims=True)
|
|
137
|
+
|
|
138
|
+
index = build_hnsw(X, space="cosine")
|
|
139
|
+
labels, distances = index.query(X[0], k=10)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### faiss
|
|
143
|
+
```bash
|
|
144
|
+
pip install "simmetry[ann-faiss]"
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
import numpy as np
|
|
149
|
+
from simmetry.ann import build_faiss
|
|
150
|
+
|
|
151
|
+
X = np.random.randn(200_000, 128).astype("float32")
|
|
152
|
+
X /= np.linalg.norm(X, axis=1, keepdims=True)
|
|
153
|
+
|
|
154
|
+
index = build_faiss(X, metric="ip")
|
|
155
|
+
labels, scores = index.query(X[0], k=10)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
## SimIndex (exact or ANN)
|
|
160
|
+
|
|
161
|
+
Exact search (no extras):
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
import numpy as np
|
|
165
|
+
from simmetry import SimIndex
|
|
166
|
+
|
|
167
|
+
X = np.random.randn(50_000, 128).astype("float32")
|
|
168
|
+
index = SimIndex(metric="cosine", backend="exact").add(X)
|
|
169
|
+
|
|
170
|
+
idx, scores = index.query(X[0], k=10)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
ANN (optional):
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
pip install "simmetry[ann-hnsw]"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
import numpy as np
|
|
181
|
+
from simmetry import SimIndex
|
|
182
|
+
|
|
183
|
+
X = np.random.randn(200_000, 128).astype("float32")
|
|
184
|
+
X /= np.linalg.norm(X, axis=1, keepdims=True)
|
|
185
|
+
|
|
186
|
+
index = SimIndex(metric="cosine", backend="hnsw").add(X)
|
|
187
|
+
labels, distances = index.query(X[0], k=10)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Auto similarity and composite records
|
|
191
|
+
|
|
192
|
+
Auto metric selection:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
from simmetry import similarity
|
|
196
|
+
|
|
197
|
+
similarity("samplecorp", "sample corp")
|
|
198
|
+
similarity((41.0, 29.0), (41.1, 29.1))
|
|
199
|
+
similarity({1,2,3}, {2,3,4})
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Composite similarity over dict fields:
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
a = {"name": "Entity One", "city": "CityAlpha", "loc": (41.0, 29.0)}
|
|
206
|
+
b = {"name": "Entity One Extended", "city": "CityAlpha", "loc": (41.01, 28.99)}
|
|
207
|
+
|
|
208
|
+
score = similarity(
|
|
209
|
+
a, b,
|
|
210
|
+
metric={"name": "jaro_winkler", "loc": "haversine_km"},
|
|
211
|
+
weights={"name": 0.7, "loc": 0.3},
|
|
212
|
+
)
|
|
213
|
+
```
|
simmetry-1.0.1/README.md
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# simmetry
|
|
2
|
+
|
|
3
|
+
Blazing-fast similarity scores for **strings**, **vectors**, **points**, and **sets** — with a simple API.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install simmetry
|
|
9
|
+
pip install "simmetry[fast]"
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
`simmetry[fast]` enables optional Numba acceleration for `pairwise(..., metric="euclidean_sim")` and `pairwise(..., metric="manhattan_sim")`.
|
|
13
|
+
|
|
14
|
+
## Quickstart
|
|
15
|
+
|
|
16
|
+
### One function
|
|
17
|
+
```python
|
|
18
|
+
from simmetry import similarity
|
|
19
|
+
|
|
20
|
+
similarity("kitten", "sitting", metric="levenshtein")
|
|
21
|
+
similarity([1,2,3], [1,2,4], metric="cosine")
|
|
22
|
+
similarity((41.1, 29.0), (41.2, 29.1), metric="haversine_km")
|
|
23
|
+
similarity({1,2,3}, {2,3,4}, metric="jaccard")
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Pairwise matrices (fast for vectors)
|
|
27
|
+
```python
|
|
28
|
+
import numpy as np
|
|
29
|
+
from simmetry import pairwise
|
|
30
|
+
|
|
31
|
+
X = np.random.randn(1000, 128)
|
|
32
|
+
S = pairwise(X, metric="cosine")
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Top-k search
|
|
36
|
+
```python
|
|
37
|
+
import numpy as np
|
|
38
|
+
from simmetry import topk
|
|
39
|
+
|
|
40
|
+
X = np.random.randn(5000, 64)
|
|
41
|
+
q = np.random.randn(64)
|
|
42
|
+
idx, scores = topk(q, X, k=10, metric="cosine")
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Available metrics
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from simmetry import available
|
|
49
|
+
available()
|
|
50
|
+
available("vector")
|
|
51
|
+
available("string")
|
|
52
|
+
available("point")
|
|
53
|
+
available("set")
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Vectors
|
|
57
|
+
- `cosine`, `dot`, `euclidean_sim`, `manhattan_sim`, `pearson`
|
|
58
|
+
|
|
59
|
+
### Strings
|
|
60
|
+
- `levenshtein` (normalized similarity)
|
|
61
|
+
- `jaro_winkler`
|
|
62
|
+
- `ngram_jaccard` (character n-gram set Jaccard)
|
|
63
|
+
- `token_jaccard` (whitespace token set Jaccard)
|
|
64
|
+
|
|
65
|
+
### Points / Geo
|
|
66
|
+
- `euclidean_2d`
|
|
67
|
+
- `haversine_km`
|
|
68
|
+
|
|
69
|
+
### Sets
|
|
70
|
+
- `jaccard`, `dice`, `overlap`
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
MIT
|
|
74
|
+
## Batch string APIs
|
|
75
|
+
|
|
76
|
+
If you need many string-to-string similarities (e.g., deduping names), use:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from simmetry.strings import pairwise_strings, topk_strings
|
|
80
|
+
|
|
81
|
+
S = pairwise_strings(["item_one", "item_two"], ["item_one", "item_alt"], metric="jaro_winkler")
|
|
82
|
+
idx, scores = topk_strings("samplecorp", ["samplecorp", "examplefinance", "testgroup"], k=2, metric="levenshtein")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## ANN top-k (optional, does NOT bloat core)
|
|
86
|
+
|
|
87
|
+
For very large vector corpora (100k+), exact `topk()` can be slow. ANN gives fast approximate results.
|
|
88
|
+
|
|
89
|
+
### hnswlib (recommended)
|
|
90
|
+
```bash
|
|
91
|
+
pip install "simmetry[ann-hnsw]"
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
import numpy as np
|
|
96
|
+
from simmetry.ann import build_hnsw
|
|
97
|
+
|
|
98
|
+
X = np.random.randn(200_000, 128).astype("float32")
|
|
99
|
+
X /= np.linalg.norm(X, axis=1, keepdims=True)
|
|
100
|
+
|
|
101
|
+
index = build_hnsw(X, space="cosine")
|
|
102
|
+
labels, distances = index.query(X[0], k=10)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### faiss
|
|
106
|
+
```bash
|
|
107
|
+
pip install "simmetry[ann-faiss]"
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
import numpy as np
|
|
112
|
+
from simmetry.ann import build_faiss
|
|
113
|
+
|
|
114
|
+
X = np.random.randn(200_000, 128).astype("float32")
|
|
115
|
+
X /= np.linalg.norm(X, axis=1, keepdims=True)
|
|
116
|
+
|
|
117
|
+
index = build_faiss(X, metric="ip")
|
|
118
|
+
labels, scores = index.query(X[0], k=10)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
## SimIndex (exact or ANN)
|
|
123
|
+
|
|
124
|
+
Exact search (no extras):
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import numpy as np
|
|
128
|
+
from simmetry import SimIndex
|
|
129
|
+
|
|
130
|
+
X = np.random.randn(50_000, 128).astype("float32")
|
|
131
|
+
index = SimIndex(metric="cosine", backend="exact").add(X)
|
|
132
|
+
|
|
133
|
+
idx, scores = index.query(X[0], k=10)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
ANN (optional):
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
pip install "simmetry[ann-hnsw]"
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
import numpy as np
|
|
144
|
+
from simmetry import SimIndex
|
|
145
|
+
|
|
146
|
+
X = np.random.randn(200_000, 128).astype("float32")
|
|
147
|
+
X /= np.linalg.norm(X, axis=1, keepdims=True)
|
|
148
|
+
|
|
149
|
+
index = SimIndex(metric="cosine", backend="hnsw").add(X)
|
|
150
|
+
labels, distances = index.query(X[0], k=10)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Auto similarity and composite records
|
|
154
|
+
|
|
155
|
+
Auto metric selection:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from simmetry import similarity
|
|
159
|
+
|
|
160
|
+
similarity("samplecorp", "sample corp")
|
|
161
|
+
similarity((41.0, 29.0), (41.1, 29.1))
|
|
162
|
+
similarity({1,2,3}, {2,3,4})
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Composite similarity over dict fields:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
a = {"name": "Entity One", "city": "CityAlpha", "loc": (41.0, 29.0)}
|
|
169
|
+
b = {"name": "Entity One Extended", "city": "CityAlpha", "loc": (41.01, 28.99)}
|
|
170
|
+
|
|
171
|
+
score = similarity(
|
|
172
|
+
a, b,
|
|
173
|
+
metric={"name": "jaro_winkler", "loc": "haversine_km"},
|
|
174
|
+
weights={"name": 0.7, "loc": 0.3},
|
|
175
|
+
)
|
|
176
|
+
```
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from simmetry import pairwise, topk
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """Run a small timing benchmark for ``pairwise`` and ``topk`` and print the results.

    Timings use ``time.perf_counter()``, a monotonic high-resolution clock
    intended for measuring short durations; ``time.time()`` can jump when the
    system clock is adjusted.
    """
    X = np.random.randn(5000, 128)

    # Pairwise cosine over the first 1000 rows (1000 x 1000 result).
    t0 = time.perf_counter()
    S = pairwise(X[:1000], X[:1000], metric="cosine")
    t1 = time.perf_counter()
    print("pairwise cosine (1000x1000) seconds:", round(t1 - t0, 4), "shape:", S.shape)

    # Top-10 cosine neighbours of one random query against all 5000 rows.
    q = np.random.randn(128)
    t0 = time.perf_counter()
    idx, scores = topk(q, X, k=10, metric="cosine")
    t1 = time.perf_counter()
    print("topk cosine seconds:", round(t1 - t0, 6), "idx:", idx[:3], "scores:", scores[:3])


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.24"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "simmetry"
|
|
7
|
+
version = "1.0.1"
|
|
8
|
+
description = "Blazing-fast similarity scores for strings, vectors, points, and sets."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Ali Can Gumusrende", email = "algumusrende@gmail.com" }]
|
|
13
|
+
keywords = ["similarity", "distance", "cosine", "levenshtein", "jaccard", "haversine"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
"Topic :: Software Development :: Libraries",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"numpy>=1.23",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
fast = [
|
|
34
|
+
"numba>=0.58",
|
|
35
|
+
]
|
|
36
|
+
dev = [
|
|
37
|
+
"pytest>=8.0",
|
|
38
|
+
"ruff>=0.4",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
ann-hnsw = [
|
|
42
|
+
"hnswlib>=0.8.0",
|
|
43
|
+
]
|
|
44
|
+
ann-faiss = [
|
|
45
|
+
"faiss-cpu>=1.7.4",
|
|
46
|
+
]
|
|
47
|
+
ann = [
|
|
48
|
+
"hnswlib>=0.8.0",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.urls]
|
|
52
|
+
Homepage = "https://pypi.org/project/simmetry/"
|
|
53
|
+
Repository = "https://github.com/algumusrende/simmetry"
|
|
54
|
+
|
|
55
|
+
[tool.hatch.build.targets.wheel]
|
|
56
|
+
packages = ["simmetry"]
|
|
57
|
+
|
|
58
|
+
[tool.ruff]
|
|
59
|
+
line-length = 100
|
|
60
|
+
target-version = "py310"
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint]
|
|
63
|
+
select = ["E", "F", "I", "B", "UP"]
|
|
64
|
+
ignore = ["E501"]
|
|
65
|
+
|
|
66
|
+
[tool.pytest.ini_options]
|
|
67
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .api import pairwise, similarity, topk
|
|
4
|
+
from .index import SimIndex
|
|
5
|
+
from .points.core import euclidean_2d, haversine_km
|
|
6
|
+
from .registry import available, get, register
|
|
7
|
+
from .sets.core import dice, jaccard, overlap
|
|
8
|
+
from .strings.jaro import jaro_winkler
|
|
9
|
+
from .strings.levenshtein import levenshtein
|
|
10
|
+
from .strings.ngrams import ngram_jaccard, token_jaccard
|
|
11
|
+
from .vectors.core import cosine, dot, euclidean_sim, manhattan_sim, pearson
|
|
12
|
+
|
|
13
|
+
# Register the built-in metrics, grouped by the kind of input they compare.
# Registration order matches the explicit per-metric calls this loop replaces.
for _kind, _metrics in (
    (
        "vector",
        (
            ("cosine", cosine),
            ("dot", dot),
            ("euclidean_sim", euclidean_sim),
            ("manhattan_sim", manhattan_sim),
            ("pearson", pearson),
        ),
    ),
    (
        "string",
        (
            ("levenshtein", levenshtein),
            ("jaro_winkler", jaro_winkler),
            ("ngram_jaccard", ngram_jaccard),
            ("token_jaccard", token_jaccard),
        ),
    ),
    (
        "point",
        (
            ("euclidean_2d", euclidean_2d),
            ("haversine_km", haversine_km),
        ),
    ),
    (
        "set",
        (
            ("jaccard", jaccard),
            ("dice", dice),
            ("overlap", overlap),
        ),
    ),
):
    for _name, _func in _metrics:
        register(_name, _func, kind=_kind)

# Drop the loop temporaries so the module namespace is unchanged.
del _kind, _metrics, _name, _func

# Names exported by ``from simmetry import *``.
__all__ = [
    "similarity",
    "SimIndex",
    "pairwise",
    "topk",
    "register",
    "get",
    "available",
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _require_faiss():
|
|
10
|
+
try:
|
|
11
|
+
import faiss
|
|
12
|
+
except Exception as e:
|
|
13
|
+
raise ImportError(
|
|
14
|
+
'faiss is not installed. Install with: pip install "simmetry[ann-faiss]"'
|
|
15
|
+
) from e
|
|
16
|
+
return faiss
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class FaissIndex:
    """Wrapper around a built faiss index that answers top-k queries.

    Instances are normally created by ``build_faiss``.
    """

    # Width every query vector must have.
    dim: int
    # Metric the index was built with: "l2" or "ip".
    metric: Literal["l2", "ip"]
    # Underlying index; must expose ``search(q, k)`` returning ``(distances, labels)``.
    index: object
    # Number of vectors recorded as stored in the index.
    n_items: int

    def query(self, q, k: int = 10) -> tuple[np.ndarray, np.ndarray]:
        """Return the ``k`` nearest neighbours of ``q`` as ``(labels, distances)``.

        Args:
            q: Query vector of shape ``(dim,)`` or a batch of shape ``(m, dim)``.
                Only the results for the first row are returned.
            k: Number of neighbours to request from the index.

        Returns:
            ``(labels, distances)`` for the first query row, exactly as produced
            by ``index.search``.

        Raises:
            ValueError: If ``q`` is not 1-D/2-D or its width differs from ``dim``.
        """
        q = np.asarray(q, dtype=np.float32)
        given_shape = q.shape
        if q.ndim == 1:
            q = q.reshape(1, -1)
        # Fail early with a readable message instead of an error from inside
        # the underlying index.
        if q.ndim != 2 or q.shape[1] != self.dim:
            raise ValueError(
                f"q must have shape ({self.dim},) or (m, {self.dim}); got {given_shape}."
            )
        # faiss's Python bindings expect C-contiguous float32 input; a strided
        # or Fortran-ordered float32 array would otherwise pass through as-is.
        q = np.ascontiguousarray(q)
        distances, labels = self.index.search(q, k)
        return labels[0], distances[0]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def build_faiss(X, metric: Literal["l2", "ip"] = "ip") -> FaissIndex:
    """Build a flat faiss index over the rows of ``X``.

    Args:
        X: Array-like of shape ``(n, dim)``; converted to float32.
        metric: ``"l2"`` builds a ``faiss.IndexFlatL2``; ``"ip"`` builds a
            ``faiss.IndexFlatIP``.

    Returns:
        A ``FaissIndex`` wrapping the populated index.

    Raises:
        ImportError: If ``faiss`` cannot be imported.
        ValueError: If ``X`` is not 2-D or ``metric`` is not ``"l2"``/``"ip"``.
    """
    faiss = _require_faiss()
    # ascontiguousarray rather than asarray: an input that is already float32
    # but strided or Fortran-ordered would otherwise be handed to faiss
    # unchanged, and faiss's Python bindings expect C-contiguous data.
    X = np.ascontiguousarray(X, dtype=np.float32)
    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n, dim).")
    n, dim = X.shape

    if metric == "l2":
        index = faiss.IndexFlatL2(dim)
    elif metric == "ip":
        index = faiss.IndexFlatIP(dim)
    else:
        raise ValueError("metric must be 'l2' or 'ip'.")

    index.add(X)
    return FaissIndex(dim=dim, metric=metric, index=index, n_items=n)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _require_hnswlib():
|
|
10
|
+
try:
|
|
11
|
+
import hnswlib
|
|
12
|
+
except Exception as e:
|
|
13
|
+
raise ImportError(
|
|
14
|
+
'hnswlib is not installed. Install with: pip install "simmetry[ann-hnsw]"'
|
|
15
|
+
) from e
|
|
16
|
+
return hnswlib
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class HNSWIndex:
    """Wrapper around a built hnswlib index that answers top-k queries.

    Instances are normally created by ``build_hnsw``.
    """

    # Width every query vector must have.
    dim: int
    # Space the index was built with: "cosine", "l2" or "ip".
    space: Literal["cosine", "l2", "ip"]
    # Underlying index; must expose ``knn_query(q, k=...)`` returning ``(labels, distances)``.
    index: object
    # Number of vectors recorded as stored in the index.
    n_items: int

    def query(self, q, k: int = 10) -> tuple[np.ndarray, np.ndarray]:
        """Return the nearest neighbours of ``q`` as ``(labels, distances)``.

        Args:
            q: Query vector of shape ``(dim,)`` or a batch of shape ``(m, dim)``.
                Only the results for the first row are returned.
            k: Number of neighbours to request. Capped at ``n_items`` when the
                index holds fewer than ``k`` items.

        Returns:
            ``(labels, distances)`` for the first query row, exactly as produced
            by ``index.knn_query``.

        Raises:
            ValueError: If ``q`` is not 1-D/2-D or its width differs from ``dim``.
        """
        q = np.asarray(q, dtype=np.float32)
        given_shape = q.shape
        if q.ndim == 1:
            q = q.reshape(1, -1)
        # Fail early with a readable message instead of an error from inside
        # the underlying index.
        if q.ndim != 2 or q.shape[1] != self.dim:
            raise ValueError(
                f"q must have shape ({self.dim},) or (m, {self.dim}); got {given_shape}."
            )
        # hnswlib's knn_query errors out when asked for more neighbours than
        # the index holds, so cap k at the recorded item count. An empty index
        # (n_items == 0) is left to the library's own handling.
        if 0 < self.n_items < k:
            k = self.n_items
        labels, distances = self.index.knn_query(q, k=k)
        return labels[0], distances[0]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def build_hnsw(
    X,
    space: Literal["cosine", "l2", "ip"] = "cosine",
    ef_construction: int = 200,
    M: int = 16,
    ef: int = 50,
) -> HNSWIndex:
    """Build an hnswlib index over the rows of ``X``.

    Args:
        X: Array-like of shape ``(n, dim)``; converted to float32.
        space: Distance space passed to ``hnswlib.Index``.
        ef_construction: Passed to ``init_index`` (cast to ``int``).
        M: Passed to ``init_index`` (cast to ``int``).
        ef: Passed to ``set_ef`` after the items are added (cast to ``int``).

    Returns:
        An ``HNSWIndex`` wrapping the populated index; row ``i`` of ``X`` is
        stored under label ``i``.

    Raises:
        ImportError: If ``hnswlib`` cannot be imported.
        ValueError: If ``X`` is not 2-D or ``space`` is not one of
            ``"cosine"``, ``"l2"``, ``"ip"``.
    """
    hnswlib = _require_hnswlib()
    X = np.asarray(X, dtype=np.float32)
    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n, dim).")
    # Reject unknown spaces here with ValueError, mirroring how build_faiss
    # validates its ``metric`` argument, rather than relying on the library.
    if space not in ("cosine", "l2", "ip"):
        raise ValueError("space must be 'cosine', 'l2', or 'ip'.")

    n, dim = X.shape
    idx = hnswlib.Index(space=space, dim=dim)
    idx.init_index(max_elements=n, ef_construction=int(ef_construction), M=int(M))
    # Explicit sequential labels so results map straight back to rows of X.
    idx.add_items(X, np.arange(n, dtype=np.int32))
    idx.set_ef(int(ef))
    return HNSWIndex(dim=dim, space=space, index=idx, n_items=n)
|