deskit 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deskit-1.2.0/src/deskit.egg-info → deskit-1.2.1}/PKG-INFO +1 -1
- {deskit-1.2.0 → deskit-1.2.1}/pyproject.toml +1 -1
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/neighbors.py +207 -63
- {deskit-1.2.0 → deskit-1.2.1/src/deskit.egg-info}/PKG-INFO +1 -1
- {deskit-1.2.0 → deskit-1.2.1}/LICENSE +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/README.md +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/setup.cfg +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/__init__.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/_config.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/__init__.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/base.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/knnbase.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/predictbase.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/__init__.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsi.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsiv.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewst.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsu.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsv.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/knorae.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/knoraiu.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/knorau.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/lwsei.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/lwseu.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/ola.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/metrics.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/router.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit/utils.py +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/SOURCES.txt +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/dependency_links.txt +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/requires.txt +0 -0
- {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/top_level.txt +0 -0
|
@@ -13,18 +13,28 @@ _FAISS_MIN_SAMPLES_PER_CELL = 40
|
|
|
13
13
|
# 'euclidean' is the universal default and always available.
|
|
14
14
|
#
|
|
15
15
|
# Choosing a distance metric:
|
|
16
|
-
# euclidean
|
|
17
|
-
# manhattan
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
# chebyshev
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
# minkowski
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
16
|
+
# euclidean – The standard L2 norm. Best default for most tabular data.
|
|
17
|
+
# manhattan – L1 norm (sum of absolute differences). More robust to
|
|
18
|
+
# outliers and tends to work better in moderately high-
|
|
19
|
+
# dimensional spaces because it doesn't square large diffs.
|
|
20
|
+
# chebyshev – L∞ norm (maximum absolute difference across features).
|
|
21
|
+
# Useful when a single feature dominating the distance is
|
|
22
|
+
# acceptable; common in game-grid / chess-style problems.
|
|
23
|
+
# minkowski – Generalisation of L1/L2 (controlled by p). p=1 →
|
|
24
|
+
# manhattan, p=2 → euclidean. Use when you want to tune
|
|
25
|
+
# between them.
|
|
26
|
+
# cosine – Angle between vectors, ignoring magnitude. Excellent for
|
|
27
|
+
# embeddings (text, image, audio) where direction matters
|
|
28
|
+
# more than raw scale.
|
|
29
|
+
# canberra – Weighted L1. Sensitive to small values near zero.
|
|
30
|
+
# braycurtis – Normalised L1 bounded to [0,1]. Common in ecology.
|
|
31
|
+
# jensenshannon – Symmetric KL divergence on probability distributions.
|
|
32
|
+
# Requires non-negative vectors. Supported by FAISS flat/
|
|
33
|
+
# HNSW/GPU indices natively.
|
|
34
|
+
# dot – Raw inner/dot product. Not a true metric; distances are
|
|
35
|
+
# not comparable across queries. Use for max inner-product
|
|
36
|
+
# search (recommendation systems). Prefer 'cosine' for
|
|
37
|
+
# normalised embeddings.
|
|
28
38
|
|
|
29
39
|
# Metrics that every backend supports natively.
|
|
30
40
|
_UNIVERSAL_METRICS = {'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'cosine'}
|
|
@@ -33,31 +43,59 @@ _UNIVERSAL_METRICS = {'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'cosin
|
|
|
33
43
|
# KNN (sklearn) supports all scipy metrics — this is the curated subset exposed for the KNN backend.
|
|
34
44
|
_KNN_METRICS = _UNIVERSAL_METRICS | {'correlation', 'hamming', 'canberra', 'braycurtis'}
|
|
35
45
|
|
|
36
|
-
# FAISS
|
|
37
|
-
#
|
|
38
|
-
|
|
39
|
-
|
|
46
|
+
# FAISS native metric support:
|
|
47
|
+
# IndexFlat, IndexHNSW, and GpuIndexFlat support METRIC_L1, METRIC_Linf,
|
|
48
|
+
# METRIC_Lp (with metric_arg for p), METRIC_Canberra, METRIC_BrayCurtis,
|
|
49
|
+
# and METRIC_JensenShannon in addition to L2 and inner product.
|
|
50
|
+
# IndexIVFFlat only supports L2 and inner product.
|
|
51
|
+
# 'ivf' index_type will still fall back for non-L2/cosine metrics.
|
|
52
|
+
_FAISS_FLAT_HNSW_NATIVE_METRICS = {
|
|
53
|
+
'euclidean', 'cosine', 'manhattan', 'chebyshev', 'minkowski',
|
|
54
|
+
'canberra', 'braycurtis', 'jensenshannon',
|
|
55
|
+
}
|
|
56
|
+
_FAISS_IVF_NATIVE_METRICS = {'euclidean', 'cosine'}
|
|
57
|
+
|
|
58
|
+
# For backwards compatibility: the overall set accepted by FaissNeighborFinder.
|
|
59
|
+
_FAISS_METRICS = _FAISS_FLAT_HNSW_NATIVE_METRICS | {'correlation', 'hamming'}
|
|
40
60
|
|
|
41
61
|
# Annoy metric names (library-specific).
|
|
62
|
+
# Annoy natively supports: euclidean, manhattan, cosine (angular), hamming,
|
|
63
|
+
# and dot (inner product). chebyshev and minkowski have no Annoy equivalent.
|
|
42
64
|
_ANNOY_METRIC_MAP = {
|
|
43
65
|
'euclidean': 'euclidean',
|
|
44
66
|
'manhattan': 'manhattan',
|
|
45
67
|
'cosine': 'angular',
|
|
46
68
|
'hamming': 'hamming',
|
|
47
|
-
|
|
69
|
+
'dot': 'dot',
|
|
48
70
|
}
|
|
49
71
|
_ANNOY_METRICS = set(_ANNOY_METRIC_MAP)
|
|
50
72
|
|
|
51
|
-
#
|
|
52
|
-
|
|
73
|
+
# hnswlib space names — only three native spaces exist.
|
|
74
|
+
# 'ip' is inner product (not a true metric; used for max inner-product search).
|
|
75
|
+
_HNSWLIB_METRIC_MAP = {
|
|
53
76
|
'euclidean': 'l2',
|
|
54
77
|
'cosine': 'cosine',
|
|
55
|
-
|
|
78
|
+
'dot': 'ip',
|
|
56
79
|
}
|
|
80
|
+
|
|
81
|
+
# nmslib space names for DENSE_VECTOR + HNSW.
|
|
82
|
+
# l1/linf/angulardist are confirmed supported by nmslib's integration tests.
|
|
83
|
+
# 'dot' maps to negdotprod (nmslib maximises inner product via negative distance).
|
|
84
|
+
_NMSLIB_METRIC_MAP = {
|
|
85
|
+
'euclidean': 'l2',
|
|
86
|
+
'cosine': 'cosinesimil',
|
|
87
|
+
'manhattan': 'l1',
|
|
88
|
+
'chebyshev': 'linf',
|
|
89
|
+
'dot': 'negdotprod',
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
# Legacy alias: points at the hnswlib map only (nmslib-only spaces are not included).
|
|
93
|
+
# We keep the old name for backwards compatibility.
|
|
94
|
+
_HNSW_METRIC_MAP = _HNSWLIB_METRIC_MAP # kept for any external references
|
|
57
95
|
_HNSW_METRICS = _UNIVERSAL_METRICS # partial — see fit() for fallback note
|
|
58
96
|
|
|
59
97
|
# All metrics callable from the public API.
|
|
60
|
-
ALL_METRICS = _KNN_METRICS
|
|
98
|
+
ALL_METRICS = _KNN_METRICS | {'jensenshannon', 'dot'}
|
|
61
99
|
|
|
62
100
|
|
|
63
101
|
def list_distance_metrics():
|
|
@@ -65,19 +103,21 @@ def list_distance_metrics():
|
|
|
65
103
|
print("\nAvailable Distance Metrics:")
|
|
66
104
|
print("=" * 70)
|
|
67
105
|
rows = [
|
|
68
|
-
("euclidean",
|
|
69
|
-
("manhattan",
|
|
70
|
-
("chebyshev",
|
|
71
|
-
("minkowski",
|
|
72
|
-
("cosine",
|
|
73
|
-
("
|
|
74
|
-
("
|
|
75
|
-
("
|
|
76
|
-
("
|
|
106
|
+
("euclidean", "Default. L2 norm. Best for most tabular data.", "all"),
|
|
107
|
+
("manhattan", "L1 norm. More robust to outliers; good for high-dim data.", "KNN, FAISS (flat/hnsw), Annoy, HNSW-nmslib"),
|
|
108
|
+
("chebyshev", "L∞ norm. Max absolute diff across features.", "KNN, FAISS (flat/hnsw), HNSW-nmslib"),
|
|
109
|
+
("minkowski", "Generalises L1/L2 via p-param. Set minkowski_p=<float>.", "KNN, FAISS (flat/hnsw)"),
|
|
110
|
+
("cosine", "Angle between vectors. Ideal for embeddings (NLP, vision).", "all"),
|
|
111
|
+
("dot", "Inner/dot product. Not a metric; used for max-IP search.", "Annoy, HNSW (hnswlib ip / nmslib negdotprod)"),
|
|
112
|
+
("canberra", "Weighted L1. Sensitive to small values near zero.", "KNN, FAISS (flat/hnsw/gpu)"),
|
|
113
|
+
("braycurtis", "Normalised L1 bounded to [0,1]. Ecological data.", "KNN, FAISS (flat/hnsw/gpu)"),
|
|
114
|
+
("jensenshannon", "Symmetric KL divergence. Requires non-negative vectors.", "FAISS (flat/hnsw/gpu)"),
|
|
115
|
+
("correlation", "Pearson correlation distance. Good for time series.", "KNN only"),
|
|
116
|
+
("hamming", "Fraction of differing components. For binary/categorical data.", "KNN, Annoy"),
|
|
77
117
|
]
|
|
78
118
|
for name, desc, backends in rows:
|
|
79
|
-
print(f"\n {name:<
|
|
80
|
-
print(f" {'':
|
|
119
|
+
print(f"\n {name:<16} {desc}")
|
|
120
|
+
print(f" {'':16} Backends: {backends}")
|
|
81
121
|
print("\n" + "=" * 70)
|
|
82
122
|
|
|
83
123
|
|
|
@@ -160,21 +200,36 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
160
200
|
"""
|
|
161
201
|
Approximate nearest neighbors via FAISS (flat, IVF, or HNSW index).
|
|
162
202
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
203
|
+
Native metric support depends on index_type:
|
|
204
|
+
|
|
205
|
+
flat / hnsw / gpu-flat
|
|
206
|
+
FAISS IndexFlat, IndexHNSW, and GpuIndexFlat natively support:
|
|
207
|
+
euclidean, cosine, manhattan (L1), chebyshev (Linf), minkowski (Lp),
|
|
208
|
+
canberra, braycurtis, jensenshannon.
|
|
209
|
+
|
|
210
|
+
ivf
|
|
211
|
+
IndexIVFFlat only supports L2 and inner-product (cosine). All other
|
|
212
|
+
metrics fall back to an exact sklearn KNN with a warning.
|
|
213
|
+
|
|
214
|
+
correlation and hamming always fall back to sklearn for all index types.
|
|
166
215
|
"""
|
|
167
216
|
|
|
168
217
|
def __init__(self, k=10, index_type='flat', n_cells=None, n_probes=50,
|
|
169
218
|
hnsw_M=32, hnsw_efConstruction=400, hnsw_efSearch=200,
|
|
170
|
-
distance_metric='euclidean'):
|
|
219
|
+
distance_metric='euclidean', minkowski_p=2):
|
|
171
220
|
"""
|
|
172
221
|
Parameters
|
|
173
222
|
----------
|
|
174
223
|
distance_metric : str
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
224
|
+
Metric to use. flat/hnsw/gpu index types natively support:
|
|
225
|
+
'euclidean', 'cosine', 'manhattan', 'chebyshev', 'minkowski',
|
|
226
|
+
'canberra', 'braycurtis', 'jensenshannon'.
|
|
227
|
+
'ivf' only natively supports 'euclidean' and 'cosine'; all others
|
|
228
|
+
fall back to exact sklearn KNN with a warning.
|
|
229
|
+
'correlation' and 'hamming' always fall back to sklearn.
|
|
230
|
+
minkowski_p : float
|
|
231
|
+
The p-parameter for the Minkowski metric. p=1 → manhattan,
|
|
232
|
+
p=2 → euclidean. Ignored for all other metrics.
|
|
178
233
|
"""
|
|
179
234
|
if k <= 0:
|
|
180
235
|
raise ValueError(f"k must be positive, got k={k}")
|
|
@@ -193,6 +248,7 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
193
248
|
self.hnsw_efConstruction = hnsw_efConstruction
|
|
194
249
|
self.hnsw_efSearch = hnsw_efSearch
|
|
195
250
|
self.distance_metric = metric
|
|
251
|
+
self.minkowski_p = minkowski_p
|
|
196
252
|
self.index_ = None
|
|
197
253
|
self._fallback_finder = None # used for non-native metrics
|
|
198
254
|
self._check_availability()
|
|
@@ -204,6 +260,31 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
204
260
|
except ImportError:
|
|
205
261
|
raise ImportError("FAISS not found. Install with: pip install faiss-cpu")
|
|
206
262
|
|
|
263
|
+
@staticmethod
|
|
264
|
+
def _faiss_metric_type(faiss, metric, minkowski_p=2):
|
|
265
|
+
"""
|
|
266
|
+
Return (faiss_metric_constant, metric_arg) for a given metric name.
|
|
267
|
+
metric_arg is only meaningful for METRIC_Lp (minkowski).
|
|
268
|
+
Raises KeyError for metrics that have no FAISS MetricType constant
|
|
269
|
+
(i.e. those that must be handled via fallback).
|
|
270
|
+
"""
|
|
271
|
+
_MAP = {
|
|
272
|
+
'euclidean': (faiss.METRIC_L2, None),
|
|
273
|
+
'cosine': (faiss.METRIC_INNER_PRODUCT, None),
|
|
274
|
+
'manhattan': (faiss.METRIC_L1, None),
|
|
275
|
+
'chebyshev': (faiss.METRIC_Linf, None),
|
|
276
|
+
'minkowski': (faiss.METRIC_Lp, None), # metric_arg set below
|
|
277
|
+
'canberra': (faiss.METRIC_Canberra, None),
|
|
278
|
+
'braycurtis': (faiss.METRIC_BrayCurtis, None),
|
|
279
|
+
'jensenshannon':(faiss.METRIC_JensenShannon, None),
|
|
280
|
+
}
|
|
281
|
+
if metric not in _MAP:
|
|
282
|
+
raise KeyError(metric)
|
|
283
|
+
ft, arg = _MAP[metric]
|
|
284
|
+
if metric == 'minkowski':
|
|
285
|
+
arg = float(minkowski_p)
|
|
286
|
+
return ft, arg
|
|
287
|
+
|
|
207
288
|
@staticmethod
|
|
208
289
|
def _l2_normalize(X):
|
|
209
290
|
"""Row-wise L2 normalisation in-place (for cosine similarity)."""
|
|
@@ -221,11 +302,18 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
221
302
|
f"{n_samples} samples. Reduce k to at most {n_samples}."
|
|
222
303
|
)
|
|
223
304
|
|
|
224
|
-
#
|
|
225
|
-
|
|
305
|
+
# Determine whether the chosen metric is natively supported by this index type.
|
|
306
|
+
ivf_native = self.distance_metric in _FAISS_IVF_NATIVE_METRICS
|
|
307
|
+
flat_hnsw_native = self.distance_metric in _FAISS_FLAT_HNSW_NATIVE_METRICS
|
|
308
|
+
is_ivf = (self.index_type == 'ivf')
|
|
309
|
+
|
|
310
|
+
needs_fallback = is_ivf and not ivf_native
|
|
311
|
+
needs_fallback = needs_fallback or (not is_ivf and not flat_hnsw_native)
|
|
312
|
+
|
|
313
|
+
if needs_fallback:
|
|
226
314
|
warnings.warn(
|
|
227
315
|
f"distance_metric='{self.distance_metric}' is not natively supported by "
|
|
228
|
-
f"FAISS. Falling back to exact sklearn KNN
|
|
316
|
+
f"FAISS {self.index_type} index. Falling back to exact sklearn KNN. "
|
|
229
317
|
f"Use preset='exact' to avoid this overhead.",
|
|
230
318
|
UserWarning,
|
|
231
319
|
)
|
|
@@ -235,7 +323,7 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
235
323
|
self._fallback_finder.fit(X)
|
|
236
324
|
return self
|
|
237
325
|
|
|
238
|
-
# Cosine similarity: normalise all vectors
|
|
326
|
+
# Cosine similarity: normalise all vectors so inner-product == cosine.
|
|
239
327
|
if self.distance_metric == 'cosine':
|
|
240
328
|
X = self._l2_normalize(X)
|
|
241
329
|
|
|
@@ -247,12 +335,20 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
247
335
|
UserWarning
|
|
248
336
|
)
|
|
249
337
|
if self.distance_metric == 'cosine':
|
|
250
|
-
self.index_ = self.faiss.IndexFlatIP(dim)
|
|
251
|
-
|
|
338
|
+
self.index_ = self.faiss.IndexFlatIP(dim)
|
|
339
|
+
elif self.distance_metric == 'euclidean':
|
|
252
340
|
self.index_ = self.faiss.IndexFlatL2(dim)
|
|
341
|
+
else:
|
|
342
|
+
ft, metric_arg = self._faiss_metric_type(
|
|
343
|
+
self.faiss, self.distance_metric, self.minkowski_p
|
|
344
|
+
)
|
|
345
|
+
self.index_ = self.faiss.IndexFlat(dim, ft)
|
|
346
|
+
if metric_arg is not None:
|
|
347
|
+
self.index_.metric_arg = metric_arg
|
|
253
348
|
self.index_.add(X)
|
|
254
349
|
|
|
255
350
|
elif self.index_type == 'ivf':
|
|
351
|
+
# IVF only supports L2 / inner-product (guarded above).
|
|
256
352
|
if self.n_cells is None:
|
|
257
353
|
self.n_cells = min(int(np.sqrt(n_samples)), 4096)
|
|
258
354
|
|
|
@@ -300,7 +396,19 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
300
396
|
f"{n_samples} samples. Consider ef_construction >= 400.",
|
|
301
397
|
UserWarning
|
|
302
398
|
)
|
|
303
|
-
self.
|
|
399
|
+
if self.distance_metric == 'cosine':
|
|
400
|
+
self.index_ = self.faiss.IndexHNSWFlat(
|
|
401
|
+
dim, self.hnsw_M, self.faiss.METRIC_INNER_PRODUCT
|
|
402
|
+
)
|
|
403
|
+
elif self.distance_metric == 'euclidean':
|
|
404
|
+
self.index_ = self.faiss.IndexHNSWFlat(dim, self.hnsw_M)
|
|
405
|
+
else:
|
|
406
|
+
ft, metric_arg = self._faiss_metric_type(
|
|
407
|
+
self.faiss, self.distance_metric, self.minkowski_p
|
|
408
|
+
)
|
|
409
|
+
self.index_ = self.faiss.IndexHNSWFlat(dim, self.hnsw_M, ft)
|
|
410
|
+
if metric_arg is not None:
|
|
411
|
+
self.index_.metric_arg = metric_arg
|
|
304
412
|
self.index_.hnsw.efConstruction = self.hnsw_efConstruction
|
|
305
413
|
self.index_.hnsw.efSearch = self.hnsw_efSearch
|
|
306
414
|
self.index_.add(X)
|
|
@@ -328,10 +436,14 @@ class FaissNeighborFinder(NeighborFinder):
|
|
|
328
436
|
# Inner product on normalised vectors: similarity ∈ [-1, 1].
|
|
329
437
|
# Convert to a proper distance (0 = identical, 2 = opposite).
|
|
330
438
|
distances = 1.0 - scores
|
|
331
|
-
|
|
439
|
+
elif self.distance_metric == 'euclidean':
|
|
332
440
|
distances, indices = self.index_.search(X, k)
|
|
333
441
|
# FAISS returns squared L2; clamp to 0 before sqrt.
|
|
334
442
|
distances = np.sqrt(np.maximum(distances, 0))
|
|
443
|
+
else:
|
|
444
|
+
# All other native metrics (manhattan, chebyshev, minkowski, canberra,
|
|
445
|
+
# braycurtis, jensenshannon) are returned as proper distances already.
|
|
446
|
+
distances, indices = self.index_.search(X, k)
|
|
335
447
|
|
|
336
448
|
return distances.astype(np.float32), indices
|
|
337
449
|
|
|
@@ -340,8 +452,13 @@ class AnnoyNeighborFinder(NeighborFinder):
|
|
|
340
452
|
"""
|
|
341
453
|
Approximate nearest neighbors via Annoy.
|
|
342
454
|
|
|
343
|
-
Supports: euclidean, manhattan, cosine, hamming
|
|
455
|
+
Supports: euclidean, manhattan, cosine (stored as 'angular'), hamming,
|
|
456
|
+
and dot (inner product, stored as 'dot').
|
|
344
457
|
chebyshev and minkowski are not available in Annoy — use preset='exact' for those.
|
|
458
|
+
|
|
459
|
+
Note on 'dot': Annoy's dot-product space is not a true metric. Distances
|
|
460
|
+
returned are reduced inner-product values, not raw dot products — see
|
|
461
|
+
Bachrach et al. (2014). Prefer 'cosine' for normalised embeddings.
|
|
345
462
|
"""
|
|
346
463
|
|
|
347
464
|
def __init__(self, k=10, n_trees=100, distance_metric='euclidean', search_k=-1):
|
|
@@ -349,7 +466,8 @@ class AnnoyNeighborFinder(NeighborFinder):
|
|
|
349
466
|
Parameters
|
|
350
467
|
----------
|
|
351
468
|
distance_metric : str
|
|
352
|
-
One of 'euclidean', 'manhattan', 'cosine', 'hamming'
|
|
469
|
+
One of 'euclidean', 'manhattan', 'cosine', 'hamming', 'dot'.
|
|
470
|
+
Default: 'euclidean'.
|
|
353
471
|
"""
|
|
354
472
|
if k <= 0:
|
|
355
473
|
raise ValueError(f"k must be positive, got k={k}")
|
|
@@ -440,26 +558,55 @@ class HNSWNeighborFinder(NeighborFinder):
|
|
|
440
558
|
"""
|
|
441
559
|
Approximate nearest neighbors via HNSW (hnswlib or nmslib backend).
|
|
442
560
|
|
|
443
|
-
|
|
444
|
-
|
|
561
|
+
Native metric support depends on the backend:
|
|
562
|
+
|
|
563
|
+
hnswlib
|
|
564
|
+
Supports 'euclidean' (l2), 'cosine', and 'dot' (ip / inner product).
|
|
565
|
+
All other metrics raise an error — use preset='exact' instead.
|
|
566
|
+
|
|
567
|
+
nmslib
|
|
568
|
+
Supports 'euclidean' (l2), 'cosine' (cosinesimil), 'manhattan' (l1),
|
|
569
|
+
'chebyshev' (linf), and 'dot' (negdotprod / max inner-product search).
|
|
570
|
+
All other metrics raise an error — use preset='exact' instead.
|
|
571
|
+
|
|
572
|
+
Note on 'dot': inner product is not a true distance metric. Results are
|
|
573
|
+
ranked by descending similarity, not ascending distance. Use 'cosine' for
|
|
574
|
+
normalised embeddings where you want a proper distance.
|
|
445
575
|
"""
|
|
446
576
|
|
|
577
|
+
# Per-backend accepted metrics (validated in __init__).
|
|
578
|
+
_HNSWLIB_METRICS = set(_HNSWLIB_METRIC_MAP) # euclidean, cosine, dot
|
|
579
|
+
_NMSLIB_METRICS = set(_NMSLIB_METRIC_MAP) # euclidean, cosine, manhattan, chebyshev, dot
|
|
580
|
+
|
|
447
581
|
def __init__(self, k=10, M=32, ef_construction=400,
|
|
448
582
|
ef_search=200, backend='hnswlib', distance_metric='euclidean'):
|
|
449
583
|
"""
|
|
450
584
|
Parameters
|
|
451
585
|
----------
|
|
452
586
|
distance_metric : str
|
|
453
|
-
'euclidean'
|
|
454
|
-
|
|
587
|
+
hnswlib: 'euclidean', 'cosine', or 'dot'.
|
|
588
|
+
nmslib: 'euclidean', 'cosine', 'manhattan', 'chebyshev', or 'dot'.
|
|
589
|
+
Default: 'euclidean'.
|
|
590
|
+
backend : str
|
|
591
|
+
'hnswlib' (default) or 'nmslib'.
|
|
455
592
|
"""
|
|
456
593
|
if k <= 0:
|
|
457
594
|
raise ValueError(f"k must be positive, got k={k}")
|
|
458
595
|
metric = distance_metric.lower()
|
|
459
|
-
|
|
596
|
+
backend_str = backend.lower()
|
|
597
|
+
|
|
598
|
+
if backend_str == 'hnswlib':
|
|
599
|
+
allowed = self._HNSWLIB_METRICS
|
|
600
|
+
elif backend_str == 'nmslib':
|
|
601
|
+
allowed = self._NMSLIB_METRICS
|
|
602
|
+
else:
|
|
603
|
+
raise ValueError(f"Unknown backend: '{backend}'. Choose 'hnswlib' or 'nmslib'.")
|
|
604
|
+
|
|
605
|
+
if metric not in allowed:
|
|
460
606
|
raise ValueError(
|
|
461
|
-
f"distance_metric='{distance_metric}' is not
|
|
462
|
-
f"HNSWNeighborFinder
|
|
607
|
+
f"distance_metric='{distance_metric}' is not supported by "
|
|
608
|
+
f"HNSWNeighborFinder (backend='{backend_str}'). "
|
|
609
|
+
f"Available: {sorted(allowed)}. "
|
|
463
610
|
f"For other metrics use preset='exact' (KNNNeighborFinder)."
|
|
464
611
|
)
|
|
465
612
|
self.n_neighbors = k
|
|
@@ -467,7 +614,7 @@ class HNSWNeighborFinder(NeighborFinder):
|
|
|
467
614
|
self.M = M
|
|
468
615
|
self.ef_construction = ef_construction
|
|
469
616
|
self.ef_search = ef_search
|
|
470
|
-
self.backend =
|
|
617
|
+
self.backend = backend_str
|
|
471
618
|
self.index_ = None
|
|
472
619
|
self._check_availability()
|
|
473
620
|
|
|
@@ -504,7 +651,7 @@ class HNSWNeighborFinder(NeighborFinder):
|
|
|
504
651
|
)
|
|
505
652
|
|
|
506
653
|
if self.backend == 'hnswlib':
|
|
507
|
-
space =
|
|
654
|
+
space = _HNSWLIB_METRIC_MAP[self.distance_metric]
|
|
508
655
|
self.index_ = self.hnswlib.Index(space=space, dim=dim)
|
|
509
656
|
self.index_.init_index(
|
|
510
657
|
max_elements=n_samples, M=self.M, ef_construction=self.ef_construction
|
|
@@ -513,13 +660,10 @@ class HNSWNeighborFinder(NeighborFinder):
|
|
|
513
660
|
self.index_.add_items(X, np.arange(n_samples))
|
|
514
661
|
|
|
515
662
|
else: # nmslib
|
|
516
|
-
|
|
517
|
-
'euclidean': 'l2',
|
|
518
|
-
'cosine': 'cosinesimil',
|
|
519
|
-
}
|
|
663
|
+
space = _NMSLIB_METRIC_MAP[self.distance_metric]
|
|
520
664
|
self.index_ = self.nmslib.init(
|
|
521
665
|
method='hnsw',
|
|
522
|
-
space=
|
|
666
|
+
space=space,
|
|
523
667
|
data_type=self.nmslib.DataType.DENSE_VECTOR
|
|
524
668
|
)
|
|
525
669
|
self.index_.addDataPointBatch(X)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|