deskit 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {deskit-1.2.0/src/deskit.egg-info → deskit-1.2.1}/PKG-INFO +1 -1
  2. {deskit-1.2.0 → deskit-1.2.1}/pyproject.toml +1 -1
  3. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/neighbors.py +207 -63
  4. {deskit-1.2.0 → deskit-1.2.1/src/deskit.egg-info}/PKG-INFO +1 -1
  5. {deskit-1.2.0 → deskit-1.2.1}/LICENSE +0 -0
  6. {deskit-1.2.0 → deskit-1.2.1}/README.md +0 -0
  7. {deskit-1.2.0 → deskit-1.2.1}/setup.cfg +0 -0
  8. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/__init__.py +0 -0
  9. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/_config.py +0 -0
  10. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/__init__.py +0 -0
  11. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/base.py +0 -0
  12. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/knnbase.py +0 -0
  13. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/base/predictbase.py +0 -0
  14. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/__init__.py +0 -0
  15. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsi.py +0 -0
  16. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsiv.py +0 -0
  17. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewst.py +0 -0
  18. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsu.py +0 -0
  19. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/dewsv.py +0 -0
  20. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/knorae.py +0 -0
  21. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/knoraiu.py +0 -0
  22. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/knorau.py +0 -0
  23. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/lwsei.py +0 -0
  24. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/lwseu.py +0 -0
  25. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/des/ola.py +0 -0
  26. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/metrics.py +0 -0
  27. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/router.py +0 -0
  28. {deskit-1.2.0 → deskit-1.2.1}/src/deskit/utils.py +0 -0
  29. {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/SOURCES.txt +0 -0
  30. {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/dependency_links.txt +0 -0
  31. {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/requires.txt +0 -0
  32. {deskit-1.2.0 → deskit-1.2.1}/src/deskit.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deskit
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: A Python library for Dynamic Ensemble Selection
5
5
  Author: Tikhon Vodyanov
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "deskit"
7
- version = "1.2.0"
7
+ version = "1.2.1"
8
8
  description = "A Python library for Dynamic Ensemble Selection"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -13,18 +13,28 @@ _FAISS_MIN_SAMPLES_PER_CELL = 40
13
13
  # 'euclidean' is the universal default and always available.
14
14
  #
15
15
  # Choosing a distance metric:
16
- # euclidean – The standard L2 norm. Best default for most tabular data.
17
- # manhattan – L1 norm (sum of absolute differences). More robust to outliers
18
- # and tends to work better in moderately high-dimensional spaces
19
- # because it doesn't square large differences.
20
- # chebyshev – L∞ norm (maximum absolute difference across features). Useful
21
- # when a single feature dominating the distance is acceptable;
22
- # common in game-grid / chess-style distance problems.
23
- # minkowski – Generalisation of L1/L2 (controlled by p). p=1 → manhattan,
24
- # p=2 → euclidean. Use when you want to tune between them.
25
- # cosine – Angle between vectors, ignoring magnitude. Excellent for
26
- # embeddings (text, image, audio) where direction matters more
27
- # than raw scale.
16
+ # euclidean – The standard L2 norm. Best default for most tabular data.
17
+ # manhattan – L1 norm (sum of absolute differences). More robust to
18
+ # outliers and tends to work better in moderately high-
19
+ # dimensional spaces because it doesn't square large diffs.
20
+ # chebyshev – L∞ norm (maximum absolute difference across features).
21
+ # Useful when a single feature dominating the distance is
22
+ # acceptable; common in game-grid / chess-style problems.
23
+ # minkowski – Generalisation of L1/L2 (controlled by p). p=1 →
24
+ # manhattan, p=2 → euclidean. Use when you want to tune
25
+ # between them.
26
+ # cosine – Angle between vectors, ignoring magnitude. Excellent for
27
+ # embeddings (text, image, audio) where direction matters
28
+ # more than raw scale.
29
+ # canberra – Weighted L1. Sensitive to small values near zero.
30
+ # braycurtis – Normalised L1 bounded to [0,1]. Common in ecology.
31
+ # jensenshannon – Symmetric KL divergence on probability distributions.
32
+ # Requires non-negative vectors. Supported by FAISS flat/
33
+ # HNSW/GPU indices natively.
34
+ # dot – Raw inner/dot product. Not a true metric; distances are
35
+ # not comparable across queries. Use for max inner-product
36
+ # search (recommendation systems). Prefer 'cosine' for
37
+ # normalised embeddings.
28
38
 
29
39
  # Metrics that every backend supports natively.
30
40
  _UNIVERSAL_METRICS = {'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'cosine'}
@@ -33,31 +43,59 @@ _UNIVERSAL_METRICS = {'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'cosin
33
43
  # KNN (sklearn) supports all scipy metrics — this is the complete curated list.
34
44
  _KNN_METRICS = _UNIVERSAL_METRICS | {'correlation', 'hamming', 'canberra', 'braycurtis'}
35
45
 
36
- # FAISS only has built-in L2 and inner-product (cosine via normalization).
37
- # All others fall back to a manual compute-then-search path.
38
- _FAISS_NATIVE_METRICS = {'euclidean', 'cosine'}
39
- _FAISS_METRICS = _UNIVERSAL_METRICS # remainder handled via sklearn fallback
46
+ # FAISS native metric support:
47
+ # IndexFlat, IndexHNSW, and GpuIndexFlat support METRIC_L1, METRIC_Linf,
48
+ # METRIC_Lp (with metric_arg for p), METRIC_Canberra, METRIC_BrayCurtis,
49
+ # and METRIC_JensenShannon in addition to L2 and inner product.
50
+ # IndexIVFFlat only supports L2 and inner product.
51
+ # 'ivf' index_type will still fall back for non-L2/cosine metrics.
52
+ _FAISS_FLAT_HNSW_NATIVE_METRICS = {
53
+ 'euclidean', 'cosine', 'manhattan', 'chebyshev', 'minkowski',
54
+ 'canberra', 'braycurtis', 'jensenshannon',
55
+ }
56
+ _FAISS_IVF_NATIVE_METRICS = {'euclidean', 'cosine'}
57
+
58
+ # For backwards compatibility: the overall set accepted by FaissNeighborFinder.
59
+ _FAISS_METRICS = _FAISS_FLAT_HNSW_NATIVE_METRICS | {'correlation', 'hamming'}
40
60
 
41
61
  # Annoy metric names (library-specific).
62
+ # Annoy natively supports: euclidean, manhattan, cosine (angular), hamming,
63
+ # and dot (inner product). chebyshev and minkowski have no Annoy equivalent.
42
64
  _ANNOY_METRIC_MAP = {
43
65
  'euclidean': 'euclidean',
44
66
  'manhattan': 'manhattan',
45
67
  'cosine': 'angular',
46
68
  'hamming': 'hamming',
47
- # chebyshev and minkowski are not natively supported; we warn the user.
69
+ 'dot': 'dot',
48
70
  }
49
71
  _ANNOY_METRICS = set(_ANNOY_METRIC_MAP)
50
72
 
51
- # HNSW (hnswlib) space names.
52
- _HNSW_METRIC_MAP = {
73
+ # hnswlib space names — only three native spaces exist.
74
+ # 'ip' is inner product (not a true metric; used for max inner-product search).
75
+ _HNSWLIB_METRIC_MAP = {
53
76
  'euclidean': 'l2',
54
77
  'cosine': 'cosine',
55
- # Others not natively supported in hnswlib; we warn and fall back to l2.
78
+ 'dot': 'ip',
56
79
  }
80
+
81
+ # nmslib space names for DENSE_VECTOR + HNSW.
82
+ # l1/linf/angulardist are confirmed supported by nmslib's integration tests.
83
+ # 'dot' maps to negdotprod (nmslib maximises inner product via negative distance).
84
+ _NMSLIB_METRIC_MAP = {
85
+ 'euclidean': 'l2',
86
+ 'cosine': 'cosinesimil',
87
+ 'manhattan': 'l1',
88
+ 'chebyshev': 'linf',
89
+ 'dot': 'negdotprod',
90
+ }
91
+
92
+ # Unified view for HNSWNeighborFinder validation: union of both backends.
93
+ # We keep the old name for backwards compatibility.
94
+ _HNSW_METRIC_MAP = _HNSWLIB_METRIC_MAP # kept for any external references
57
95
  _HNSW_METRICS = _UNIVERSAL_METRICS # partial — see fit() for fallback note
58
96
 
59
97
  # All metrics callable from the public API.
60
- ALL_METRICS = _KNN_METRICS
98
+ ALL_METRICS = _KNN_METRICS | {'jensenshannon', 'dot'}
61
99
 
62
100
 
63
101
  def list_distance_metrics():
@@ -65,19 +103,21 @@ def list_distance_metrics():
65
103
  print("\nAvailable Distance Metrics:")
66
104
  print("=" * 70)
67
105
  rows = [
68
- ("euclidean", "Default. L2 norm. Best for most tabular data.", "all"),
69
- ("manhattan", "L1 norm. More robust to outliers; good for high-dim data.", "all"),
70
- ("chebyshev", "L∞ norm. Max absolute diff across features.", "KNN only (exact preset)"),
71
- ("minkowski", "Generalises L1/L2 via p-param. Set minkowski_p=<float>.", "KNN only (exact preset)"),
72
- ("cosine", "Angle between vectors. Ideal for embeddings (NLP, vision).", "all"),
73
- ("correlation","Pearson correlation distance. Good for time series.", "KNN only (exact preset)"),
74
- ("hamming", "Fraction of differing components. For binary/categorical data.", "KNN, Annoy"),
75
- ("canberra", "Weighted L1. Sensitive to small values near zero.", "KNN only (exact preset)"),
76
- ("braycurtis", "Normalised L1 bounded to [0,1]. Ecological data.", "KNN only (exact preset)"),
106
+ ("euclidean", "Default. L2 norm. Best for most tabular data.", "all"),
107
+ ("manhattan", "L1 norm. More robust to outliers; good for high-dim data.", "KNN, FAISS (flat/hnsw), Annoy, HNSW-nmslib"),
108
+ ("chebyshev", "L∞ norm. Max absolute diff across features.", "KNN, FAISS (flat/hnsw), HNSW-nmslib"),
109
+ ("minkowski", "Generalises L1/L2 via p-param. Set minkowski_p=<float>.", "KNN, FAISS (flat/hnsw)"),
110
+ ("cosine", "Angle between vectors. Ideal for embeddings (NLP, vision).", "all"),
111
+ ("dot", "Inner/dot product. Not a metric; used for max-IP search.", "Annoy, HNSW (hnswlib ip / nmslib negdotprod)"),
112
+ ("canberra", "Weighted L1. Sensitive to small values near zero.", "KNN, FAISS (flat/hnsw/gpu)"),
113
+ ("braycurtis", "Normalised L1 bounded to [0,1]. Ecological data.", "KNN, FAISS (flat/hnsw/gpu)"),
114
+ ("jensenshannon", "Symmetric KL divergence. Requires non-negative vectors.", "FAISS (flat/hnsw/gpu)"),
115
+ ("correlation", "Pearson correlation distance. Good for time series.", "KNN only"),
116
+ ("hamming", "Fraction of differing components. For binary/categorical data.", "KNN, Annoy"),
77
117
  ]
78
118
  for name, desc, backends in rows:
79
- print(f"\n {name:<14} {desc}")
80
- print(f" {'':14} Backends: {backends}")
119
+ print(f"\n {name:<16} {desc}")
120
+ print(f" {'':16} Backends: {backends}")
81
121
  print("\n" + "=" * 70)
82
122
 
83
123
 
@@ -160,21 +200,36 @@ class FaissNeighborFinder(NeighborFinder):
160
200
  """
161
201
  Approximate nearest neighbors via FAISS (flat, IVF, or HNSW index).
162
202
 
163
- Natively supports 'euclidean' and 'cosine'. All other metrics in
164
- _UNIVERSAL_METRICS fall back to a sklearn-based exact search with a
165
- warning, so you can still use them without switching presets.
203
+ Native metric support depends on index_type:
204
+
205
+ flat / hnsw / gpu-flat
206
+ FAISS IndexFlat, IndexHNSW, and GpuIndexFlat natively support:
207
+ euclidean, cosine, manhattan (L1), chebyshev (Linf), minkowski (Lp),
208
+ canberra, braycurtis, jensenshannon.
209
+
210
+ ivf
211
+ IndexIVFFlat only supports L2 and inner-product (cosine). All other
212
+ metrics fall back to an exact sklearn KNN with a warning.
213
+
214
+ correlation and hamming always fall back to sklearn for all index types.
166
215
  """
167
216
 
168
217
  def __init__(self, k=10, index_type='flat', n_cells=None, n_probes=50,
169
218
  hnsw_M=32, hnsw_efConstruction=400, hnsw_efSearch=200,
170
- distance_metric='euclidean'):
219
+ distance_metric='euclidean', minkowski_p=2):
171
220
  """
172
221
  Parameters
173
222
  ----------
174
223
  distance_metric : str
175
- 'euclidean' (default) or 'cosine'. Other metrics fall back to
176
- exact sklearn search with a warning — use preset='exact' to avoid
177
- the overhead.
224
+ Metric to use. flat/hnsw/gpu index types natively support:
225
+ 'euclidean', 'cosine', 'manhattan', 'chebyshev', 'minkowski',
226
+ 'canberra', 'braycurtis', 'jensenshannon'.
227
+ 'ivf' only natively supports 'euclidean' and 'cosine'; all others
228
+ fall back to exact sklearn KNN with a warning.
229
+ 'correlation' and 'hamming' always fall back to sklearn.
230
+ minkowski_p : float
231
+ The p-parameter for the Minkowski metric. p=1 → manhattan,
232
+ p=2 → euclidean. Ignored for all other metrics.
178
233
  """
179
234
  if k <= 0:
180
235
  raise ValueError(f"k must be positive, got k={k}")
@@ -193,6 +248,7 @@ class FaissNeighborFinder(NeighborFinder):
193
248
  self.hnsw_efConstruction = hnsw_efConstruction
194
249
  self.hnsw_efSearch = hnsw_efSearch
195
250
  self.distance_metric = metric
251
+ self.minkowski_p = minkowski_p
196
252
  self.index_ = None
197
253
  self._fallback_finder = None # used for non-native metrics
198
254
  self._check_availability()
@@ -204,6 +260,31 @@ class FaissNeighborFinder(NeighborFinder):
204
260
  except ImportError:
205
261
  raise ImportError("FAISS not found. Install with: pip install faiss-cpu")
206
262
 
263
+ @staticmethod
264
+ def _faiss_metric_type(faiss, metric, minkowski_p=2):
265
+ """
266
+ Return (faiss_metric_constant, metric_arg) for a given metric name.
267
+ metric_arg is only meaningful for METRIC_Lp (minkowski).
268
+ Raises ValueError for metrics that have no FAISS MetricType constant
269
+ (i.e. those that must be handled via fallback).
270
+ """
271
+ _MAP = {
272
+ 'euclidean': (faiss.METRIC_L2, None),
273
+ 'cosine': (faiss.METRIC_INNER_PRODUCT, None),
274
+ 'manhattan': (faiss.METRIC_L1, None),
275
+ 'chebyshev': (faiss.METRIC_Linf, None),
276
+ 'minkowski': (faiss.METRIC_Lp, None), # metric_arg set below
277
+ 'canberra': (faiss.METRIC_Canberra, None),
278
+ 'braycurtis': (faiss.METRIC_BrayCurtis, None),
279
+ 'jensenshannon':(faiss.METRIC_JensenShannon, None),
280
+ }
281
+ if metric not in _MAP:
282
+ raise KeyError(metric)
283
+ ft, arg = _MAP[metric]
284
+ if metric == 'minkowski':
285
+ arg = float(minkowski_p)
286
+ return ft, arg
287
+
207
288
  @staticmethod
208
289
  def _l2_normalize(X):
209
290
  """Row-wise L2 normalisation in-place (for cosine similarity)."""
@@ -221,11 +302,18 @@ class FaissNeighborFinder(NeighborFinder):
221
302
  f"{n_samples} samples. Reduce k to at most {n_samples}."
222
303
  )
223
304
 
224
- # Non-native metrics: delegate entirely to KNNNeighborFinder.
225
- if self.distance_metric not in _FAISS_NATIVE_METRICS:
305
+ # Determine whether the chosen metric is natively supported by this index type.
306
+ ivf_native = self.distance_metric in _FAISS_IVF_NATIVE_METRICS
307
+ flat_hnsw_native = self.distance_metric in _FAISS_FLAT_HNSW_NATIVE_METRICS
308
+ is_ivf = (self.index_type == 'ivf')
309
+
310
+ needs_fallback = is_ivf and not ivf_native
311
+ needs_fallback = needs_fallback or (not is_ivf and not flat_hnsw_native)
312
+
313
+ if needs_fallback:
226
314
  warnings.warn(
227
315
  f"distance_metric='{self.distance_metric}' is not natively supported by "
228
- f"FAISS. Falling back to exact sklearn KNN for this metric. "
316
+ f"FAISS {self.index_type} index. Falling back to exact sklearn KNN. "
229
317
  f"Use preset='exact' to avoid this overhead.",
230
318
  UserWarning,
231
319
  )
@@ -235,7 +323,7 @@ class FaissNeighborFinder(NeighborFinder):
235
323
  self._fallback_finder.fit(X)
236
324
  return self
237
325
 
238
- # Cosine similarity: normalise all vectors, then use inner-product index.
326
+ # Cosine similarity: normalise all vectors so inner-product == cosine.
239
327
  if self.distance_metric == 'cosine':
240
328
  X = self._l2_normalize(X)
241
329
 
@@ -247,12 +335,20 @@ class FaissNeighborFinder(NeighborFinder):
247
335
  UserWarning
248
336
  )
249
337
  if self.distance_metric == 'cosine':
250
- self.index_ = self.faiss.IndexFlatIP(dim) # inner product on normalised vecs
251
- else:
338
+ self.index_ = self.faiss.IndexFlatIP(dim)
339
+ elif self.distance_metric == 'euclidean':
252
340
  self.index_ = self.faiss.IndexFlatL2(dim)
341
+ else:
342
+ ft, metric_arg = self._faiss_metric_type(
343
+ self.faiss, self.distance_metric, self.minkowski_p
344
+ )
345
+ self.index_ = self.faiss.IndexFlat(dim, ft)
346
+ if metric_arg is not None:
347
+ self.index_.metric_arg = metric_arg
253
348
  self.index_.add(X)
254
349
 
255
350
  elif self.index_type == 'ivf':
351
+ # IVF only supports L2 / inner-product (guarded above).
256
352
  if self.n_cells is None:
257
353
  self.n_cells = min(int(np.sqrt(n_samples)), 4096)
258
354
 
@@ -300,7 +396,19 @@ class FaissNeighborFinder(NeighborFinder):
300
396
  f"{n_samples} samples. Consider ef_construction >= 400.",
301
397
  UserWarning
302
398
  )
303
- self.index_ = self.faiss.IndexHNSWFlat(dim, self.hnsw_M)
399
+ if self.distance_metric == 'cosine':
400
+ self.index_ = self.faiss.IndexHNSWFlat(
401
+ dim, self.hnsw_M, self.faiss.METRIC_INNER_PRODUCT
402
+ )
403
+ elif self.distance_metric == 'euclidean':
404
+ self.index_ = self.faiss.IndexHNSWFlat(dim, self.hnsw_M)
405
+ else:
406
+ ft, metric_arg = self._faiss_metric_type(
407
+ self.faiss, self.distance_metric, self.minkowski_p
408
+ )
409
+ self.index_ = self.faiss.IndexHNSWFlat(dim, self.hnsw_M, ft)
410
+ if metric_arg is not None:
411
+ self.index_.metric_arg = metric_arg
304
412
  self.index_.hnsw.efConstruction = self.hnsw_efConstruction
305
413
  self.index_.hnsw.efSearch = self.hnsw_efSearch
306
414
  self.index_.add(X)
@@ -328,10 +436,14 @@ class FaissNeighborFinder(NeighborFinder):
328
436
  # Inner product on normalised vectors: similarity ∈ [-1, 1].
329
437
  # Convert to a proper distance (0 = identical, 2 = opposite).
330
438
  distances = 1.0 - scores
331
- else:
439
+ elif self.distance_metric == 'euclidean':
332
440
  distances, indices = self.index_.search(X, k)
333
441
  # FAISS returns squared L2; clamp to 0 before sqrt.
334
442
  distances = np.sqrt(np.maximum(distances, 0))
443
+ else:
444
+ # All other native metrics (manhattan, chebyshev, minkowski, canberra,
445
+ # braycurtis, jensenshannon) are returned as proper distances already.
446
+ distances, indices = self.index_.search(X, k)
335
447
 
336
448
  return distances.astype(np.float32), indices
337
449
 
@@ -340,8 +452,13 @@ class AnnoyNeighborFinder(NeighborFinder):
340
452
  """
341
453
  Approximate nearest neighbors via Annoy.
342
454
 
343
- Supports: euclidean, manhattan, cosine, hamming.
455
+ Supports: euclidean, manhattan, cosine (stored as 'angular'), hamming,
456
+ and dot (inner product, stored as 'dot').
344
457
  chebyshev and minkowski are not available in Annoy — use preset='exact' for those.
458
+
459
+ Note on 'dot': Annoy's dot-product space is not a true metric. Distances
460
+ returned are reduced inner-product values, not raw dot products — see
461
+ Bachrach et al. (2014). Prefer 'cosine' for normalised embeddings.
345
462
  """
346
463
 
347
464
  def __init__(self, k=10, n_trees=100, distance_metric='euclidean', search_k=-1):
@@ -349,7 +466,8 @@ class AnnoyNeighborFinder(NeighborFinder):
349
466
  Parameters
350
467
  ----------
351
468
  distance_metric : str
352
- One of 'euclidean', 'manhattan', 'cosine', 'hamming'. Default: 'euclidean'.
469
+ One of 'euclidean', 'manhattan', 'cosine', 'hamming', 'dot'.
470
+ Default: 'euclidean'.
353
471
  """
354
472
  if k <= 0:
355
473
  raise ValueError(f"k must be positive, got k={k}")
@@ -440,26 +558,55 @@ class HNSWNeighborFinder(NeighborFinder):
440
558
  """
441
559
  Approximate nearest neighbors via HNSW (hnswlib or nmslib backend).
442
560
 
443
- Natively supports 'euclidean' and 'cosine'. Manhattan, chebyshev, and
444
- minkowski are not available in hnswlib/nmslib — use preset='exact' for those.
561
+ Native metric support depends on the backend:
562
+
563
+ hnswlib
564
+ Supports 'euclidean' (l2), 'cosine', and 'dot' (ip / inner product).
565
+ All other metrics raise an error — use preset='exact' instead.
566
+
567
+ nmslib
568
+ Supports 'euclidean' (l2), 'cosine' (cosinesimil), 'manhattan' (l1),
569
+ 'chebyshev' (linf), and 'dot' (negdotprod / max inner-product search).
570
+ All other metrics raise an error — use preset='exact' instead.
571
+
572
+ Note on 'dot': inner product is not a true distance metric. Results are
573
+ ranked by descending similarity, not ascending distance. Use 'cosine' for
574
+ normalised embeddings where you want a proper distance.
445
575
  """
446
576
 
577
+ # Per-backend accepted metrics (validated in __init__).
578
+ _HNSWLIB_METRICS = set(_HNSWLIB_METRIC_MAP) # euclidean, cosine, dot
579
+ _NMSLIB_METRICS = set(_NMSLIB_METRIC_MAP) # euclidean, cosine, manhattan, chebyshev, dot
580
+
447
581
  def __init__(self, k=10, M=32, ef_construction=400,
448
582
  ef_search=200, backend='hnswlib', distance_metric='euclidean'):
449
583
  """
450
584
  Parameters
451
585
  ----------
452
586
  distance_metric : str
453
- 'euclidean' (default) or 'cosine'. Other metrics are not supported
454
- natively and will raise an error — use preset='exact' instead.
587
+ hnswlib: 'euclidean', 'cosine', or 'dot'.
588
+ nmslib: 'euclidean', 'cosine', 'manhattan', 'chebyshev', or 'dot'.
589
+ Default: 'euclidean'.
590
+ backend : str
591
+ 'hnswlib' (default) or 'nmslib'.
455
592
  """
456
593
  if k <= 0:
457
594
  raise ValueError(f"k must be positive, got k={k}")
458
595
  metric = distance_metric.lower()
459
- if metric not in _HNSW_METRIC_MAP:
596
+ backend_str = backend.lower()
597
+
598
+ if backend_str == 'hnswlib':
599
+ allowed = self._HNSWLIB_METRICS
600
+ elif backend_str == 'nmslib':
601
+ allowed = self._NMSLIB_METRICS
602
+ else:
603
+ raise ValueError(f"Unknown backend: '{backend}'. Choose 'hnswlib' or 'nmslib'.")
604
+
605
+ if metric not in allowed:
460
606
  raise ValueError(
461
- f"distance_metric='{distance_metric}' is not natively supported by "
462
- f"HNSWNeighborFinder. Available: {sorted(_HNSW_METRIC_MAP)}. "
607
+ f"distance_metric='{distance_metric}' is not supported by "
608
+ f"HNSWNeighborFinder (backend='{backend_str}'). "
609
+ f"Available: {sorted(allowed)}. "
463
610
  f"For other metrics use preset='exact' (KNNNeighborFinder)."
464
611
  )
465
612
  self.n_neighbors = k
@@ -467,7 +614,7 @@ class HNSWNeighborFinder(NeighborFinder):
467
614
  self.M = M
468
615
  self.ef_construction = ef_construction
469
616
  self.ef_search = ef_search
470
- self.backend = backend.lower()
617
+ self.backend = backend_str
471
618
  self.index_ = None
472
619
  self._check_availability()
473
620
 
@@ -504,7 +651,7 @@ class HNSWNeighborFinder(NeighborFinder):
504
651
  )
505
652
 
506
653
  if self.backend == 'hnswlib':
507
- space = _HNSW_METRIC_MAP[self.distance_metric]
654
+ space = _HNSWLIB_METRIC_MAP[self.distance_metric]
508
655
  self.index_ = self.hnswlib.Index(space=space, dim=dim)
509
656
  self.index_.init_index(
510
657
  max_elements=n_samples, M=self.M, ef_construction=self.ef_construction
@@ -513,13 +660,10 @@ class HNSWNeighborFinder(NeighborFinder):
513
660
  self.index_.add_items(X, np.arange(n_samples))
514
661
 
515
662
  else: # nmslib
516
- nmslib_space_map = {
517
- 'euclidean': 'l2',
518
- 'cosine': 'cosinesimil',
519
- }
663
+ space = _NMSLIB_METRIC_MAP[self.distance_metric]
520
664
  self.index_ = self.nmslib.init(
521
665
  method='hnsw',
522
- space=nmslib_space_map.get(self.distance_metric, 'l2'),
666
+ space=space,
523
667
  data_type=self.nmslib.DataType.DENSE_VECTOR
524
668
  )
525
669
  self.index_.addDataPointBatch(X)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deskit
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: A Python library for Dynamic Ensemble Selection
5
5
  Author: Tikhon Vodyanov
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes