usearch-2.23.0-cp314-cp314t-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
usearch/eval.py ADDED
@@ -0,0 +1,512 @@
+ from __future__ import annotations
+ from time import time_ns
+ from typing import Tuple, Any, Callable, Union, Optional, List
+ from dataclasses import dataclass, asdict
+ from collections import defaultdict
+ from math import ceil
+
+ import numpy as np
+
+ from usearch.io import load_matrix
+ from usearch.index import (
+     Index,
+     BatchMatches,
+     ScalarKind,
+     MetricKind,
+     MetricKindBitwise,
+     Key,
+     _normalize_metric,
+     _normalize_dtype,
+     _to_numpy_dtype,
+ )
+
+
+ def random_vectors(
+     count: int,
+     metric: MetricKind = MetricKind.IP,
+     dtype: ScalarKind = ScalarKind.F32,
+     ndim: Optional[int] = None,
+     index: Optional[Index] = None,
+ ) -> np.ndarray:
+     """Produces a collection of random vectors normalized for the provided `metric`
+     and matching the wanted `dtype`, both of which can be inferred from an existing `index`.
+     """
+
+     # Infer default parameters from the `index`, if passed
+     if index is not None:
+         if not isinstance(index, Index):
+             raise ValueError("Unsupported `index` type")
+
+         ndim = index.ndim
+         dtype = index.numpy_dtype
+         metric = index.metric
+
+     else:
+         metric: MetricKind = _normalize_metric(metric)
+         dtype: ScalarKind = _normalize_dtype(dtype, ndim=ndim, metric=metric)
+
+     # Produce data
+     if metric in MetricKindBitwise or dtype == ScalarKind.B1:
+         bit_vectors = np.random.randint(2, size=(count, ndim))
+         bit_vectors = np.packbits(bit_vectors, axis=1)
+         return bit_vectors
+
+     else:
+         x = np.random.rand(count, ndim)
+         if _to_numpy_dtype(dtype) == np.int8:
+             x = (x * 100).astype(np.int8)
+         else:
+             x = x.astype(_to_numpy_dtype(dtype))
+         if metric == MetricKind.IP:
+             return x / np.linalg.norm(x, axis=1, keepdims=True)
+         return x
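As a usage illustration (not part of the packaged file): a minimal sketch of generating unit-normalized vectors for an inner-product index, assuming the wheel above is installed as `usearch`:

    import numpy as np
    from usearch.index import MetricKind
    from usearch.eval import random_vectors

    # 1,000 float32 rows; for MetricKind.IP each row is scaled to unit length
    vectors = random_vectors(count=1_000, ndim=256, metric=MetricKind.IP)
    assert vectors.shape == (1_000, 256)
    assert np.allclose(np.linalg.norm(vectors, axis=1), 1.0, atol=1e-3)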
+
+
+ @dataclass
+ class SearchStats:
+     """
+     Contains statistics for one or more search runs, including the number of
+     internal nodes that were fetched (`visited_members`) and the number
+     of times the distance metric was invoked (`computed_distances`).
+
+     Other derivative metrics include the `mean_recall` and `mean_efficiency`.
+     Recall is the share of queried vectors that were successfully found.
+     Efficiency describes the number of distances that had to be computed for
+     each query, normalized to the size of the `index`. The highest efficiency is 0.(9)
+     and the lowest is zero. The highest is achieved when the distance metric was computed
+     just once per query. The lowest happens during exact search, when the distance
+     to every present vector had to be computed.
+     """
+
+     index_size: int
+     count_queries: int
+     count_matches: int
+
+     visited_members: int
+     computed_distances: int
+
+     @property
+     def mean_efficiency(self) -> float:
+         return 1 - float(self.computed_distances) / (self.count_queries * self.index_size)
+
+     @property
+     def mean_recall(self) -> float:
+         return self.count_matches / self.count_queries
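To make the two derived metrics concrete, a small worked example with hypothetical numbers (not taken from the package):

    from usearch.eval import SearchStats

    # 100 queries against an index of 1,000 vectors; 95 of them found themselves,
    # and 2,000 distance computations were needed in total
    stats = SearchStats(
        index_size=1_000,
        count_queries=100,
        count_matches=95,
        visited_members=3_000,
        computed_distances=2_000,
    )
    print(stats.mean_recall)      # 0.95 = 95 / 100
    print(stats.mean_efficiency)  # 0.98 = 1 - 2_000 / (100 * 1_000)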
+
+
+ def self_recall(index: Index, sample: Union[float, int] = 1.0, **kwargs) -> SearchStats:
+     """Simplest benchmark of search quality: queries every existing member
+     of the index to make sure the approximate search finds the point itself.
+
+     :param index: Non-empty pre-constructed index
+     :type index: Index
+     :param sample: Share (or number) of vectors to search, defaults to 1.0
+     :type sample: Union[float, int]
+     :return: Evaluation report with key metrics
+     :rtype: SearchStats
+     """
+     if len(index) == 0:
+         return 0
+     if "count" not in kwargs:
+         kwargs["count"] = 1
+
+     if "keys" in kwargs:
+         keys = kwargs.pop("keys")
+     else:
+         keys = np.array(index.keys)
+
+     if sample != 1.0:
+         if isinstance(sample, float):
+             sample = int(ceil(len(keys) * sample))
+         keys = np.random.choice(keys, sample)
+
+     if "vectors" in kwargs:
+         vectors = kwargs.pop("vectors")
+     else:
+         vectors = index.get(keys)
+
+     matches = index.search(vectors, **kwargs)
+     count_matches: int = (
+         matches.count_matches(keys) if isinstance(matches, BatchMatches) else int(matches.keys[0] == keys[0])
+     )
+     return SearchStats(
+         index_size=len(index),
+         count_queries=len(keys),
+         count_matches=count_matches,
+         visited_members=matches.visited_members,
+         computed_distances=matches.computed_distances,
+     )
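A minimal, hypothetical usage sketch for `self_recall`, assuming the wheel is installed as `usearch`; the index configuration and the 10% sample are illustrative:

    import numpy as np
    from usearch.index import Index, MetricKind
    from usearch.eval import random_vectors, self_recall

    index = Index(ndim=64, metric=MetricKind.Cos)
    keys = np.arange(1_000)
    index.add(keys, random_vectors(count=1_000, ndim=64, metric=MetricKind.Cos))

    # Query a random 10% of the stored vectors and check that each finds itself
    stats = self_recall(index, sample=0.1)
    print(f"recall: {stats.mean_recall:.2%}, efficiency: {stats.mean_efficiency:.2%}")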
+
+
+ def measure_seconds(f: Callable) -> Tuple[float, Any]:
+     """Simple function profiling helper: runs `f` once and times it.
+
+     :param f: Function to be profiled
+     :type f: Callable
+     :return: Time elapsed in seconds and the result of the execution
+     :rtype: Tuple[float, Any]
+     """
+     a = time_ns()
+     result = f()
+     b = time_ns()
+     c = b - a
+     secs = c / (10**9)
+     return secs, result
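For illustration only, a sketch of timing an arbitrary callable with the helper above:

    from usearch.eval import measure_seconds

    # Wrap the call in a zero-argument lambda; the helper returns (seconds, result)
    secs, total = measure_seconds(lambda: sum(range(1_000_000)))
    print(f"sum took {secs:.4f}s -> {total}")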
+
+
+ def dcg(relevances: np.ndarray, k: Optional[int] = None) -> float:
+     """Calculate DCG (Discounted Cumulative Gain) up to position k.
+
+     :param relevances: True relevance scores, in the order in which they are ranked
+     :type relevances: np.ndarray
+     :param k: Position up to which DCG is computed
+     :type k: Optional[int]
+     :return: The DCG score at position k
+     :rtype: float
+     """
+     if k:
+         relevances = np.asarray(relevances)[:k]
+
+     n_relevances = len(relevances)
+     if n_relevances == 0:
+         return 0.0
+
+     discounts = np.log2(np.arange(n_relevances) + 2)
+     return np.sum(relevances / discounts)
+
+
+ def ndcg(relevances: np.ndarray, k: Optional[int] = None) -> float:
+     """Calculate NDCG (Normalized Discounted Cumulative Gain) at position k.
+
+     :param relevances: True relevance scores, in the order in which they are ranked
+     :type relevances: np.ndarray
+     :param k: Position up to which NDCG is computed
+     :type k: Optional[int]
+     :return: The NDCG score at position k
+     :rtype: float
+     """
+     best_dcg = dcg(sorted(relevances, reverse=True), k)
+     if best_dcg == 0:
+         return 0.0
+
+     return dcg(relevances, k) / best_dcg
+
+
+ def relevance(expected: np.ndarray, predicted: np.ndarray, k: Optional[int] = None) -> List[int]:
+     """Calculate binary relevance scores: 1 for every predicted key that is
+     present among the expected ones, 0 otherwise.
+
+     :param expected: ground-truth keys
+     :type expected: np.ndarray
+     :param predicted: predicted keys
+     :type predicted: np.ndarray
+     :param k: Position up to which relevance is computed
+     :type k: Optional[int]
+     """
+     expected = expected[:k]
+     predicted = predicted[:k]
+     return [1 if i in expected else 0 for i in predicted]
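For illustration (hypothetical keys, not part of the packaged file), the helpers above compose as follows: `relevance` turns a ranked prediction into binary gains, and `ndcg` normalizes their discounted sum by the best possible ordering:

    import numpy as np
    from usearch.eval import relevance, ndcg

    expected = np.array([42, 7, 15, 23])   # ground-truth neighbor keys
    predicted = np.array([42, 8, 15, 11])  # keys returned by a search, in ranked order

    gains = relevance(expected, predicted, k=4)  # [1, 0, 1, 0]
    print(ndcg(gains, k=4))                      # ~0.92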
+
+
+ @dataclass
+ class Dataset:
+     keys: np.ndarray
+     vectors: np.ndarray
+     queries: np.ndarray
+     neighbors: np.ndarray
+
+     def crop_neighbors(self, k: int):
+         # Keep only the first `k` ground-truth neighbors per query
+         self.neighbors = self.neighbors[:, :k]
+
+     @property
+     def ndim(self):
+         return self.vectors.shape[1]
+
+     @staticmethod
+     def build(
+         vectors: Optional[str] = None,
+         queries: Optional[str] = None,
+         neighbors: Optional[str] = None,
+         count: Optional[int] = None,
+         ndim: Optional[int] = None,
+         k: Optional[int] = None,
+     ):
+         """Either loads an existing dataset from disk, or generates one on the fly.
+
+         :param vectors: Path to a matrix of dataset vectors, defaults to None
+         :type vectors: Optional[str], optional
+         :param queries: Path to a matrix of query vectors, defaults to None
+         :type queries: Optional[str], optional
+         :param neighbors: Path to a matrix of ground-truth neighbor keys, defaults to None
+         :type neighbors: Optional[str], optional
+         :param count: Number of vectors to keep or generate, defaults to None
+         :type count: Optional[int], optional
+         :param ndim: Number of dimensions for generated vectors, defaults to None
+         :type ndim: Optional[int], optional
+         :param k: Number of neighbors to keep per query, defaults to None
+         :type k: Optional[int], optional
+         """
+
+         d = Dataset(None, None, None, None)
+
+         if vectors is not None:
+             assert ndim is None
+
+             d.vectors = load_matrix(vectors)
+             ndim = d.vectors.shape[1]
+             count = min(d.vectors.shape[0], count) if count is not None else d.vectors.shape[0]
+             d.vectors = d.vectors[:count, :]
+             d.keys = np.arange(count, dtype=Key)
+
+             if queries is not None:
+                 d.queries = load_matrix(queries)
+             else:
+                 d.queries = d.vectors
+
+             if neighbors is not None:
+                 d.neighbors = load_matrix(neighbors)
+                 if k is not None:
+                     d.neighbors = d.neighbors[:, :k]
+             else:
+                 assert k is None, "Can't override `k`, will retrieve one neighbor"
+                 d.neighbors = np.reshape(d.keys, (count, 1))
+
+         else:
+             assert ndim is not None
+             assert count is not None
+             assert k is None, "Can't override `k`, will retrieve one neighbor"
+
+             d.vectors = random_vectors(count=count, ndim=ndim)
+             d.queries = d.vectors
+             d.keys = np.arange(count, dtype=Key)
+             d.neighbors = np.reshape(d.keys, (count, 1))
+
+         return d
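A hypothetical sketch of building a purely synthetic dataset with the helper above (file paths omitted; the sizes are illustrative):

    from usearch.eval import Dataset

    # No file paths given, so vectors are generated on the fly and each query's
    # only ground-truth neighbor is the vector with the same key
    dataset = Dataset.build(count=10_000, ndim=64)
    print(dataset.ndim, dataset.vectors.shape, dataset.neighbors.shape)  # 64 (10000, 64) (10000, 1)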
+
+
+ @dataclass
+ class TaskResult:
+     add_operations: Optional[int] = None
+     add_per_second: Optional[float] = None
+
+     search_operations: Optional[int] = None
+     search_per_second: Optional[float] = None
+     recall_at_one: Optional[float] = None
+
+     def __repr__(self) -> str:
+         parts = []
+         if self.add_per_second:
+             parts.append(f"{self.add_per_second:.2f} add/s")
+         if self.search_per_second:
+             parts.append(f"{self.search_per_second:.2f} search/s")
+         if self.recall_at_one:
+             parts.append(f"{self.recall_at_one * 100:.2f}% recall@1")
+         return ", ".join(parts)
+
+     @property
+     def add_seconds(self) -> float:
+         return self.add_operations / self.add_per_second
+
+     @property
+     def search_seconds(self) -> float:
+         return self.search_operations / self.search_per_second
+
+     def __add__(self, other: TaskResult):
+         result = TaskResult()
+         # Merge add-phase stats, weighting the throughput by the time spent in each part
+         if self.add_operations and other.add_operations:
+             result.add_operations = self.add_operations + other.add_operations
+             result.add_per_second = result.add_operations / (self.add_seconds + other.add_seconds)
+         else:
+             base = self if self.add_operations else other
+             result.add_operations = base.add_operations
+             result.add_per_second = base.add_per_second
+
+         # Merge search-phase stats, averaging recall weighted by the number of queries
+         if self.search_operations and other.search_operations:
+             result.search_operations = self.search_operations + other.search_operations
+             result.recall_at_one = (
+                 self.recall_at_one * self.search_operations + other.recall_at_one * other.search_operations
+             ) / (self.search_operations + other.search_operations)
+             result.search_per_second = result.search_operations / (self.search_seconds + other.search_seconds)
+         else:
+             base = self if self.search_operations else other
+             result.search_operations = base.search_operations
+             result.search_per_second = base.search_per_second
+             result.recall_at_one = base.recall_at_one
+
+         return result
+
+
+ @dataclass
+ class AddTask:
+     keys: np.ndarray
+     vectors: np.ndarray
+
+     def __call__(self, index: Index) -> TaskResult:
+         batch_size: int = self.vectors.shape[0]
+         old_size: int = len(index)
+         dt, _ = measure_seconds(lambda: index.add(self.keys, self.vectors))
+
+         assert len(index) == old_size + batch_size
+         return TaskResult(
+             add_operations=batch_size,
+             add_per_second=batch_size / dt,
+         )
+
+     @property
+     def ndim(self):
+         return self.vectors.shape[1]
+
+     @property
+     def count(self):
+         return self.vectors.shape[0]
+
+     def inplace_shuffle(self):
+         """Reorders the `vectors` and `keys`. Often used for robustness benchmarks."""
+
+         new_order = np.arange(self.count)
+         np.random.shuffle(new_order)
+         self.keys = self.keys[new_order]
+         self.vectors = self.vectors[new_order, :]
+
+     def slices(self, batch_size: int) -> List[AddTask]:
+         """Splits this dataset into smaller consecutive chunks."""
+
+         return [
+             AddTask(
+                 keys=self.keys[start_row : start_row + batch_size],
+                 vectors=self.vectors[start_row : start_row + batch_size, :],
+             )
+             for start_row in range(0, self.count, batch_size)
+         ]
+
+     def clusters(self, number_of_clusters: int) -> List[AddTask]:
+         """Splits this dataset into clusters of similar vectors using K-Means."""
+
+         from sklearn.cluster import KMeans
+
+         clustering = KMeans(
+             n_clusters=number_of_clusters,
+             random_state=0,
+             n_init="auto",
+         ).fit(self.vectors)
+
+         partitioning = defaultdict(list)
+         for row, cluster in enumerate(clustering.labels_):
+             partitioning[cluster].append(row)
+
+         return [
+             AddTask(
+                 keys=self.keys[rows],
+                 vectors=self.vectors[rows, :],
+             )
+             for rows in partitioning.values()
+         ]
+
+
+ @dataclass
+ class SearchTask:
+     queries: np.ndarray
+     neighbors: np.ndarray
+
+     def __call__(self, index: Index) -> TaskResult:
+         dt, results = measure_seconds(lambda: index.search(self.queries, self.neighbors.shape[1]))
+
+         return TaskResult(
+             # `search_operations` lets `TaskResult.__add__` aggregate results across batches
+             search_operations=self.queries.shape[0],
+             search_per_second=self.queries.shape[0] / dt,
+             recall_at_one=results.mean_recall(self.neighbors[:, 0].flatten()),
+         )
+
+     def slices(self, batch_size: int) -> List[SearchTask]:
+         """Splits this dataset into smaller consecutive chunks."""
+
+         return [
+             SearchTask(
+                 queries=self.queries[start_row : start_row + batch_size, :],
+                 neighbors=self.neighbors[start_row : start_row + batch_size, :],
+             )
+             for start_row in range(0, self.queries.shape[0], batch_size)
+         ]
+
+
+ @dataclass
+ class Evaluation:
+     tasks: List[Union[AddTask, SearchTask]]
+     count: int
+     ndim: int
+
+     @staticmethod
+     def for_dataset(dataset: Dataset, batch_size: int = 0, clusters: int = 1) -> Evaluation:
+         tasks = []
+         add = AddTask(vectors=dataset.vectors, keys=dataset.keys)
+         search = SearchTask(queries=dataset.queries, neighbors=dataset.neighbors)
+
+         if batch_size:
+             tasks.extend(add.slices(batch_size))
+             tasks.extend(search.slices(batch_size))
+         elif clusters != 1:
+             tasks.extend(add.clusters(clusters))
+             tasks.append(search)
+         else:
+             tasks.append(add)
+             tasks.append(search)
+
+         return Evaluation(
+             tasks=tasks,
+             count=add.count,
+             ndim=add.ndim,
+         )
+
+     def __call__(self, index: Index, post_clean: bool = True) -> dict:
+         task_result = TaskResult()
+
+         try:
+             for task in self.tasks:
+                 task_result = task_result + task(index)
+         except KeyboardInterrupt:
+             pass
+
+         if post_clean:
+             index.clear()
+         return {
+             **index.specs,
+             **asdict(task_result),
+         }
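As an illustration of how the pieces above fit together (a hypothetical sketch; the sizes and the `Index` configuration are arbitrary):

    from usearch.index import Index
    from usearch.eval import Dataset, Evaluation

    # Synthetic dataset: every query's nearest neighbor should be itself
    dataset = Dataset.build(count=10_000, ndim=64)
    evaluation = Evaluation.for_dataset(dataset, batch_size=1_000)

    index = Index(ndim=dataset.ndim)
    report = evaluation(index)  # dict merging `index.specs` with the accumulated `TaskResult`
    print(report)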
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     # Initialize the argument parser
+     parser = argparse.ArgumentParser(description="Evaluate vector search index for speed and accuracy.")
+
+     # Define expected arguments
+     parser.add_argument("--vectors", type=str, required=False, help="Path to the file containing the vectors.")
+     parser.add_argument("--queries", type=str, required=False, help="Path to the file containing the query vectors.")
+     parser.add_argument("--neighbors", type=str, required=False, help="Path to the file with neighbor arrays.")
+     parser.add_argument("--dtype", type=str, required=False, help="Quantization type for internal storage.")
+     parser.add_argument("--metric", type=str, required=False, help="Distance function.")
+     parser.add_argument("--count", type=int, help="Number of vectors to use.")
+     parser.add_argument("--ndim", type=int, help="Number of dimensions for the vectors.")
+     parser.add_argument("--batch_size", type=int, default=0, help="Batch size for indexing and searching.")
+     parser.add_argument("--clusters", type=int, default=1, help="Number of clusters for indexing.")
+
+     # Parse arguments from the command line
+     args = parser.parse_args()
+
+     # Load or generate dataset
+     dataset = Dataset.build(
+         vectors=args.vectors,
+         queries=args.queries,
+         neighbors=args.neighbors,
+         count=args.count,
+         ndim=args.ndim,
+     )
+
+     # Prepare the evaluation
+     evaluation = Evaluation.for_dataset(dataset, batch_size=args.batch_size, clusters=args.clusters)
+     index = Index(ndim=dataset.ndim, dtype=args.dtype, metric=args.metric)
+
+     # Perform the evaluation
+     results = evaluation(index)
+
+     # Print the evaluation results
+     print("Evaluation results:", results)