vectordb-bench 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (77)
  1. vectordb_bench/__init__.py +1 -0
  2. vectordb_bench/backend/cases.py +45 -1
  3. vectordb_bench/backend/clients/__init__.py +47 -0
  4. vectordb_bench/backend/clients/api.py +2 -0
  5. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +104 -40
  6. vectordb_bench/backend/clients/aws_opensearch/cli.py +52 -15
  7. vectordb_bench/backend/clients/aws_opensearch/config.py +27 -7
  8. vectordb_bench/backend/clients/hologres/cli.py +50 -0
  9. vectordb_bench/backend/clients/hologres/config.py +121 -0
  10. vectordb_bench/backend/clients/hologres/hologres.py +365 -0
  11. vectordb_bench/backend/clients/lancedb/lancedb.py +1 -0
  12. vectordb_bench/backend/clients/milvus/cli.py +29 -9
  13. vectordb_bench/backend/clients/milvus/config.py +2 -0
  14. vectordb_bench/backend/clients/milvus/milvus.py +1 -1
  15. vectordb_bench/backend/clients/oceanbase/cli.py +1 -0
  16. vectordb_bench/backend/clients/oceanbase/config.py +3 -1
  17. vectordb_bench/backend/clients/oceanbase/oceanbase.py +20 -4
  18. vectordb_bench/backend/clients/oss_opensearch/cli.py +155 -0
  19. vectordb_bench/backend/clients/oss_opensearch/config.py +157 -0
  20. vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py +582 -0
  21. vectordb_bench/backend/clients/oss_opensearch/run.py +166 -0
  22. vectordb_bench/backend/clients/pgdiskann/cli.py +45 -0
  23. vectordb_bench/backend/clients/pgdiskann/config.py +16 -0
  24. vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +94 -26
  25. vectordb_bench/backend/clients/s3_vectors/config.py +41 -0
  26. vectordb_bench/backend/clients/s3_vectors/s3_vectors.py +171 -0
  27. vectordb_bench/backend/clients/tidb/cli.py +0 -4
  28. vectordb_bench/backend/clients/tidb/config.py +22 -2
  29. vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -1
  30. vectordb_bench/backend/clients/zilliz_cloud/config.py +4 -1
  31. vectordb_bench/backend/dataset.py +70 -0
  32. vectordb_bench/backend/filter.py +17 -0
  33. vectordb_bench/backend/runner/mp_runner.py +4 -0
  34. vectordb_bench/backend/runner/rate_runner.py +23 -11
  35. vectordb_bench/backend/runner/read_write_runner.py +10 -9
  36. vectordb_bench/backend/runner/serial_runner.py +23 -7
  37. vectordb_bench/backend/task_runner.py +5 -4
  38. vectordb_bench/cli/cli.py +36 -0
  39. vectordb_bench/cli/vectordbbench.py +4 -0
  40. vectordb_bench/fig/custom_case_run_test.png +0 -0
  41. vectordb_bench/fig/custom_dataset.png +0 -0
  42. vectordb_bench/fig/homepage/bar-chart.png +0 -0
  43. vectordb_bench/fig/homepage/concurrent.png +0 -0
  44. vectordb_bench/fig/homepage/custom.png +0 -0
  45. vectordb_bench/fig/homepage/label_filter.png +0 -0
  46. vectordb_bench/fig/homepage/qp$.png +0 -0
  47. vectordb_bench/fig/homepage/run_test.png +0 -0
  48. vectordb_bench/fig/homepage/streaming.png +0 -0
  49. vectordb_bench/fig/homepage/table.png +0 -0
  50. vectordb_bench/fig/run_test_select_case.png +0 -0
  51. vectordb_bench/fig/run_test_select_db.png +0 -0
  52. vectordb_bench/fig/run_test_submit.png +0 -0
  53. vectordb_bench/frontend/components/check_results/filters.py +1 -4
  54. vectordb_bench/frontend/components/check_results/nav.py +2 -1
  55. vectordb_bench/frontend/components/concurrent/charts.py +5 -0
  56. vectordb_bench/frontend/components/int_filter/charts.py +60 -0
  57. vectordb_bench/frontend/components/streaming/data.py +7 -0
  58. vectordb_bench/frontend/components/welcome/welcomePrams.py +42 -4
  59. vectordb_bench/frontend/config/dbCaseConfigs.py +142 -16
  60. vectordb_bench/frontend/config/styles.py +4 -0
  61. vectordb_bench/frontend/pages/concurrent.py +1 -1
  62. vectordb_bench/frontend/pages/custom.py +1 -1
  63. vectordb_bench/frontend/pages/int_filter.py +56 -0
  64. vectordb_bench/frontend/pages/streaming.py +16 -3
  65. vectordb_bench/interface.py +5 -1
  66. vectordb_bench/metric.py +7 -0
  67. vectordb_bench/models.py +39 -4
  68. vectordb_bench/results/S3Vectors/result_20250722_standard_s3vectors.json +2509 -0
  69. vectordb_bench/results/getLeaderboardDataV2.py +23 -2
  70. vectordb_bench/results/leaderboard_v2.json +200 -0
  71. vectordb_bench/results/leaderboard_v2_streaming.json +128 -0
  72. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/METADATA +40 -8
  73. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/RECORD +77 -51
  74. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/WHEEL +0 -0
  75. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/entry_points.txt +0 -0
  76. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/licenses/LICENSE +0 -0
  77. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py (new file)
@@ -0,0 +1,582 @@
+ import logging
+ import time
+ from collections.abc import Iterable
+ from contextlib import contextmanager, suppress
+ from typing import Any, Final
+
+ from opensearchpy import OpenSearch
+
+ from vectordb_bench.backend.filter import Filter, FilterOp
+
+ from ..api import VectorDB
+ from .config import OSSOpenSearchIndexConfig, OSSOS_Engine
+
+ log = logging.getLogger(__name__)
+
+ WAITING_FOR_REFRESH_SEC: Final[int] = 30
+ WAITING_FOR_FORCE_MERGE_SEC: Final[int] = 30
+ SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC: Final[int] = 30
+
+
+ class OpenSearchError(Exception):
+     """Custom exception for OpenSearch operations."""
+
+
+ class OpenSearchSettingsManager:
+     """Manages OpenSearch cluster and index settings."""
+
+     def __init__(self, client: OpenSearch, index_name: str) -> None:
+         self.client = client
+         self.index_name = index_name
+
+     def apply_cluster_settings(self, settings: dict[str, Any], log_message: str = "Applied cluster settings") -> dict:
+         """Apply cluster-level settings."""
+         try:
+             response = self.client.cluster.put_settings(body={"persistent": settings})
+             log.info(log_message)
+         except Exception as e:
+             log.warning(f"Failed to apply cluster settings: {e}")
+             error_msg = f"Cluster settings application failed: {e}"
+             raise OpenSearchError(error_msg) from e
+         else:
+             return response
+
+     def apply_index_settings(self, settings: dict[str, Any], log_message: str = "Applied index settings") -> dict:
+         """Apply index-level settings."""
+         try:
+             response = self.client.indices.put_settings(index=self.index_name, body={"index": settings})
+             log.info(log_message)
+         except Exception as e:
+             log.warning(f"Failed to apply index settings: {e}")
+             error_msg = f"Index settings application failed: {e}"
+             raise OpenSearchError(error_msg) from e
+         else:
+             return response
+
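
For orientation, here is a minimal sketch of driving this helper directly; the request bodies match what `apply_cluster_settings` and `apply_index_settings` construct above, while the connection values and setting values are illustrative placeholders, not taken from this diff:

```python
from opensearchpy import OpenSearch

# Placeholder connection for illustration; real runs take it from db_config.
client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
manager = OpenSearchSettingsManager(client, "vdb_bench_index")

# Becomes PUT _cluster/settings with body {"persistent": {...}}
manager.apply_cluster_settings({"knn.algo_param.index_thread_qty": 4})

# Becomes PUT /vdb_bench_index/_settings with body {"index": {...}}
manager.apply_index_settings({"knn.algo_param.ef_search": 256})
```
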
+
+ class BulkInsertManager:
+     """Manages bulk insertion operations with chunking and parallelization."""
+
+     def __init__(self, client: OpenSearch, index_name: str, case_config: OSSOpenSearchIndexConfig) -> None:
+         self.client = client
+         self.index_name = index_name
+         self.case_config = case_config
+
+     def prepare_bulk_data(
+         self,
+         embeddings: list[list[float]],
+         metadata: list[int],
+         labels_data: list[str] | None,
+         id_col_name: str,
+         vector_col_name: str,
+         label_col_name: str,
+         with_scalar_labels: bool,
+     ) -> list[dict[str, Any]]:
+         """Prepare bulk actions for OpenSearch bulk insert."""
+         if len(embeddings) != len(metadata):
+             error_msg = f"Embeddings ({len(embeddings)}) and metadata ({len(metadata)}) length mismatch"
+             raise ValueError(error_msg)
+
+         if with_scalar_labels and labels_data and len(labels_data) != len(embeddings):
+             error_msg = f"Labels data ({len(labels_data)}) and embeddings ({len(embeddings)}) length mismatch"
+             raise ValueError(error_msg)
+
+         insert_data: list[dict[str, Any]] = []
+         for i in range(len(embeddings)):
+             index_data = {"index": {"_index": self.index_name, id_col_name: metadata[i]}}
+             if with_scalar_labels and self.case_config.use_routing and labels_data:
+                 # The routing key belongs inside the bulk action metadata.
+                 index_data["index"]["routing"] = labels_data[i]
+             insert_data.append(index_data)
+
+             other_data = {vector_col_name: embeddings[i]}
+             if with_scalar_labels and labels_data:
+                 other_data[label_col_name] = labels_data[i]
+             insert_data.append(other_data)
+         return insert_data
+
+     def execute_single_client_insert(self, insert_data: list[dict[str, Any]]) -> tuple[int, Exception | None]:
+         """Execute bulk insert with single client and retry logic."""
+         try:
+             response = self.client.bulk(body=insert_data)
+             if response.get("errors"):
+                 log.warning(f"Bulk insert had errors: {response}")
+             return len(insert_data) // 2, None
+         except Exception as e:
+             # Naive retry: wait, then re-attempt the same batch.
+             log.warning(f"Failed to insert data: {self.index_name} error: {e!s}")
+             time.sleep(10)
+             return self.execute_single_client_insert(insert_data)
+
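
`prepare_bulk_data` interleaves OpenSearch bulk action lines with document source lines, two list entries per vector. A sketch of one such pair with the default column names (values illustrative):

```python
pair = [
    # Action line: bulk metadata, with the id and, when routing is enabled, the routing key.
    {"index": {"_index": "vdb_bench_index", "_id": 0, "routing": "label_1"}},
    # Source line: the document itself, i.e. the vector plus the optional label field.
    {"embedding": [0.12, 0.34, 0.56], "label": "label_1"},
]
```
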
+
+ class SearchQueryBuilder:
+     """Builds OpenSearch KNN queries with proper configuration."""
+
+     def __init__(self, case_config: OSSOpenSearchIndexConfig, vector_col_name: str) -> None:
+         self.case_config = case_config
+         self.vector_col_name = vector_col_name
+
+     def build_knn_query(
+         self, query_vector: list[float], k: int, filter_clause: dict[str, Any] | None = None
+     ) -> dict[str, Any]:
+         """Build a KNN query with optional filtering."""
+         knn_config: dict[str, Any] = {
+             "vector": query_vector,
+             "k": k,
+             "method_parameters": self.case_config.search_param(),
+         }
+
+         if filter_clause:
+             knn_config["filter"] = filter_clause
+
+         if self.case_config.use_quant:
+             knn_config["rescore"] = {"oversample_factor": self.case_config.oversample_factor}
+
+         return {"size": k, "query": {"knn": {self.vector_col_name: knn_config}}}
+
+     def build_search_kwargs(
+         self, index_name: str, body: dict[str, Any], k: int, id_col_name: str, routing_key: str | None = None
+     ) -> dict[str, Any]:
+         """Build search kwargs with proper field selection."""
+         search_kwargs: dict[str, Any] = {
+             "index": index_name,
+             "body": body,
+             "size": k,
+             "_source": False,
+             "preference": "_only_local" if self.case_config.number_of_shards == 1 else None,
+             "routing": routing_key,
+         }
+
+         if id_col_name == "_id":
+             search_kwargs["stored_fields"] = "_id"
+         else:
+             search_kwargs["docvalue_fields"] = [id_col_name]
+             search_kwargs["stored_fields"] = "_none_"
+
+         return search_kwargs
+
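
Given the builder above, a k = 10 search with a label filter and quantization rescoring yields a body of roughly this shape (the concrete `method_parameters` and `oversample_factor` come from the case config; the values here are illustrative):

```python
body = {
    "size": 10,
    "query": {
        "knn": {
            "embedding": {
                "vector": [0.1, 0.2, 0.3],                 # query embedding
                "k": 10,
                "method_parameters": {"ef_search": 256},   # case_config.search_param()
                "filter": {"term": {"label": "label_1"}},  # only when a filter is prepared
                "rescore": {"oversample_factor": 2.0},     # only when use_quant is set
            }
        }
    },
}
```
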
+
+ class OSSOpenSearch(VectorDB):
+     """OpenSearch client implementation for VectorDBBench."""
+
+     supported_filter_types: list[FilterOp] = [
+         FilterOp.NonFilter,
+         FilterOp.NumGE,
+         FilterOp.StrEqual,
+     ]
+
+     def __init__(
+         self,
+         dim: int,
+         db_config: dict[str, Any],
+         db_case_config: OSSOpenSearchIndexConfig,
+         index_name: str = "vdb_bench_index",  # must be lowercase
+         id_col_name: str = "_id",
+         label_col_name: str = "label",
+         vector_col_name: str = "embedding",
+         drop_old: bool = False,
+         with_scalar_labels: bool = False,
+         **kwargs: Any,
+     ) -> None:
+         """Initialize the OpenSearch client."""
+         self.dim = dim
+         self.db_config = db_config
+         self.case_config = db_case_config
+         self.index_name = index_name
+         self.id_col_name = id_col_name
+         self.label_col_name = label_col_name
+         self.vector_col_name = vector_col_name
+         self.with_scalar_labels = with_scalar_labels
+
+         # Initialize client state
+         self.client: OpenSearch | None = None
+         self.filter: dict[str, Any] | None = None
+         self.routing_key: str | None = None
+
+         log.info(f"OSS_OpenSearch client config: {self.db_config}")
+         log.info(f"OSS_OpenSearch db case config: {self.case_config}")
+         client = OpenSearch(**self.db_config)
+         self._handle_index_initialization(client, drop_old)
+
+     def _handle_index_initialization(self, client: OpenSearch, drop_old: bool) -> None:
+         """Check, drop, create index, and perform post-creation setup."""
+         if drop_old:
+             log.info(f"OSS_OpenSearch client drop old index: {self.index_name}")
+             is_existed = client.indices.exists(index=self.index_name)
+             if is_existed:
+                 client.indices.delete(index=self.index_name)
+             self._create_index(client)
+         else:
+             is_existed = client.indices.exists(index=self.index_name)
+             if not is_existed:
+                 self._create_index(client)
+                 log.info(f"OSS_OpenSearch client create index: {self.index_name}")
+             self._update_ef_search_before_search(client)
+             self._load_graphs_to_memory(client)
+
+     def need_normalize_cosine(self) -> bool:
+         """Whether this database needs to normalize dataset to support COSINE metric."""
+         return True
+
+     def _get_settings_manager(self, client: OpenSearch) -> OpenSearchSettingsManager:
+         """Get settings manager for the given client."""
+         return OpenSearchSettingsManager(client, self.index_name)
+
+     def _get_bulk_manager(self, client: OpenSearch) -> BulkInsertManager:
+         """Get bulk insert manager for the given client."""
+         return BulkInsertManager(client, self.index_name, self.case_config)
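
A minimal construction sketch, assuming a local single-node cluster; the connection values are placeholders and the case-config constructor arguments live in the sibling `config.py`, which is not shown here:

```python
from vectordb_bench.backend.clients.oss_opensearch.oss_opensearch import OSSOpenSearch

db = OSSOpenSearch(
    dim=768,
    db_config={"hosts": [{"host": "localhost", "port": 9200}]},  # placeholder connection
    db_case_config=case_config,  # an OSSOpenSearchIndexConfig built from config.py
    drop_old=True,               # drop and recreate the index
)
```
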
+
+     def _create_index(self, client: OpenSearch) -> None:
+         ef_search_value = self.case_config.efSearch
+         log.info(f"Creating index with ef_search: {ef_search_value}")
+         log.info(f"Creating index with number_of_replicas: {self.case_config.number_of_replicas}")
+         log.info(f"Creating index with engine: {self.case_config.engine}")
+         log.info(f"Creating index with metric type: {self.case_config.metric_type_name}")
+         log.info(f"All case_config parameters: {self.case_config.__dict__}")
+
+         settings_manager = self._get_settings_manager(client)
+         cluster_settings = {
+             "knn.algo_param.index_thread_qty": self.case_config.index_thread_qty,
+             "knn.memory.circuit_breaker.limit": self.case_config.cb_threshold,
+         }
+         settings_manager.apply_cluster_settings(
+             cluster_settings, "Successfully updated cluster settings for index creation"
+         )
+         settings = {
+             "index": {
+                 "knn": True,
+                 "number_of_shards": self.case_config.number_of_shards,
+                 "number_of_replicas": self.case_config.number_of_replicas,
+                 "translog.flush_threshold_size": self.case_config.flush_threshold_size,
+                 "knn.advanced.approximate_threshold": "-1",
+             },
+             "refresh_interval": self.case_config.refresh_interval,
+         }
+         settings["index"]["knn.algo_param.ef_search"] = ef_search_value
+         # Build properties mapping, excluding _id which is automatically handled by OpenSearch
+         properties = {}
+
+         # Only add id field to properties if it's not the special _id field
+         if self.id_col_name != "_id":
+             properties[self.id_col_name] = {"type": "integer", "store": True}
+
+         properties[self.label_col_name] = {"type": "keyword"}
+         properties[self.vector_col_name] = {
+             "type": "knn_vector",
+             "dimension": self.dim,
+             "method": self.case_config.index_param(),
+         }
+
+         mappings = {
+             "properties": properties,
+         }
+         try:
+             log.info(f"Creating index with settings: {settings}")
+             log.info(f"Creating index with mappings: {mappings}")
+             client.indices.create(
+                 index=self.index_name,
+                 body={"settings": settings, "mappings": mappings},
+             )
+         except Exception as e:
+             log.warning(f"Failed to create index: {self.index_name} error: {e!s}")
+             raise e from None
+
+     @contextmanager
+     def init(self) -> None:
+         """Connect to OpenSearch"""
+         self.client = OpenSearch(**self.db_config)
+
+         try:
+             yield
+         finally:
+             # Release the connection reference when the context exits, even on error.
+             self.client = None
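
As with the other client implementations, per-process work is expected to be wrapped in `init()` so each worker opens and releases its own connection; a usage sketch continuing the construction example above:

```python
with db.init():
    db.insert_embeddings(embeddings=[[0.1] * 768], metadata=[0])
    ids = db.search_embedding(query=[0.1] * 768, k=10)
```
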
+
+     def _prepare_bulk_data(
+         self,
+         embeddings: Iterable[list[float]],
+         metadata: list[int],
+         labels_data: list[str] | None = None,
+     ) -> list[dict]:
+         """Prepare the list of bulk actions for OpenSearch bulk insert."""
+         bulk_manager = self._get_bulk_manager(self.client)
+         return bulk_manager.prepare_bulk_data(
+             list(embeddings),
+             metadata,
+             labels_data,
+             self.id_col_name,
+             self.vector_col_name,
+             self.label_col_name,
+             self.with_scalar_labels,
+         )
+
+     def insert_embeddings(
+         self,
+         embeddings: Iterable[list[float]],
+         metadata: list[int],
+         labels_data: list[str] | None = None,
+         **kwargs: Any,
+     ) -> tuple[int, Exception | None]:
+         """Insert embeddings into the OpenSearch index."""
+         assert self.client is not None, "should self.init() first"
+
+         num_clients = self.case_config.number_of_indexing_clients or 1
+         log.info(f"Number of indexing clients from case_config: {num_clients}")
+
+         if num_clients <= 1:
+             log.info("Using single client for data insertion")
+             return self._insert_with_single_client(embeddings, metadata, labels_data)
+         log.info(f"Using {num_clients} parallel clients for data insertion")
+         return self._insert_with_multiple_clients(embeddings, metadata, num_clients, labels_data)
+
+     def _insert_with_single_client(
+         self,
+         embeddings: Iterable[list[float]],
+         metadata: list[int],
+         labels_data: list[str] | None = None,
+     ) -> tuple[int, Exception | None]:
+         """Insert data using a single client with retry logic."""
+         insert_data = self._prepare_bulk_data(embeddings, metadata, labels_data)
+         bulk_manager = self._get_bulk_manager(self.client)
+         return bulk_manager.execute_single_client_insert(insert_data)
+
+     def _insert_with_multiple_clients(
+         self,
+         embeddings: Iterable[list[float]],
+         metadata: list[int],
+         num_clients: int,
+         labels_data: list[str] | None = None,
+     ) -> tuple[int, Exception | None]:
+         """Insert data using multiple parallel clients for better performance."""
+         import concurrent.futures
+         from concurrent.futures import ThreadPoolExecutor
+
+         embeddings_list = list(embeddings)
+         chunk_size = max(1, len(embeddings_list) // num_clients)
+         chunks = []
+
+         for i in range(0, len(embeddings_list), chunk_size):
+             end = min(i + chunk_size, len(embeddings_list))
+             # Guard the slice: labels_data may be None when the case has no scalar labels.
+             chunk_labels = labels_data[i:end] if labels_data else None
+             chunks.append((embeddings_list[i:end], metadata[i:end], chunk_labels))
+         clients = [OpenSearch(**self.db_config) for _ in range(min(num_clients, len(chunks)))]
+         log.info(f"OSS_OpenSearch using {len(clients)} parallel clients for data insertion")
+
+         def insert_chunk(client_idx: int, chunk_idx: int):
+             chunk_embeddings, chunk_metadata, chunk_labels_data = chunks[chunk_idx]
+             client = clients[client_idx]
+             insert_data = self._prepare_bulk_data(chunk_embeddings, chunk_metadata, chunk_labels_data)
+             try:
+                 response = client.bulk(body=insert_data)
+                 log.info(f"Client {client_idx} added {len(response['items'])} documents")
+                 return len(chunk_embeddings), None
+             except Exception as e:
+                 log.warning(f"Client {client_idx} failed to insert data: {e!s}")
+                 return 0, e
+
+         results = []
+         with ThreadPoolExecutor(max_workers=len(clients)) as executor:
+             futures = [
+                 executor.submit(insert_chunk, chunk_idx % len(clients), chunk_idx) for chunk_idx in range(len(chunks))
+             ]
+             for future in concurrent.futures.as_completed(futures):
+                 count, error = future.result()
+                 results.append((count, error))
+
+         for client in clients:
+             with suppress(Exception):
+                 client.close()
+
+         total_count = sum(count for count, _ in results)
+         errors = [error for _, error in results if error is not None]
+
+         if errors:
+             log.warning("Some clients failed to insert data, retrying with single client")
+             time.sleep(10)
+             # Retry from the materialized list: the original iterable may already be consumed.
+             return self._insert_with_single_client(embeddings_list, metadata, labels_data)
+
+         response = self.client.indices.stats(index=self.index_name)
+         log.info(
+             f"""Total document count in index after parallel insertion:
+             {response['_all']['primaries']['indexing']['index_total']}""",
+         )
+
+         return (total_count, None)
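
The chunking above divides the batch evenly and assigns chunks to clients round-robin (`chunk_idx % len(clients)`). For example, 100,000 vectors with 4 indexing clients gives a `chunk_size` of 25,000 and exactly 4 chunks; a standalone sketch of the same arithmetic:

```python
embeddings_list = list(range(100_000))  # stand-ins for the vectors
num_clients = 4
chunk_size = max(1, len(embeddings_list) // num_clients)
chunks = [embeddings_list[i : i + chunk_size] for i in range(0, len(embeddings_list), chunk_size)]
assert len(chunks) == num_clients and all(len(c) == 25_000 for c in chunks)
```
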
+
+     def _update_ef_search_before_search(self, client: OpenSearch):
+         ef_search_value = self.case_config.efSearch
+
+         try:
+             index_settings = client.indices.get_settings(index=self.index_name)
+             current_ef_search = (
+                 index_settings.get(self.index_name, {})
+                 .get("settings", {})
+                 .get("index", {})
+                 .get("knn.algo_param", {})
+                 .get("ef_search")
+             )
+
+             if current_ef_search != str(ef_search_value):
+                 settings_manager = self._get_settings_manager(client)
+                 log_message = f"Successfully updated ef_search to {ef_search_value} before search"
+                 settings_manager.apply_index_settings({"knn.algo_param.ef_search": ef_search_value}, log_message)
+                 log.info(f"Current engine: {self.case_config.engine}")
+                 log.info(f"Current metric_type: {self.case_config.metric_type_name}")
+
+         except Exception as e:
+             log.warning(f"Failed to update ef_search parameter before search: {e}")
+
+     def search_embedding(
+         self,
+         query: list[float],
+         k: int = 100,
+         filters: Filter | None = None,
+         **kwargs,
+     ) -> list[int]:
+         """Get k most similar embeddings to query vector.
+
+         Args:
+             query(list[float]): query embedding to look up documents similar to.
+             k(int): Number of most similar embeddings to return. Defaults to 100.
+             filters(Filter, optional): filtering expression to filter the data while searching.
+
+         Returns:
+             list[int]: list of k most similar ids to the query embedding.
+         """
+         assert self.client is not None, "should self.init() first"
+
+         search_query_builder = SearchQueryBuilder(self.case_config, self.vector_col_name)
+         body = search_query_builder.build_knn_query(query, k, self.filter)
+
+         try:
+             search_kwargs = search_query_builder.build_search_kwargs(
+                 self.index_name, body, k, self.id_col_name, self.routing_key
+             )
+             response = self.client.search(**search_kwargs)
+
+             log.debug(f"Search took: {response['took']}")
+             log.debug(f"Search shards: {response['_shards']}")
+             log.debug(f"Search hits total: {response['hits']['total']}")
+             try:
+                 if self.id_col_name == "_id":
+                     # Get _id directly from hit metadata
+                     result_ids = []
+                     for h in response["hits"]["hits"]:
+                         if (doc_id := h.get("_id")) is not None:
+                             result_ids.append(int(doc_id))
+                         else:
+                             log.warning(f"Hit missing _id in final extraction: {h}")
+                 else:
+                     # Get custom id field from docvalue fields
+                     result_ids = [int(h["fields"][self.id_col_name][0]) for h in response["hits"]["hits"]]
+
+             except Exception:
+                 # empty results
+                 return []
+             else:
+                 return result_ids
+         except Exception as e:
+             log.warning(f"Failed to search: {self.index_name} error: {e!s}")
+             raise e from None
+
+     def prepare_filter(self, filters: Filter) -> None:
+         """Prepare filter conditions for search operations."""
+         self.routing_key = None
+         if filters.type == FilterOp.NonFilter:
+             self.filter = None
+         elif filters.type == FilterOp.NumGE:
+             self.filter = {"range": {self.id_col_name: {"gt": filters.int_value}}}
+         elif filters.type == FilterOp.StrEqual:
+             self.filter = {"term": {self.label_col_name: filters.label_value}}
+             if self.case_config.use_routing:
+                 self.routing_key = filters.label_value
+         else:
+             msg = f"Filter type {filters.type} not supported for OpenSearch"
+             log.error(f"Unsupported filter type: {filters.type}")
+             raise ValueError(msg)
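
Concretely, the two supported filter types map onto standard OpenSearch query clauses; with the default column names from `__init__`, the prepared `self.filter` looks like this (note that `FilterOp.NumGE` is implemented with `gt` rather than `gte`):

```python
# FilterOp.NumGE with int_value=1000:
num_filter = {"range": {"_id": {"gt": 1000}}}

# FilterOp.StrEqual with label_value="label_1" (also sets routing_key when use_routing is on):
label_filter = {"term": {"label": "label_1"}}
```
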
+
+     def optimize(self, data_size: int | None = None) -> None:
+         """Optimize the index for better search performance."""
+         self._update_ef_search()
+         # Call refresh first to ensure that all segments are created
+         self._refresh_index()
+         if self.case_config.force_merge_enabled:
+             self._do_force_merge()
+             self._refresh_index()
+         self._update_replicas()
+         # Call refresh again to ensure that the index is ready after force merge.
+         self._refresh_index()
+         # ensure that all graphs are loaded in memory and ready for search
+         self._load_graphs_to_memory(self.client)
+
+     def _update_ef_search(self):
+         ef_search_value = self.case_config.efSearch
+         settings_manager = self._get_settings_manager(self.client)
+         log_message = f"Successfully updated ef_search to {ef_search_value}"
+         settings_manager.apply_index_settings({"knn.algo_param.ef_search": ef_search_value}, log_message)
+         log.info(f"Current engine: {self.case_config.engine}")
+         log.info(f"Current metric_type: {self.case_config.metric_type}")
+
+     def _update_replicas(self):
+         index_settings = self.client.indices.get_settings(index=self.index_name)
+         current_number_of_replicas = int(index_settings[self.index_name]["settings"]["index"]["number_of_replicas"])
+         log.info(
+             f"Current Number of replicas are {current_number_of_replicas}"
+             f" and changing the replicas to {self.case_config.number_of_replicas}"
+         )
+         settings_manager = self._get_settings_manager(self.client)
+         log_message = f"Successfully updated number of replicas to {self.case_config.number_of_replicas}"
+         settings_manager.apply_index_settings({"number_of_replicas": self.case_config.number_of_replicas}, log_message)
+         self._wait_till_green()
+
+     def _wait_till_green(self):
+         log.info("Wait for index to become green..")
+         while True:
+             res = self.client.cat.indices(index=self.index_name, h="health", format="json")
+             health = res[0]["health"]
+             if health == "green":
+                 break
+             log.info(f"The index {self.index_name} has health : {health} and is not green. Retrying")
+             time.sleep(SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC)
+         log.info(f"Index {self.index_name} is green..")
+
+     def _refresh_index(self):
+         log.debug(f"Starting refresh for index {self.index_name}")
+         while True:
+             try:
+                 log.info("Starting the Refresh Index..")
+                 self.client.indices.refresh(index=self.index_name)
+                 break
+             except Exception as e:
+                 log.info(
+                     f"Refresh errored out. Sleeping for {WAITING_FOR_REFRESH_SEC} sec and then Retrying : {e}",
+                 )
+                 time.sleep(WAITING_FOR_REFRESH_SEC)
+                 continue
+         log.debug(f"Completed refresh for index {self.index_name}")
+
+     def _do_force_merge(self):
+         log.info(f"Updating the Index thread qty to {self.case_config.index_thread_qty_during_force_merge}.")
+
+         settings_manager = self._get_settings_manager(self.client)
+         cluster_settings = {"knn.algo_param.index_thread_qty": self.case_config.index_thread_qty_during_force_merge}
+         log_message_cluster = (
+             f"Successfully updated cluster index thread qty to {self.case_config.index_thread_qty_during_force_merge}"
+         )
+         settings_manager.apply_cluster_settings(cluster_settings, log_message_cluster)
+         log.info("Updating the graph threshold to ensure that during merge we can do graph creation.")
+         log_message_index = "Successfully updated index approximate threshold to 0"
+         output = settings_manager.apply_index_settings({"knn.advanced.approximate_threshold": "0"}, log_message_index)
+         log.info(f"response of updating setting is: {output}")
+
+         log.info(f"Starting force merge for index {self.index_name}")
+         segments = self.case_config.number_of_segments
+         force_merge_endpoint = f"/{self.index_name}/_forcemerge?max_num_segments={segments}&wait_for_completion=false"
+         force_merge_task_id = self.client.transport.perform_request("POST", force_merge_endpoint)["task"]
+         while True:
+             time.sleep(WAITING_FOR_FORCE_MERGE_SEC)
+             task_status = self.client.tasks.get(task_id=force_merge_task_id)
+             if task_status["completed"]:
+                 break
+         log.info(f"Completed force merge for index {self.index_name}")
+
+     def _load_graphs_to_memory(self, client: OpenSearch):
+         if self.case_config.engine != OSSOS_Engine.lucene:
+             log.info("Calling warmup API to load graphs into memory")
+             warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
+             client.transport.perform_request("GET", warmup_endpoint)
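
Putting the pieces together, the intended lifecycle is construct, insert inside `init()`, then `optimize()` before searching. A sketch continuing the construction example above, where `vectors` stands in for a list of embeddings:

```python
with db.init():
    db.insert_embeddings(embeddings=vectors, metadata=list(range(len(vectors))))

with db.init():
    db.optimize(data_size=len(vectors))  # refresh -> force merge -> replicas -> warmup
    ids = db.search_embedding(query=vectors[0], k=100)
```
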