llama-index-vector-stores-opensearch 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llama-index-vector-stores-opensearch might be problematic.
- llama_index/vector_stores/opensearch/base.py +82 -29
- {llama_index_vector_stores_opensearch-0.3.0.dist-info → llama_index_vector_stores_opensearch-0.4.0.dist-info}/METADATA +1 -1
- llama_index_vector_stores_opensearch-0.4.0.dist-info/RECORD +6 -0
- llama_index_vector_stores_opensearch-0.3.0.dist-info/RECORD +0 -6
- {llama_index_vector_stores_opensearch-0.3.0.dist-info → llama_index_vector_stores_opensearch-0.4.0.dist-info}/WHEEL +0 -0
llama_index/vector_stores/opensearch/base.py

@@ -88,8 +88,9 @@ class OpensearchVectorClient:
         settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
         if embedding_field is None:
             embedding_field = "embedding"
-        self._embedding_field = embedding_field

+        self._method = method
+        self._embedding_field = embedding_field
         self._endpoint = endpoint
         self._dim = dim
         self._index = index
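
The `method` mapping stored on the client here is the kNN method definition that OpenSearch accepts in a vector field mapping; later in this diff only its `engine` key is consulted. A minimal illustrative sketch of such a mapping (the values are assumptions, not taken from this release):

# Illustrative kNN method definition; only the "engine" key is read by the new filtering logic.
method = {
    "name": "hnsw",
    "space_type": "l2",
    "engine": "faiss",
    "parameters": {"ef_construction": 256, "m": 48},
}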
@@ -119,6 +120,10 @@ class OpensearchVectorClient:
         self._os_async_client = self._get_async_opensearch_client(
             self._endpoint, **kwargs
         )
+        self._os_version = self._get_opensearch_version()
+        self._efficient_filtering_enabled = self._is_efficient_filtering_enabled(
+            self._os_version
+        )
         not_found_error = self._import_not_found_error()

         try:
@@ -192,6 +197,10 @@ class OpensearchVectorClient:
         )
         return client

+    def _get_opensearch_version(self) -> str:
+        info = self._os_client.info()
+        return info["version"]["number"]
+
     def _bulk_ingest_embeddings(
         self,
         client: Any,
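
The new helper reads the cluster version from the client's standard info call. A small sketch of the relevant shape of that response, assuming a hypothetical 2.11.0 cluster:

# Trimmed, illustrative shape of OpenSearch.info(); only version.number is used.
info = {
    "cluster_name": "opensearch",  # illustrative
    "version": {"number": "2.11.0", "distribution": "opensearch"},
}
os_version = info["version"]["number"]  # "2.11.0"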
@@ -298,14 +307,27 @@ class OpensearchVectorClient:
         self,
         query_vector: List[float],
         k: int = 4,
+        filters: Optional[Union[Dict, List]] = None,
         vector_field: str = "embedding",
     ) -> Dict:
         """For Approximate k-NN Search, this is the default query."""
-        return {
+        query = {
             "size": k,
-            "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
+            "query": {
+                "knn": {
+                    vector_field: {
+                        "vector": query_vector,
+                        "k": k,
+                    }
+                }
+            },
         }

+        if filters:
+            # filter key must be added only when filtering to avoid "filter doesn't support values of type: START_ARRAY" exception
+            query["query"]["knn"][vector_field]["filter"] = filters
+        return query
+
     def _is_text_field(self, value: Any) -> bool:
         """Check if value is a string and keyword filtering needs to be performed.

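
With the new `filters` argument, the filter clause is nested inside the `knn` clause itself (OpenSearch's efficient kNN filtering) instead of being applied as a separate query. A sketch of the body this method would build for k=4 with one term filter; the field name, vector, and filter values are illustrative:

# Illustrative query body produced when filters are supplied.
query = {
    "size": 4,
    "query": {
        "knn": {
            "embedding": {
                "vector": [0.1, 0.2, 0.3],  # query embedding (illustrative)
                "k": 4,
                # added only when filters are supplied
                "filter": {"bool": {"filter": [{"term": {"genre.keyword": "fiction"}}]}},
            }
        }
    },
}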
@@ -346,7 +368,12 @@ class OpensearchVectorClient:
                 }
             }
         elif op in [FilterOperator.IN, FilterOperator.ANY]:
-            return {"terms": {key: filter.value}}
+            if isinstance(filter.value, list) and all(
+                self._is_text_field(val) for val in filter.value
+            ):
+                return {"terms": {f"{key}.keyword": filter.value}}
+            else:
+                return {"terms": {key: filter.value}}
         elif op == FilterOperator.NIN:
             return {"bool": {"must_not": {"terms": {key: filter.value}}}}
         elif op == FilterOperator.ALL:
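
The IN/ANY branch now distinguishes lists of strings, which are matched against the `.keyword` sub-field for exact keyword comparison, from other value types. A short sketch of the two clauses it can emit; field names and values are illustrative:

# All values are strings -> exact match on the keyword sub-field
text_filter = {"terms": {"author.keyword": ["alice", "bob"]}}

# Non-string values (e.g. integers) -> terms query on the field itself
numeric_filter = {"terms": {"year": [2022, 2023]}}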
@@ -396,52 +423,73 @@ class OpensearchVectorClient:
         query_embedding: List[float],
         k: int,
         filters: Optional[MetadataFilters] = None,
+        search_method="approximate",
     ) -> Dict:
         """
-
+        Perform a k-Nearest Neighbors (kNN) search.

-        If
-
-
+        If the search method is "approximate" and the engine is "lucene" or "faiss", use efficient kNN filtering.
+        Otherwise, perform an exhaustive exact kNN search using "painless scripting" if the version of
+        OpenSearch supports it. If the OpenSearch version does not support it, use scoring script search.

         Note:
-            -AWS
-            -
+            - AWS OpenSearch Serverless does not support the painless scripting functionality at this time according to AWS.
+            - Approximate kNN search does not support pre-filtering.

         Args:
-            query_embedding: Vector embedding to query.
-            k: Maximum number of results.
-            filters: Optional filters to apply
+            query_embedding (List[float]): Vector embedding to query.
+            k (int): Maximum number of results.
+            filters (Optional[MetadataFilters]): Optional filters to apply for the search.
                Supports filter-context queries documented at
                https://opensearch.org/docs/latest/query-dsl/query-filter-context/

         Returns:
-            Up to k
+            Dict: Up to k documents closest to query_embedding.
         """
-        pre_filter = self._parse_filters(filters)
-        if not pre_filter:
+        filters = self._parse_filters(filters)
+
+        if not filters:
             search_query = self._default_approximate_search_query(
-                query_embedding, k, vector_field=embedding_field
-            )
-        elif self.is_aoss:
-            # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
-            # painless scripting so default scoring script returned will be just normal knn_score script
-            search_query = self._default_scoring_script_query(
                 query_embedding,
                 k,
-                space_type=self.space_type,
-                pre_filter={"bool": {"filter": pre_filter}},
                 vector_field=embedding_field,
             )
-
-
-
+        elif (
+            search_method == "approximate"
+            and self._method["engine"]
+            in [
+                "lucene",
+                "faiss",
+            ]
+            and self._efficient_filtering_enabled
+        ):
+            # if engine is lucene or faiss, opensearch recommends efficient-kNN filtering.
+            search_query = self._default_approximate_search_query(
                 query_embedding,
                 k,
-
-                pre_filter={"bool": {"filter": pre_filter}},
+                filters={"bool": {"filter": filters}},
                 vector_field=embedding_field,
             )
+        else:
+            if self.is_aoss:
+                # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
+                # painless scripting so default scoring script returned will be just normal knn_score script
+                search_query = self._default_scoring_script_query(
+                    query_embedding,
+                    k,
+                    space_type=self.space_type,
+                    pre_filter={"bool": {"filter": filters}},
+                    vector_field=embedding_field,
+                )
+            else:
+                # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+                search_query = self._default_scoring_script_query(
+                    query_embedding,
+                    k,
+                    space_type="l2Squared",
+                    pre_filter={"bool": {"filter": filters}},
+                    vector_field=embedding_field,
+                )
         return search_query

     def _hybrid_search_query(
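
For callers, the practical effect is that metadata-filtered queries against a lucene or faiss kNN index on a recent OpenSearch cluster go through efficient kNN filtering rather than a scoring-script fallback. A hedged usage sketch, assuming the constructor keyword arguments (`endpoint`, `index`, `dim`, `embedding_field`, `method`) behave as in earlier releases; the endpoint, index name, and dimensions are illustrative:

from llama_index.vector_stores.opensearch import (
    OpensearchVectorClient,
    OpensearchVectorStore,
)

# Illustrative configuration; values are not taken from this diff.
client = OpensearchVectorClient(
    endpoint="http://localhost:9200",
    index="docs",
    dim=1536,
    embedding_field="embedding",
    method={"name": "hnsw", "space_type": "l2", "engine": "faiss"},
)
vector_store = OpensearchVectorStore(client)
# Metadata-filtered queries issued through this store now build the
# efficient-filtering form of the approximate kNN query when the cluster supports it.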
@@ -566,6 +614,11 @@ class OpensearchVectorClient:
             return True
         return False

+    def _is_efficient_filtering_enabled(self, os_version: str) -> bool:
+        """Check if kNN with efficient filtering is enabled."""
+        major, minor, patch = os_version.split(".")
+        return int(major) >= 2 and int(minor) >= 9
+
     def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
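
The gate is a plain major/minor comparison against 2.9, the OpenSearch release that added efficient kNN filtering for the faiss engine. A standalone sketch of how the check evaluates on a few version strings:

def efficient_filtering_enabled(os_version: str) -> bool:
    # Same comparison as the helper added in this diff.
    major, minor, patch = os_version.split(".")
    return int(major) >= 2 and int(minor) >= 9

print(efficient_filtering_enabled("2.8.0"))   # False
print(efficient_filtering_enabled("2.9.0"))   # True
print(efficient_filtering_enabled("2.11.1"))  # True
print(efficient_filtering_enabled("3.0.0"))   # False under this exact check (minor < 9)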
llama_index_vector_stores_opensearch-0.4.0.dist-info/RECORD

@@ -0,0 +1,6 @@
+llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
+llama_index/vector_stores/opensearch/base.py,sha256=vd14RnvqTNIPjopWptkQEkf5gVp71pBB7G9X_4l4jwI,36854
+llama_index_vector_stores_opensearch-0.4.0.dist-info/METADATA,sha256=NecQMS0jaNnWSGZMAdeMauy3ehZYjhqc6EpnGn8PHx0,728
+llama_index_vector_stores_opensearch-0.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llama_index_vector_stores_opensearch-0.4.0.dist-info/RECORD,,
llama_index_vector_stores_opensearch-0.3.0.dist-info/RECORD

@@ -1,6 +0,0 @@
-llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
-llama_index/vector_stores/opensearch/base.py,sha256=Dh7ppOm3eXJ8PNbPop37yvyhO22J6pKZ_1y6CU8Rwvk,34760
-llama_index_vector_stores_opensearch-0.3.0.dist-info/METADATA,sha256=V2-zpbLlVE4x2FrKs2H0PNC_T9YRSTuB_SqGA72vDSo,728
-llama_index_vector_stores_opensearch-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-llama_index_vector_stores_opensearch-0.3.0.dist-info/RECORD,,

{llama_index_vector_stores_opensearch-0.3.0.dist-info → llama_index_vector_stores_opensearch-0.4.0.dist-info}/WHEEL

File without changes