llama-index-vector-stores-opensearch 0.3.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llama-index-vector-stores-opensearch might be problematic.
- {llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/PKG-INFO +1 -1
- {llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/llama_index/vector_stores/opensearch/base.py +86 -30
- {llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/pyproject.toml +1 -1
- {llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/README.md +0 -0
- {llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/llama_index/py.typed +0 -0
- {llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/llama_index/vector_stores/opensearch/__init__.py +0 -0
{llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/llama_index/vector_stores/opensearch/base.py

@@ -56,6 +56,8 @@ class OpensearchVectorClient:
         settings: Optional[dict]: Settings for the Opensearch index creation. Defaults to:
             {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
         space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2
+        os_client (Optional[OSClient]): Custom synchronous client (see OpenSearch from opensearch-py)
+        os_async_client (Optional[OSClient]): Custom asynchronous client (see AsyncOpenSearch from opensearch-py)
         **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.

     """
@@ -74,6 +76,7 @@ class OpensearchVectorClient:
         max_chunk_bytes: int = 1 * 1024 * 1024,
         search_pipeline: Optional[str] = None,
         os_client: Optional[OSClient] = None,
+        os_async_client: Optional[OSClient] = None,
         **kwargs: Any,
     ):
         """Init params."""
@@ -88,8 +91,9 @@ class OpensearchVectorClient:
             settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
         if embedding_field is None:
             embedding_field = "embedding"
-        self._embedding_field = embedding_field

+        self._method = method
+        self._embedding_field = embedding_field
         self._endpoint = endpoint
         self._dim = dim
         self._index = index
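The two hunks above add an os_async_client parameter alongside the existing os_client and store the kNN method config on the instance. As a quick orientation, here is a minimal usage sketch in Python; the endpoint, index name, dimension, and TLS flag are placeholders, and the positional endpoint/index/dim arguments follow the package's documented usage rather than anything shown in this diff.

# Sketch: passing custom sync and async opensearch-py clients (0.4.1).
# Requires opensearch-py with async support installed (opensearch-py[async]).
from opensearchpy import AsyncOpenSearch, OpenSearch

from llama_index.vector_stores.opensearch import (
    OpensearchVectorClient,
    OpensearchVectorStore,
)

endpoint = "https://localhost:9200"   # placeholder cluster URL
index_name = "llama-index-demo"       # placeholder index name
dim = 1536                            # must match your embedding model

sync_client = OpenSearch(hosts=[endpoint], verify_certs=False)
async_client = AsyncOpenSearch(hosts=[endpoint], verify_certs=False)

client = OpensearchVectorClient(
    endpoint,
    index_name,
    dim,
    embedding_field="embedding",
    text_field="content",
    os_client=sync_client,         # already supported in 0.3.0
    os_async_client=async_client,  # new in 0.4.1
)
vector_store = OpensearchVectorStore(client)

When neither client is passed, the constructor still builds both from the endpoint and **kwargs, as before.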
@@ -116,9 +120,13 @@ class OpensearchVectorClient:
         self._os_client = os_client or self._get_opensearch_client(
             self._endpoint, **kwargs
         )
-        self._os_async_client = self._get_async_opensearch_client(
+        self._os_async_client = os_async_client or self._get_async_opensearch_client(
             self._endpoint, **kwargs
         )
+        self._os_version = self._get_opensearch_version()
+        self._efficient_filtering_enabled = self._is_efficient_filtering_enabled(
+            self._os_version
+        )
         not_found_error = self._import_not_found_error()

         try:
@@ -192,6 +200,10 @@ class OpensearchVectorClient:
         )
         return client

+    def _get_opensearch_version(self) -> str:
+        info = self._os_client.info()
+        return info["version"]["number"]
+
     def _bulk_ingest_embeddings(
         self,
         client: Any,
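The new _get_opensearch_version helper reads the version string from the cluster info API of opensearch-py. A tiny sketch of that lookup against a placeholder endpoint:

from opensearchpy import OpenSearch

client = OpenSearch(hosts=["https://localhost:9200"], verify_certs=False)

info = client.info()                    # {"version": {"number": "2.11.1", ...}, ...}
os_version = info["version"]["number"]  # e.g. "2.11.1" on a 2.11.1 cluster
print(os_version)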
@@ -298,14 +310,27 @@ class OpensearchVectorClient:
         self,
         query_vector: List[float],
         k: int = 4,
+        filters: Optional[Union[Dict, List]] = None,
         vector_field: str = "embedding",
     ) -> Dict:
         """For Approximate k-NN Search, this is the default query."""
-        return {
+        query = {
             "size": k,
-            "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
+            "query": {
+                "knn": {
+                    vector_field: {
+                        "vector": query_vector,
+                        "k": k,
+                    }
+                }
+            },
         }

+        if filters:
+            # filter key must be added only when filtering to avoid "filter doesn't support values of type: START_ARRAY" exception
+            query["query"]["knn"][vector_field]["filter"] = filters
+        return query
+
     def _is_text_field(self, value: Any) -> bool:
         """Check if value is a string and keyword filtering needs to be performed.

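To make the new filters handling concrete, the sketch below rebuilds the same query body outside the class; the helper name and the example filter field are illustrative, not part of the package.

from typing import Dict, List, Optional, Union


def build_approximate_query(
    query_vector: List[float],
    k: int = 4,
    filters: Optional[Union[Dict, List]] = None,
    vector_field: str = "embedding",
) -> Dict:
    # Base approximate k-NN query, identical with or without filters.
    query = {
        "size": k,
        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
    }
    # The "filter" key is only added when filters are supplied, mirroring the
    # comment above about the START_ARRAY exception.
    if filters:
        query["query"]["knn"][vector_field]["filter"] = filters
    return query


# Unfiltered: no "filter" key at all.
print(build_approximate_query([0.1, 0.2, 0.3], k=2))

# Filtered: the bool filter is pushed inside the knn clause (efficient filtering).
print(
    build_approximate_query(
        [0.1, 0.2, 0.3],
        k=2,
        filters={"bool": {"filter": [{"term": {"metadata.author.keyword": "alice"}}]}},
    )
)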
@@ -346,7 +371,12 @@ class OpensearchVectorClient:
                 }
             }
         elif op in [FilterOperator.IN, FilterOperator.ANY]:
-            return {"terms": {key: filter.value}}
+            if isinstance(filter.value, list) and all(
+                self._is_text_field(val) for val in filter.value
+            ):
+                return {"terms": {f"{key}.keyword": filter.value}}
+            else:
+                return {"terms": {key: filter.value}}
         elif op == FilterOperator.NIN:
             return {"bool": {"must_not": {"terms": {key: filter.value}}}}
         elif op == FilterOperator.ALL:
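A standalone sketch of the new IN/ANY behaviour: lists of strings are routed to the .keyword sub-field so exact term matching works on text-mapped metadata, while non-string lists keep the plain field name. The helper and field names are illustrative only.

from typing import Any, Dict, List


def is_text_field(value: Any) -> bool:
    # Same idea as the client's _is_text_field check: strings need keyword matching.
    return isinstance(value, str)


def terms_filter(key: str, values: List[Any]) -> Dict:
    if all(is_text_field(v) for v in values):
        return {"terms": {f"{key}.keyword": values}}
    return {"terms": {key: values}}


print(terms_filter("metadata.author", ["alice", "bob"]))
# {'terms': {'metadata.author.keyword': ['alice', 'bob']}}

print(terms_filter("metadata.year", [2022, 2023]))
# {'terms': {'metadata.year': [2022, 2023]}}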
@@ -396,52 +426,73 @@ class OpensearchVectorClient:
         query_embedding: List[float],
         k: int,
         filters: Optional[MetadataFilters] = None,
+        search_method="approximate",
     ) -> Dict:
         """
-
+        Perform a k-Nearest Neighbors (kNN) search.

-        If
-
-
+        If the search method is "approximate" and the engine is "lucene" or "faiss", use efficient kNN filtering.
+        Otherwise, perform an exhaustive exact kNN search using "painless scripting" if the version of
+        OpenSearch supports it. If the OpenSearch version does not support it, use scoring script search.

         Note:
-            -AWS
-            -
+            - AWS OpenSearch Serverless does not support the painless scripting functionality at this time according to AWS.
+            - Approximate kNN search does not support pre-filtering.

         Args:
-            query_embedding: Vector embedding to query.
-            k: Maximum number of results.
-            filters: Optional filters to apply
+            query_embedding (List[float]): Vector embedding to query.
+            k (int): Maximum number of results.
+            filters (Optional[MetadataFilters]): Optional filters to apply for the search.
                 Supports filter-context queries documented at
                 https://opensearch.org/docs/latest/query-dsl/query-filter-context/

         Returns:
-            Up to k
+            Dict: Up to k documents closest to query_embedding.
         """
-        pre_filter = self._parse_filters(filters)
-        if not pre_filter:
+        filters = self._parse_filters(filters)
+
+        if not filters:
             search_query = self._default_approximate_search_query(
-                query_embedding, k, vector_field=embedding_field
-            )
-        elif self.is_aoss:
-            # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
-            # painless scripting so default scoring script returned will be just normal knn_score script
-            search_query = self._default_scoring_script_query(
                 query_embedding,
                 k,
-                space_type=self.space_type,
-                pre_filter={"bool": {"filter": pre_filter}},
                 vector_field=embedding_field,
             )
-
-
-
+        elif (
+            search_method == "approximate"
+            and self._method["engine"]
+            in [
+                "lucene",
+                "faiss",
+            ]
+            and self._efficient_filtering_enabled
+        ):
+            # if engine is lucene or faiss, opensearch recommends efficient-kNN filtering.
+            search_query = self._default_approximate_search_query(
                 query_embedding,
                 k,
-
-                pre_filter={"bool": {"filter": pre_filter}},
+                filters={"bool": {"filter": filters}},
                 vector_field=embedding_field,
             )
+        else:
+            if self.is_aoss:
+                # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
+                # painless scripting so default scoring script returned will be just normal knn_score script
+                search_query = self._default_scoring_script_query(
+                    query_embedding,
+                    k,
+                    space_type=self.space_type,
+                    pre_filter={"bool": {"filter": filters}},
+                    vector_field=embedding_field,
+                )
+            else:
+                # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+                search_query = self._default_scoring_script_query(
+                    query_embedding,
+                    k,
+                    space_type="l2Squared",
+                    pre_filter={"bool": {"filter": filters}},
+                    vector_field=embedding_field,
+                )
         return search_query

     def _hybrid_search_query(
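The branching above reduces to a small decision table. The sketch below restates it as a standalone helper (names are illustrative) so the three search paths are easier to see.

from typing import Optional


def pick_search_path(
    has_filters: bool,
    engine: Optional[str],
    efficient_filtering_enabled: bool,
    is_aoss: bool,
    search_method: str = "approximate",
) -> str:
    if not has_filters:
        return "approximate k-NN, no filter"
    if (
        search_method == "approximate"
        and engine in ("lucene", "faiss")
        and efficient_filtering_enabled
    ):
        return "approximate k-NN with efficient filtering"
    if is_aoss:
        # Serverless cannot run painless scripting, so the scoring script
        # query falls back to a plain knn_score script with a pre-filter.
        return "scoring script (knn_score) with pre-filter"
    return "painless scripting (l2Squared) with pre-filter"


print(pick_search_path(False, "nmslib", False, False))  # approximate k-NN, no filter
print(pick_search_path(True, "faiss", True, False))     # efficient filtering
print(pick_search_path(True, "nmslib", False, True))    # scoring script on serverless
print(pick_search_path(True, "nmslib", False, False))   # painless scripting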
@@ -566,6 +617,11 @@ class OpensearchVectorClient:
             return True
         return False

+    def _is_efficient_filtering_enabled(self, os_version: str) -> bool:
+        """Check if kNN with efficient filtering is enabled."""
+        major, minor, patch = os_version.split(".")
+        return int(major) >= 2 and int(minor) >= 9
+
     def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
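Evaluating the version gate on a few sample version strings shows where the cut-over lands; the copy below is standalone for illustration.

def is_efficient_filtering_enabled(os_version: str) -> bool:
    # Same check as the new class method, split out for quick experimentation.
    major, minor, _patch = os_version.split(".")
    return int(major) >= 2 and int(minor) >= 9


for version in ("2.8.0", "2.9.0", "2.11.1"):
    print(version, is_efficient_filtering_enabled(version))
# 2.8.0 False
# 2.9.0 True
# 2.11.1 True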
{llama_index_vector_stores_opensearch-0.3.0 → llama_index_vector_stores_opensearch-0.4.1}/README.md RENAMED
File without changes