llama-index-vector-stores-opensearch 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llama-index-vector-stores-opensearch might be problematic. Click here for more details.
- {llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/PKG-INFO +1 -1
- {llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/llama_index/vector_stores/opensearch/base.py +61 -15
- {llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/pyproject.toml +1 -1
- {llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/README.md +0 -0
- {llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/llama_index/py.typed +0 -0
- {llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/llama_index/vector_stores/opensearch/__init__.py +0 -0
|
@@ -56,6 +56,7 @@ class OpensearchVectorClient:
|
|
|
56
56
|
This includes engine, metric, and other config params. Defaults to:
|
|
57
57
|
{"name": "hnsw", "space_type": "l2", "engine": "faiss",
|
|
58
58
|
"parameters": {"ef_construction": 256, "m": 48}}
|
|
59
|
+
space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2
|
|
59
60
|
**kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.
|
|
60
61
|
|
|
61
62
|
"""
|
|
@@ -69,6 +70,7 @@ class OpensearchVectorClient:
|
|
|
69
70
|
text_field: str = "content",
|
|
70
71
|
method: Optional[dict] = None,
|
|
71
72
|
engine: Optional[str] = "nmslib",
|
|
73
|
+
space_type: Optional[str] = "l2",
|
|
72
74
|
max_chunk_bytes: int = 1 * 1024 * 1024,
|
|
73
75
|
search_pipeline: Optional[str] = None,
|
|
74
76
|
os_client: Optional[OSClient] = None,
|
|
@@ -94,6 +96,7 @@ class OpensearchVectorClient:
|
|
|
94
96
|
|
|
95
97
|
self._search_pipeline = search_pipeline
|
|
96
98
|
http_auth = kwargs.get("http_auth")
|
|
99
|
+
self.space_type = space_type
|
|
97
100
|
self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
|
|
98
101
|
# initialize mapping
|
|
99
102
|
idx_conf = {
|
|
@@ -309,9 +312,11 @@ class OpensearchVectorClient:
|
|
|
309
312
|
|
|
310
313
|
If there are no filters do approx-knn search.
|
|
311
314
|
If there are (pre)-filters, do an exhaustive exact knn search using 'painless
|
|
312
|
-
scripting'.
|
|
315
|
+
scripting' if the version of Opensearch supports it, otherwise uses knn_score scripting score.
|
|
313
316
|
|
|
314
|
-
Note
|
|
317
|
+
Note:
|
|
318
|
+
-AWS Opensearch Serverless does not support the painless scripting functionality at this time according to AWS.
|
|
319
|
+
-Also note that approximate knn search does not support pre-filtering.
|
|
315
320
|
|
|
316
321
|
Args:
|
|
317
322
|
query_embedding: Vector embedding to query.
|
|
@@ -328,16 +333,25 @@ class OpensearchVectorClient:
|
|
|
328
333
|
search_query = self._default_approximate_search_query(
|
|
329
334
|
query_embedding, k, vector_field=embedding_field
|
|
330
335
|
)
|
|
336
|
+
elif self.is_aoss:
|
|
337
|
+
# if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
|
|
338
|
+
# painless scripting so default scoring script returned will be just normal knn_score script
|
|
339
|
+
search_query = self._default_scoring_script_query(
|
|
340
|
+
query_embedding,
|
|
341
|
+
k,
|
|
342
|
+
space_type=self.space_type,
|
|
343
|
+
pre_filter={"bool": {"filter": pre_filter}},
|
|
344
|
+
vector_field=embedding_field,
|
|
345
|
+
)
|
|
331
346
|
else:
|
|
332
347
|
# https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
|
|
333
|
-
search_query = self._default_painless_scripting_query(
|
|
348
|
+
search_query = self._default_scoring_script_query(
|
|
334
349
|
query_embedding,
|
|
335
350
|
k,
|
|
336
351
|
space_type="l2Squared",
|
|
337
352
|
pre_filter={"bool": {"filter": pre_filter}},
|
|
338
353
|
vector_field=embedding_field,
|
|
339
354
|
)
|
|
340
|
-
|
|
341
355
|
return search_query
|
|
342
356
|
|
|
343
357
|
def _hybrid_search_query(
|
|
@@ -382,7 +396,9 @@ class OpensearchVectorClient:
|
|
|
382
396
|
def __get_painless_scripting_source(
|
|
383
397
|
self, space_type: str, vector_field: str = "embedding"
|
|
384
398
|
) -> str:
|
|
385
|
-
"""For Painless Scripting, it returns the script source based on space type.
|
|
399
|
+
"""For Painless Scripting, it returns the script source based on space type.
|
|
400
|
+
This does not work with Opensearch Serverless currently.
|
|
401
|
+
"""
|
|
386
402
|
source_value = (
|
|
387
403
|
f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
|
|
388
404
|
)
|
|
@@ -391,7 +407,29 @@ class OpensearchVectorClient:
|
|
|
391
407
|
else:
|
|
392
408
|
return f"1/{source_value}"
|
|
393
409
|
|
|
394
|
-
def _default_painless_scripting_query(
|
|
410
|
+
def _get_knn_scoring_script(self, space_type, vector_field, query_vector):
|
|
411
|
+
"""Default scoring script that will work with AWS Opensearch Serverless."""
|
|
412
|
+
return {
|
|
413
|
+
"source": "knn_score",
|
|
414
|
+
"lang": "knn",
|
|
415
|
+
"params": {
|
|
416
|
+
"field": vector_field,
|
|
417
|
+
"query_value": query_vector,
|
|
418
|
+
"space_type": space_type,
|
|
419
|
+
},
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
def _get_painless_scoring_script(self, space_type, vector_field, query_vector):
|
|
423
|
+
source = self.__get_painless_scripting_source(space_type, vector_field)
|
|
424
|
+
return {
|
|
425
|
+
"source": source,
|
|
426
|
+
"params": {
|
|
427
|
+
"field": vector_field,
|
|
428
|
+
"query_value": query_vector,
|
|
429
|
+
},
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
def _default_scoring_script_query(
|
|
395
433
|
self,
|
|
396
434
|
query_vector: List[float],
|
|
397
435
|
k: int = 4,
|
|
@@ -399,23 +437,31 @@ class OpensearchVectorClient:
|
|
|
399
437
|
pre_filter: Optional[Union[Dict, List]] = None,
|
|
400
438
|
vector_field: str = "embedding",
|
|
401
439
|
) -> Dict:
|
|
402
|
-
"""For Painless Scripting Search, this is the default query."""
|
|
440
|
+
"""For Scoring Script Search, this is the default query. Has to account for Opensearch Service
|
|
441
|
+
Serverless which does not support painless scripting functions so defaults to knn_score.
|
|
442
|
+
"""
|
|
403
443
|
if not pre_filter:
|
|
404
444
|
pre_filter = MATCH_ALL_QUERY
|
|
405
445
|
|
|
406
|
-
|
|
446
|
+
# check if we can use painless scripting or have to use default knn_score script
|
|
447
|
+
if self.is_aoss:
|
|
448
|
+
if space_type == "l2Squared":
|
|
449
|
+
raise ValueError(
|
|
450
|
+
"Unsupported space type for aoss. Can only use l1, l2, cosinesimil."
|
|
451
|
+
)
|
|
452
|
+
script = self._get_knn_scoring_script(
|
|
453
|
+
space_type, vector_field, query_vector
|
|
454
|
+
)
|
|
455
|
+
else:
|
|
456
|
+
script = self._get_painless_scoring_script(
|
|
457
|
+
space_type, vector_field, query_vector
|
|
458
|
+
)
|
|
407
459
|
return {
|
|
408
460
|
"size": k,
|
|
409
461
|
"query": {
|
|
410
462
|
"script_score": {
|
|
411
463
|
"query": pre_filter,
|
|
412
|
-
"script": {
|
|
413
|
-
"source": source,
|
|
414
|
-
"params": {
|
|
415
|
-
"field": vector_field,
|
|
416
|
-
"query_value": query_vector,
|
|
417
|
-
},
|
|
418
|
-
},
|
|
464
|
+
"script": script,
|
|
419
465
|
}
|
|
420
466
|
},
|
|
421
467
|
}
|
{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.2.2}/README.md
RENAMED
|
File without changes
|
|
File without changes
|