llama-index-vector-stores-opensearch 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of llama-index-vector-stores-opensearch might be problematic. Click here for more details.

@@ -56,6 +56,8 @@ class OpensearchVectorClient:
56
56
  settings: Optional[dict]: Settings for the Opensearch index creation. Defaults to:
57
57
  {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
58
58
  space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2
59
+ os_client (Optional[OSClient]): Custom synchronous client (see OpenSearch from opensearch-py)
60
+ os_async_client (Optional[OSClient]): Custom asynchronous client (see AsyncOpenSearch from opensearch-py)
59
61
  **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.
60
62
 
61
63
  """
@@ -74,6 +76,7 @@ class OpensearchVectorClient:
74
76
  max_chunk_bytes: int = 1 * 1024 * 1024,
75
77
  search_pipeline: Optional[str] = None,
76
78
  os_client: Optional[OSClient] = None,
79
+ os_async_client: Optional[OSClient] = None,
77
80
  **kwargs: Any,
78
81
  ):
79
82
  """Init params."""
@@ -88,8 +91,9 @@ class OpensearchVectorClient:
88
91
  settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
89
92
  if embedding_field is None:
90
93
  embedding_field = "embedding"
91
- self._embedding_field = embedding_field
92
94
 
95
+ self._method = method
96
+ self._embedding_field = embedding_field
93
97
  self._endpoint = endpoint
94
98
  self._dim = dim
95
99
  self._index = index
@@ -116,9 +120,13 @@ class OpensearchVectorClient:
116
120
  self._os_client = os_client or self._get_opensearch_client(
117
121
  self._endpoint, **kwargs
118
122
  )
119
- self._os_async_client = self._get_async_opensearch_client(
123
+ self._os_async_client = os_async_client or self._get_async_opensearch_client(
120
124
  self._endpoint, **kwargs
121
125
  )
126
+ self._os_version = self._get_opensearch_version()
127
+ self._efficient_filtering_enabled = self._is_efficient_filtering_enabled(
128
+ self._os_version
129
+ )
122
130
  not_found_error = self._import_not_found_error()
123
131
 
124
132
  try:
@@ -192,6 +200,10 @@ class OpensearchVectorClient:
192
200
  )
193
201
  return client
194
202
 
203
+ def _get_opensearch_version(self) -> str:
204
+ info = self._os_client.info()
205
+ return info["version"]["number"]
206
+
195
207
  def _bulk_ingest_embeddings(
196
208
  self,
197
209
  client: Any,
@@ -298,14 +310,27 @@ class OpensearchVectorClient:
298
310
  self,
299
311
  query_vector: List[float],
300
312
  k: int = 4,
313
+ filters: Optional[Union[Dict, List]] = None,
301
314
  vector_field: str = "embedding",
302
315
  ) -> Dict:
303
316
  """For Approximate k-NN Search, this is the default query."""
304
- return {
317
+ query = {
305
318
  "size": k,
306
- "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
319
+ "query": {
320
+ "knn": {
321
+ vector_field: {
322
+ "vector": query_vector,
323
+ "k": k,
324
+ }
325
+ }
326
+ },
307
327
  }
308
328
 
329
+ if filters:
330
+ # filter key must be added only when filtering to avoid "filter doesn't support values of type: START_ARRAY" exception
331
+ query["query"]["knn"][vector_field]["filter"] = filters
332
+ return query
333
+
309
334
  def _is_text_field(self, value: Any) -> bool:
310
335
  """Check if value is a string and keyword filtering needs to be performed.
311
336
 
@@ -346,7 +371,12 @@ class OpensearchVectorClient:
346
371
  }
347
372
  }
348
373
  elif op in [FilterOperator.IN, FilterOperator.ANY]:
349
- return {"terms": {key: filter.value}}
374
+ if isinstance(filter.value, list) and all(
375
+ self._is_text_field(val) for val in filter.value
376
+ ):
377
+ return {"terms": {f"{key}.keyword": filter.value}}
378
+ else:
379
+ return {"terms": {key: filter.value}}
350
380
  elif op == FilterOperator.NIN:
351
381
  return {"bool": {"must_not": {"terms": {key: filter.value}}}}
352
382
  elif op == FilterOperator.ALL:
@@ -396,52 +426,73 @@ class OpensearchVectorClient:
396
426
  query_embedding: List[float],
397
427
  k: int,
398
428
  filters: Optional[MetadataFilters] = None,
429
+ search_method="approximate",
399
430
  ) -> Dict:
400
431
  """
401
- Do knn search.
432
+ Perform a k-Nearest Neighbors (kNN) search.
402
433
 
403
- If there are no filters do approx-knn search.
404
- If there are (pre)-filters, do an exhaustive exact knn search using 'painless
405
- scripting' if the version of Opensearch supports it, otherwise uses knn_score scripting score.
434
+ If the search method is "approximate" and the engine is "lucene" or "faiss", use efficient kNN filtering.
435
+ Otherwise, perform an exhaustive exact kNN search using "painless scripting" if the version of
436
+ OpenSearch supports it. If the OpenSearch version does not support it, use scoring script search.
406
437
 
407
438
  Note:
408
- -AWS Opensearch Serverless does not support the painless scripting functionality at this time according to AWS.
409
- -Also note that approximate knn search does not support pre-filtering.
439
+ - AWS OpenSearch Serverless does not support the painless scripting functionality at this time according to AWS.
440
+ - Approximate kNN search does not support pre-filtering.
410
441
 
411
442
  Args:
412
- query_embedding: Vector embedding to query.
413
- k: Maximum number of results.
414
- filters: Optional filters to apply before the search.
443
+ query_embedding (List[float]): Vector embedding to query.
444
+ k (int): Maximum number of results.
445
+ filters (Optional[MetadataFilters]): Optional filters to apply for the search.
415
446
  Supports filter-context queries documented at
416
447
  https://opensearch.org/docs/latest/query-dsl/query-filter-context/
417
448
 
418
449
  Returns:
419
- Up to k docs closest to query_embedding
450
+ Dict: Up to k documents closest to query_embedding.
420
451
  """
421
- pre_filter = self._parse_filters(filters)
422
- if not pre_filter:
452
+ filters = self._parse_filters(filters)
453
+
454
+ if not filters:
423
455
  search_query = self._default_approximate_search_query(
424
- query_embedding, k, vector_field=embedding_field
425
- )
426
- elif self.is_aoss:
427
- # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
428
- # painless scripting so default scoring script returned will be just normal knn_score script
429
- search_query = self._default_scoring_script_query(
430
456
  query_embedding,
431
457
  k,
432
- space_type=self.space_type,
433
- pre_filter={"bool": {"filter": pre_filter}},
434
458
  vector_field=embedding_field,
435
459
  )
436
- else:
437
- # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
438
- search_query = self._default_scoring_script_query(
460
+ elif (
461
+ search_method == "approximate"
462
+ and self._method["engine"]
463
+ in [
464
+ "lucene",
465
+ "faiss",
466
+ ]
467
+ and self._efficient_filtering_enabled
468
+ ):
469
+ # if engine is lucene or faiss, opensearch recommends efficient-kNN filtering.
470
+ search_query = self._default_approximate_search_query(
439
471
  query_embedding,
440
472
  k,
441
- space_type="l2Squared",
442
- pre_filter={"bool": {"filter": pre_filter}},
473
+ filters={"bool": {"filter": filters}},
443
474
  vector_field=embedding_field,
444
475
  )
476
+ else:
477
+ if self.is_aoss:
478
+ # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
479
+ # painless scripting so default scoring script returned will be just normal knn_score script
480
+ search_query = self._default_scoring_script_query(
481
+ query_embedding,
482
+ k,
483
+ space_type=self.space_type,
484
+ pre_filter={"bool": {"filter": filters}},
485
+ vector_field=embedding_field,
486
+ )
487
+ else:
488
+ # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
489
+ search_query = self._default_scoring_script_query(
490
+ query_embedding,
491
+ k,
492
+ space_type="l2Squared",
493
+ pre_filter={"bool": {"filter": filters}},
494
+ vector_field=embedding_field,
495
+ )
445
496
  return search_query
446
497
 
447
498
  def _hybrid_search_query(
@@ -566,6 +617,11 @@ class OpensearchVectorClient:
566
617
  return True
567
618
  return False
568
619
 
620
+ def _is_efficient_filtering_enabled(self, os_version: str) -> bool:
621
+ """Check if kNN with efficient filtering is enabled."""
622
+ major, minor, patch = os_version.split(".")
623
+ return int(major) >= 2 and int(minor) >= 9
624
+
569
625
  def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
570
626
  """Store results in the index."""
571
627
  embeddings: List[List[float]] = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-index-vector-stores-opensearch
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: llama-index vector_stores opensearch integration
5
5
  License: MIT
6
6
  Author: Your Name
@@ -0,0 +1,6 @@
1
+ llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
3
+ llama_index/vector_stores/opensearch/base.py,sha256=H_5DUMn8q4E9iY7_LIQcdpfT8dZRjLfnml3OTjoVM2Q,37141
4
+ llama_index_vector_stores_opensearch-0.4.1.dist-info/METADATA,sha256=S8hRWaD0NKc9Yzt8pxXr3-21aIL1q3ff0N7JOTengdc,728
5
+ llama_index_vector_stores_opensearch-0.4.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
6
+ llama_index_vector_stores_opensearch-0.4.1.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
3
- llama_index/vector_stores/opensearch/base.py,sha256=Dh7ppOm3eXJ8PNbPop37yvyhO22J6pKZ_1y6CU8Rwvk,34760
4
- llama_index_vector_stores_opensearch-0.3.0.dist-info/METADATA,sha256=V2-zpbLlVE4x2FrKs2H0PNC_T9YRSTuB_SqGA72vDSo,728
5
- llama_index_vector_stores_opensearch-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
6
- llama_index_vector_stores_opensearch-0.3.0.dist-info/RECORD,,