llama-index-vector-stores-opensearch 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of llama-index-vector-stores-opensearch has been flagged as potentially problematic; consult the registry's advisory page for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-index-vector-stores-opensearch
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: llama-index vector_stores opensearch integration
5
5
  License: MIT
6
6
  Author: Your Name
@@ -88,8 +88,9 @@ class OpensearchVectorClient:
88
88
  settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
89
89
  if embedding_field is None:
90
90
  embedding_field = "embedding"
91
- self._embedding_field = embedding_field
92
91
 
92
+ self._method = method
93
+ self._embedding_field = embedding_field
93
94
  self._endpoint = endpoint
94
95
  self._dim = dim
95
96
  self._index = index
@@ -119,6 +120,10 @@ class OpensearchVectorClient:
119
120
  self._os_async_client = self._get_async_opensearch_client(
120
121
  self._endpoint, **kwargs
121
122
  )
123
+ self._os_version = self._get_opensearch_version()
124
+ self._efficient_filtering_enabled = self._is_efficient_filtering_enabled(
125
+ self._os_version
126
+ )
122
127
  not_found_error = self._import_not_found_error()
123
128
 
124
129
  try:
@@ -192,6 +197,10 @@ class OpensearchVectorClient:
192
197
  )
193
198
  return client
194
199
 
200
+ def _get_opensearch_version(self) -> str:
201
+ info = self._os_client.info()
202
+ return info["version"]["number"]
203
+
195
204
  def _bulk_ingest_embeddings(
196
205
  self,
197
206
  client: Any,
@@ -298,14 +307,27 @@ class OpensearchVectorClient:
298
307
  self,
299
308
  query_vector: List[float],
300
309
  k: int = 4,
310
+ filters: Optional[Union[Dict, List]] = None,
301
311
  vector_field: str = "embedding",
302
312
  ) -> Dict:
303
313
  """For Approximate k-NN Search, this is the default query."""
304
- return {
314
+ query = {
305
315
  "size": k,
306
- "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
316
+ "query": {
317
+ "knn": {
318
+ vector_field: {
319
+ "vector": query_vector,
320
+ "k": k,
321
+ }
322
+ }
323
+ },
307
324
  }
308
325
 
326
+ if filters:
327
+ # filter key must be added only when filtering to avoid "filter doesn't support values of type: START_ARRAY" exception
328
+ query["query"]["knn"][vector_field]["filter"] = filters
329
+ return query
330
+
309
331
  def _is_text_field(self, value: Any) -> bool:
310
332
  """Check if value is a string and keyword filtering needs to be performed.
311
333
 
@@ -346,7 +368,12 @@ class OpensearchVectorClient:
346
368
  }
347
369
  }
348
370
  elif op in [FilterOperator.IN, FilterOperator.ANY]:
349
- return {"terms": {key: filter.value}}
371
+ if isinstance(filter.value, list) and all(
372
+ self._is_text_field(val) for val in filter.value
373
+ ):
374
+ return {"terms": {f"{key}.keyword": filter.value}}
375
+ else:
376
+ return {"terms": {key: filter.value}}
350
377
  elif op == FilterOperator.NIN:
351
378
  return {"bool": {"must_not": {"terms": {key: filter.value}}}}
352
379
  elif op == FilterOperator.ALL:
@@ -396,52 +423,73 @@ class OpensearchVectorClient:
396
423
  query_embedding: List[float],
397
424
  k: int,
398
425
  filters: Optional[MetadataFilters] = None,
426
+ search_method="approximate",
399
427
  ) -> Dict:
400
428
  """
401
- Do knn search.
429
+ Perform a k-Nearest Neighbors (kNN) search.
402
430
 
403
- If there are no filters do approx-knn search.
404
- If there are (pre)-filters, do an exhaustive exact knn search using 'painless
405
- scripting' if the version of Opensearch supports it, otherwise uses knn_score scripting score.
431
+ If the search method is "approximate" and the engine is "lucene" or "faiss", use efficient kNN filtering.
432
+ Otherwise, perform an exhaustive exact kNN search using "painless scripting" if the version of
433
+ OpenSearch supports it. If the OpenSearch version does not support it, use scoring script search.
406
434
 
407
435
  Note:
408
- -AWS Opensearch Serverless does not support the painless scripting functionality at this time according to AWS.
409
- -Also note that approximate knn search does not support pre-filtering.
436
+ - AWS OpenSearch Serverless does not support the painless scripting functionality at this time according to AWS.
437
+ - Approximate kNN search does not support pre-filtering.
410
438
 
411
439
  Args:
412
- query_embedding: Vector embedding to query.
413
- k: Maximum number of results.
414
- filters: Optional filters to apply before the search.
440
+ query_embedding (List[float]): Vector embedding to query.
441
+ k (int): Maximum number of results.
442
+ filters (Optional[MetadataFilters]): Optional filters to apply for the search.
415
443
  Supports filter-context queries documented at
416
444
  https://opensearch.org/docs/latest/query-dsl/query-filter-context/
417
445
 
418
446
  Returns:
419
- Up to k docs closest to query_embedding
447
+ Dict: Up to k documents closest to query_embedding.
420
448
  """
421
- pre_filter = self._parse_filters(filters)
422
- if not pre_filter:
449
+ filters = self._parse_filters(filters)
450
+
451
+ if not filters:
423
452
  search_query = self._default_approximate_search_query(
424
- query_embedding, k, vector_field=embedding_field
425
- )
426
- elif self.is_aoss:
427
- # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
428
- # painless scripting so default scoring script returned will be just normal knn_score script
429
- search_query = self._default_scoring_script_query(
430
453
  query_embedding,
431
454
  k,
432
- space_type=self.space_type,
433
- pre_filter={"bool": {"filter": pre_filter}},
434
455
  vector_field=embedding_field,
435
456
  )
436
- else:
437
- # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
438
- search_query = self._default_scoring_script_query(
457
+ elif (
458
+ search_method == "approximate"
459
+ and self._method["engine"]
460
+ in [
461
+ "lucene",
462
+ "faiss",
463
+ ]
464
+ and self._efficient_filtering_enabled
465
+ ):
466
+ # if engine is lucene or faiss, opensearch recommends efficient-kNN filtering.
467
+ search_query = self._default_approximate_search_query(
439
468
  query_embedding,
440
469
  k,
441
- space_type="l2Squared",
442
- pre_filter={"bool": {"filter": pre_filter}},
470
+ filters={"bool": {"filter": filters}},
443
471
  vector_field=embedding_field,
444
472
  )
473
+ else:
474
+ if self.is_aoss:
475
+ # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
476
+ # painless scripting so default scoring script returned will be just normal knn_score script
477
+ search_query = self._default_scoring_script_query(
478
+ query_embedding,
479
+ k,
480
+ space_type=self.space_type,
481
+ pre_filter={"bool": {"filter": filters}},
482
+ vector_field=embedding_field,
483
+ )
484
+ else:
485
+ # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
486
+ search_query = self._default_scoring_script_query(
487
+ query_embedding,
488
+ k,
489
+ space_type="l2Squared",
490
+ pre_filter={"bool": {"filter": filters}},
491
+ vector_field=embedding_field,
492
+ )
445
493
  return search_query
446
494
 
447
495
  def _hybrid_search_query(
@@ -566,6 +614,11 @@ class OpensearchVectorClient:
566
614
  return True
567
615
  return False
568
616
 
617
+ def _is_efficient_filtering_enabled(self, os_version: str) -> bool:
618
+ """Check if kNN with efficient filtering is enabled."""
619
+ major, minor, patch = os_version.split(".")
620
+ return int(major) >= 2 and int(minor) >= 9
621
+
569
622
  def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
570
623
  """Store results in the index."""
571
624
  embeddings: List[List[float]] = []
@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
27
27
  license = "MIT"
28
28
  name = "llama-index-vector-stores-opensearch"
29
29
  readme = "README.md"
30
- version = "0.3.0"
30
+ version = "0.4.0"
31
31
 
32
32
  [tool.poetry.dependencies]
33
33
  python = ">=3.8.1,<4.0"