llama-index-vector-stores-opensearch 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of llama-index-vector-stores-opensearch might be problematic. See the registry's advisory page for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-index-vector-stores-opensearch
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: llama-index vector_stores opensearch integration
5
5
  License: MIT
6
6
  Author: Your Name
@@ -1,6 +1,5 @@
1
1
  """Elasticsearch/Opensearch vector store."""
2
2
 
3
- import asyncio
4
3
  import uuid
5
4
  from datetime import datetime
6
5
  from typing import Any, Dict, Iterable, List, Optional, Union, cast
@@ -22,14 +21,12 @@ from llama_index.core.vector_stores.utils import (
22
21
  metadata_dict_to_node,
23
22
  node_to_metadata_dict,
24
23
  )
25
- from opensearchpy import AsyncOpenSearch
26
24
  from opensearchpy.client import Client as OSClient
27
- from opensearchpy.exceptions import NotFoundError
28
- from opensearchpy.helpers import async_bulk
29
25
 
30
26
  IMPORT_OPENSEARCH_PY_ERROR = (
31
27
  "Could not import OpenSearch. Please install it with `pip install opensearch-py`."
32
28
  )
29
+ IMPORT_ASYNC_OPENSEARCH_PY_ERROR = "Could not import AsyncOpenSearch. Please install it with `pip install opensearch-py`."
33
30
  INVALID_HYBRID_QUERY_ERROR = (
34
31
  "Please specify the lexical_query and search_pipeline for hybrid search."
35
32
  )
@@ -54,8 +51,10 @@ class OpensearchVectorClient:
54
51
  method (Optional[dict]): Opensearch "method" JSON obj for configuring
55
52
  the KNN index.
56
53
  This includes engine, metric, and other config params. Defaults to:
57
- {"name": "hnsw", "space_type": "l2", "engine": "faiss",
54
+ {"name": "hnsw", "space_type": "l2", "engine": "nmslib",
58
55
  "parameters": {"ef_construction": 256, "m": 48}}
56
+ settings (Optional[dict]): Settings for the Opensearch index creation. Defaults to:
57
+ {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
59
58
  space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2
60
59
  **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.
61
60
 
@@ -69,6 +68,7 @@ class OpensearchVectorClient:
69
68
  embedding_field: str = "embedding",
70
69
  text_field: str = "content",
71
70
  method: Optional[dict] = None,
71
+ settings: Optional[dict] = None,
72
72
  engine: Optional[str] = "nmslib",
73
73
  space_type: Optional[str] = "l2",
74
74
  max_chunk_bytes: int = 1 * 1024 * 1024,
@@ -84,6 +84,8 @@ class OpensearchVectorClient:
84
84
  "engine": engine,
85
85
  "parameters": {"ef_construction": 256, "m": 48},
86
86
  }
87
+ if settings is None:
88
+ settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
87
89
  if embedding_field is None:
88
90
  embedding_field = "embedding"
89
91
  self._embedding_field = embedding_field
@@ -100,7 +102,7 @@ class OpensearchVectorClient:
100
102
  self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
101
103
  # initialize mapping
102
104
  idx_conf = {
103
- "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
105
+ "settings": settings,
104
106
  "mappings": {
105
107
  "properties": {
106
108
  embedding_field: {
@@ -111,36 +113,72 @@ class OpensearchVectorClient:
111
113
  }
112
114
  },
113
115
  }
114
- self._os_client = os_client or self._get_async_opensearch_client(
116
+ self._os_client = os_client or self._get_opensearch_client(
117
+ self._endpoint, **kwargs
118
+ )
119
+ self._os_async_client = self._get_async_opensearch_client(
115
120
  self._endpoint, **kwargs
116
121
  )
117
122
  not_found_error = self._import_not_found_error()
118
123
 
119
- event_loop = asyncio.get_event_loop()
120
124
  try:
121
- event_loop.run_until_complete(
122
- self._os_client.indices.get(index=self._index)
123
- )
125
+ self._os_client.indices.get(index=self._index)
124
126
  except not_found_error:
125
- event_loop.run_until_complete(
126
- self._os_client.indices.create(index=self._index, body=idx_conf)
127
- )
128
- event_loop.run_until_complete(
129
- self._os_client.indices.refresh(index=self._index)
130
- )
127
+ self._os_client.indices.create(index=self._index, body=idx_conf)
128
+ self._os_client.indices.refresh(index=self._index)
131
129
 
132
- def _import_async_opensearch(self) -> Any:
130
+ def _import_opensearch(self) -> Any:
133
131
  """Import OpenSearch if available, otherwise raise error."""
132
+ try:
133
+ from opensearchpy import OpenSearch
134
+ except ImportError:
135
+ raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
136
+ return OpenSearch
137
+
138
+ def _import_async_opensearch(self) -> Any:
139
+ """Import AsyncOpenSearch if available, otherwise raise error."""
140
+ try:
141
+ from opensearchpy import AsyncOpenSearch
142
+ except ImportError:
143
+ raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
134
144
  return AsyncOpenSearch
135
145
 
136
- def _import_async_bulk(self) -> Any:
146
+ def _import_bulk(self) -> Any:
137
147
  """Import bulk if available, otherwise raise error."""
148
+ try:
149
+ from opensearchpy.helpers import bulk
150
+ except ImportError:
151
+ raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
152
+ return bulk
153
+
154
+ def _import_async_bulk(self) -> Any:
155
+ """Import async_bulk if available, otherwise raise error."""
156
+ try:
157
+ from opensearchpy.helpers import async_bulk
158
+ except ImportError:
159
+ raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
138
160
  return async_bulk
139
161
 
140
162
  def _import_not_found_error(self) -> Any:
141
163
  """Import not found error if available, otherwise raise error."""
164
+ try:
165
+ from opensearchpy.exceptions import NotFoundError
166
+ except ImportError:
167
+ raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
142
168
  return NotFoundError
143
169
 
170
+ def _get_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
171
+ """Get OpenSearch client from the opensearch_url, otherwise raise error."""
172
+ try:
173
+ opensearch = self._import_opensearch()
174
+ client = opensearch(opensearch_url, **kwargs)
175
+ except ValueError as e:
176
+ raise ImportError(
177
+ f"OpenSearch client string provided is not in proper format. "
178
+ f"Got error: {e} "
179
+ )
180
+ return client
181
+
144
182
  def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
145
183
  """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
146
184
  try:
@@ -154,7 +192,58 @@ class OpensearchVectorClient:
154
192
  )
155
193
  return client
156
194
 
157
- async def _bulk_ingest_embeddings(
195
+ def _bulk_ingest_embeddings(
196
+ self,
197
+ client: Any,
198
+ index_name: str,
199
+ embeddings: List[List[float]],
200
+ texts: Iterable[str],
201
+ metadatas: Optional[List[dict]] = None,
202
+ ids: Optional[List[str]] = None,
203
+ vector_field: str = "embedding",
204
+ text_field: str = "content",
205
+ mapping: Optional[Dict] = None,
206
+ max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
207
+ is_aoss: bool = False,
208
+ ) -> List[str]:
209
+ """Bulk Ingest Embeddings into given index."""
210
+ if not mapping:
211
+ mapping = {}
212
+
213
+ bulk = self._import_bulk()
214
+ not_found_error = self._import_not_found_error()
215
+ requests = []
216
+ return_ids = []
217
+
218
+ try:
219
+ client.indices.get(index=index_name)
220
+ except not_found_error:
221
+ client.indices.create(index=index_name, body=mapping)
222
+
223
+ for i, text in enumerate(texts):
224
+ metadata = metadatas[i] if metadatas else {}
225
+ _id = ids[i] if ids else str(uuid.uuid4())
226
+ request = {
227
+ "_op_type": "index",
228
+ "_index": index_name,
229
+ vector_field: embeddings[i],
230
+ text_field: text,
231
+ "metadata": metadata,
232
+ }
233
+ if is_aoss:
234
+ request["id"] = _id
235
+ else:
236
+ request["_id"] = _id
237
+ requests.append(request)
238
+ return_ids.append(_id)
239
+
240
+ bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
241
+ if not is_aoss:
242
+ client.indices.refresh(index=index_name)
243
+
244
+ return return_ids
245
+
246
+ async def _abulk_ingest_embeddings(
158
247
  self,
159
248
  client: Any,
160
249
  index_name: str,
@@ -176,7 +265,6 @@ class OpensearchVectorClient:
176
265
  not_found_error = self._import_not_found_error()
177
266
  requests = []
178
267
  return_ids = []
179
- mapping = mapping
180
268
 
181
269
  try:
182
270
  await client.indices.get(index=index_name)
@@ -199,9 +287,11 @@ class OpensearchVectorClient:
199
287
  request["_id"] = _id
200
288
  requests.append(request)
201
289
  return_ids.append(_id)
290
+
202
291
  await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
203
292
  if not is_aoss:
204
293
  await client.indices.refresh(index=index_name)
294
+
205
295
  return return_ids
206
296
 
207
297
  def _default_approximate_search_query(
@@ -476,7 +566,7 @@ class OpensearchVectorClient:
476
566
  return True
477
567
  return False
478
568
 
479
- async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
569
+ def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
480
570
  """Store results in the index."""
481
571
  embeddings: List[List[float]] = []
482
572
  texts: List[str] = []
@@ -488,7 +578,7 @@ class OpensearchVectorClient:
488
578
  texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
489
579
  metadatas.append(node_to_metadata_dict(node, remove_text=True))
490
580
 
491
- return await self._bulk_ingest_embeddings(
581
+ return self._bulk_ingest_embeddings(
492
582
  self._os_client,
493
583
  self._index,
494
584
  embeddings,
@@ -502,7 +592,47 @@ class OpensearchVectorClient:
502
592
  is_aoss=self.is_aoss,
503
593
  )
504
594
 
505
- async def delete_by_doc_id(self, doc_id: str) -> None:
595
+ async def aindex_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
596
+ """Store results in the index."""
597
+ embeddings: List[List[float]] = []
598
+ texts: List[str] = []
599
+ metadatas: List[dict] = []
600
+ ids: List[str] = []
601
+ for node in nodes:
602
+ ids.append(node.node_id)
603
+ embeddings.append(node.get_embedding())
604
+ texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
605
+ metadatas.append(node_to_metadata_dict(node, remove_text=True))
606
+
607
+ return await self._abulk_ingest_embeddings(
608
+ self._os_async_client,
609
+ self._index,
610
+ embeddings,
611
+ texts,
612
+ metadatas=metadatas,
613
+ ids=ids,
614
+ vector_field=self._embedding_field,
615
+ text_field=self._text_field,
616
+ mapping=None,
617
+ max_chunk_bytes=self._max_chunk_bytes,
618
+ is_aoss=self.is_aoss,
619
+ )
620
+
621
+ def delete_by_doc_id(self, doc_id: str) -> None:
622
+ """
623
+ Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
624
+
625
+ Args:
626
+ doc_id (str): a LlamaIndex `Document` id
627
+ """
628
+ search_query = {
629
+ "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
630
+ }
631
+ self._os_client.delete_by_query(
632
+ index=self._index, body=search_query, refresh=True
633
+ )
634
+
635
+ async def adelete_by_doc_id(self, doc_id: str) -> None:
506
636
  """
507
637
  Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
508
638
 
@@ -512,11 +642,35 @@ class OpensearchVectorClient:
512
642
  search_query = {
513
643
  "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
514
644
  }
515
- await self._os_client.delete_by_query(
645
+ await self._os_async_client.delete_by_query(
516
646
  index=self._index, body=search_query, refresh=True
517
647
  )
518
648
 
519
- async def delete_nodes(
649
+ def delete_nodes(
650
+ self,
651
+ node_ids: Optional[List[str]] = None,
652
+ filters: Optional[MetadataFilters] = None,
653
+ **delete_kwargs: Any,
654
+ ) -> None:
655
+ """Deletes nodes.
656
+
657
+ Args:
658
+ node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
659
+ filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
660
+ """
661
+ if not node_ids and not filters:
662
+ return
663
+
664
+ query = {"query": {"bool": {"filter": []}}}
665
+ if node_ids:
666
+ query["query"]["bool"]["filter"].append({"terms": {"_id": node_ids or []}})
667
+
668
+ if filters:
669
+ query["query"]["bool"]["filter"].extend(self._parse_filters(filters))
670
+
671
+ self._os_client.delete_by_query(index=self._index, body=query, refresh=True)
672
+
673
+ async def adelete_nodes(
520
674
  self,
521
675
  node_ids: Optional[List[str]] = None,
522
676
  filters: Optional[MetadataFilters] = None,
@@ -538,17 +692,61 @@ class OpensearchVectorClient:
538
692
  if filters:
539
693
  query["query"]["bool"]["filter"].extend(self._parse_filters(filters))
540
694
 
541
- await self._os_client.delete_by_query(
695
+ await self._os_async_client.delete_by_query(
542
696
  index=self._index, body=query, refresh=True
543
697
  )
544
698
 
545
- async def clear(self) -> None:
699
+ def clear(self) -> None:
700
+ """Clears index."""
701
+ query = {"query": {"bool": {"filter": []}}}
702
+ self._os_client.delete_by_query(index=self._index, body=query, refresh=True)
703
+
704
+ async def aclear(self) -> None:
546
705
  """Clears index."""
547
706
  query = {"query": {"bool": {"filter": []}}}
548
- await self._os_client.delete_by_query(
707
+ await self._os_async_client.delete_by_query(
549
708
  index=self._index, body=query, refresh=True
550
709
  )
551
710
 
711
+ def query(
712
+ self,
713
+ query_mode: VectorStoreQueryMode,
714
+ query_str: Optional[str],
715
+ query_embedding: List[float],
716
+ k: int,
717
+ filters: Optional[MetadataFilters] = None,
718
+ ) -> VectorStoreQueryResult:
719
+ if query_mode == VectorStoreQueryMode.HYBRID:
720
+ if query_str is None or self._search_pipeline is None:
721
+ raise ValueError(INVALID_HYBRID_QUERY_ERROR)
722
+ search_query = self._hybrid_search_query(
723
+ self._text_field,
724
+ query_str,
725
+ self._embedding_field,
726
+ query_embedding,
727
+ k,
728
+ filters=filters,
729
+ )
730
+ params = {
731
+ "search_pipeline": self._search_pipeline,
732
+ }
733
+ elif query_mode == VectorStoreQueryMode.TEXT_SEARCH:
734
+ search_query = self._lexical_search_query(
735
+ self._text_field, query_str, k, filters=filters
736
+ )
737
+ params = None
738
+ else:
739
+ search_query = self._knn_search_query(
740
+ self._embedding_field, query_embedding, k, filters=filters
741
+ )
742
+ params = None
743
+
744
+ res = self._os_client.search(
745
+ index=self._index, body=search_query, params=params
746
+ )
747
+
748
+ return self._to_query_result(res)
749
+
552
750
  async def aquery(
553
751
  self,
554
752
  query_mode: VectorStoreQueryMode,
@@ -582,7 +780,7 @@ class OpensearchVectorClient:
582
780
  )
583
781
  params = None
584
782
 
585
- res = await self._os_client.search(
783
+ res = await self._os_async_client.search(
586
784
  index=self._index, body=search_query, params=params
587
785
  )
588
786
 
@@ -693,9 +891,8 @@ class OpensearchVectorStore(BasePydanticVectorStore):
693
891
  nodes: List[BaseNode]: list of nodes with embeddings.
694
892
 
695
893
  """
696
- return asyncio.get_event_loop().run_until_complete(
697
- self.async_add(nodes, **add_kwargs)
698
- )
894
+ self._client.index_results(nodes)
895
+ return [result.node_id for result in nodes]
699
896
 
700
897
  async def async_add(
701
898
  self,
@@ -709,32 +906,30 @@ class OpensearchVectorStore(BasePydanticVectorStore):
709
906
  nodes: List[BaseNode]: list of nodes with embeddings.
710
907
 
711
908
  """
712
- await self._client.index_results(nodes)
909
+ await self._client.aindex_results(nodes)
713
910
  return [result.node_id for result in nodes]
714
911
 
715
912
  def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
716
913
  """
717
- Delete nodes using a ref_doc_id.
914
+ Delete nodes using a ref_doc_id.
718
915
 
719
916
  Args:
720
- ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
917
+ ref_doc_id (str): The doc_id of the document to delete.
721
918
 
722
919
  """
723
- asyncio.get_event_loop().run_until_complete(
724
- self.adelete(ref_doc_id, **delete_kwargs)
725
- )
920
+ self._client.delete_by_doc_id(ref_doc_id)
726
921
 
727
922
  async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
728
923
  """
729
- Async delete nodes using a ref_doc_id.
924
+ Async delete nodes using a ref_doc_id.
730
925
 
731
926
  Args:
732
- ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
927
+ ref_doc_id (str): The doc_id of the document to delete.
733
928
 
734
929
  """
735
- await self._client.delete_by_doc_id(ref_doc_id)
930
+ await self._client.adelete_by_doc_id(ref_doc_id)
736
931
 
737
- async def adelete_nodes(
932
+ def delete_nodes(
738
933
  self,
739
934
  node_ids: Optional[List[str]] = None,
740
935
  filters: Optional[MetadataFilters] = None,
@@ -746,31 +941,29 @@ class OpensearchVectorStore(BasePydanticVectorStore):
746
941
  node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
747
942
  filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
748
943
  """
749
- await self._client.delete_nodes(node_ids, filters, **delete_kwargs)
944
+ self._client.delete_nodes(node_ids, filters, **delete_kwargs)
750
945
 
751
- def delete_nodes(
946
+ async def adelete_nodes(
752
947
  self,
753
948
  node_ids: Optional[List[str]] = None,
754
949
  filters: Optional[MetadataFilters] = None,
755
950
  **delete_kwargs: Any,
756
951
  ) -> None:
757
- """Deletes nodes.
952
+ """Async deletes nodes async.
758
953
 
759
954
  Args:
760
955
  node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
761
956
  filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
762
957
  """
763
- asyncio.get_event_loop().run_until_complete(
764
- self.adelete_nodes(node_ids, filters, **delete_kwargs)
765
- )
766
-
767
- async def aclear(self) -> None:
768
- """Clears index."""
769
- await self._client.clear()
958
+ await self._client.adelete_nodes(node_ids, filters, **delete_kwargs)
770
959
 
771
960
  def clear(self) -> None:
772
961
  """Clears index."""
773
- asyncio.get_event_loop().run_until_complete(self.aclear())
962
+ self._client.clear()
963
+
964
+ async def aclear(self) -> None:
965
+ """Async clears index."""
966
+ await self._client.aclear()
774
967
 
775
968
  def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
776
969
  """
@@ -780,7 +973,15 @@ class OpensearchVectorStore(BasePydanticVectorStore):
780
973
  query (VectorStoreQuery): Store query object.
781
974
 
782
975
  """
783
- return asyncio.get_event_loop().run_until_complete(self.aquery(query, **kwargs))
976
+ query_embedding = cast(List[float], query.query_embedding)
977
+
978
+ return self._client.query(
979
+ query.mode,
980
+ query.query_str,
981
+ query_embedding,
982
+ query.similarity_top_k,
983
+ filters=query.filters,
984
+ )
784
985
 
785
986
  async def aquery(
786
987
  self, query: VectorStoreQuery, **kwargs: Any
@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
27
27
  license = "MIT"
28
28
  name = "llama-index-vector-stores-opensearch"
29
29
  readme = "README.md"
30
- version = "0.2.2"
30
+ version = "0.3.0"
31
31
 
32
32
  [tool.poetry.dependencies]
33
33
  python = ">=3.8.1,<4.0"