llama-index-vector-stores-opensearch 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

llama_index/vector_stores/opensearch/base.py

@@ -32,220 +32,6 @@ INVALID_HYBRID_QUERY_ERROR = (
 MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict
 
 
-def _import_async_opensearch() -> Any:
-    """Import OpenSearch if available, otherwise raise error."""
-    return AsyncOpenSearch
-
-
-def _import_async_bulk() -> Any:
-    """Import bulk if available, otherwise raise error."""
-    return async_bulk
-
-
-def _import_not_found_error() -> Any:
-    """Import not found error if available, otherwise raise error."""
-    return NotFoundError
-
-
-def _get_async_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any:
-    """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
-    try:
-        opensearch = _import_async_opensearch()
-        client = opensearch(opensearch_url, **kwargs)
-
-    except ValueError as e:
-        raise ValueError(
-            f"AsyncOpenSearch client string provided is not in proper format. "
-            f"Got error: {e} "
-        )
-    return client
-
-
-async def _bulk_ingest_embeddings(
-    client: Any,
-    index_name: str,
-    embeddings: List[List[float]],
-    texts: Iterable[str],
-    metadatas: Optional[List[dict]] = None,
-    ids: Optional[List[str]] = None,
-    vector_field: str = "embedding",
-    text_field: str = "content",
-    mapping: Optional[Dict] = None,
-    max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
-    is_aoss: bool = False,
-) -> List[str]:
-    """Async Bulk Ingest Embeddings into given index."""
-    if not mapping:
-        mapping = {}
-
-    async_bulk = _import_async_bulk()
-    not_found_error = _import_not_found_error()
-    requests = []
-    return_ids = []
-    mapping = mapping
-
-    try:
-        await client.indices.get(index=index_name)
-    except not_found_error:
-        await client.indices.create(index=index_name, body=mapping)
-
-    for i, text in enumerate(texts):
-        metadata = metadatas[i] if metadatas else {}
-        _id = ids[i] if ids else str(uuid.uuid4())
-        request = {
-            "_op_type": "index",
-            "_index": index_name,
-            vector_field: embeddings[i],
-            text_field: text,
-            "metadata": metadata,
-        }
-        if is_aoss:
-            request["id"] = _id
-        else:
-            request["_id"] = _id
-        requests.append(request)
-        return_ids.append(_id)
-    await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
-    if not is_aoss:
-        await client.indices.refresh(index=index_name)
-    return return_ids
-
-
-def _default_approximate_search_query(
-    query_vector: List[float],
-    k: int = 4,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Approximate k-NN Search, this is the default query."""
-    return {
-        "size": k,
-        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
-    }
-
-
-def _parse_filters(filters: Optional[MetadataFilters]) -> Any:
-    pre_filter = []
-    if filters is not None:
-        for f in filters.legacy_filters():
-            pre_filter.append({f.key: json.loads(str(f.value))})
-
-    return pre_filter
-
-
-def _knn_search_query(
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    """
-    Do knn search.
-
-    If there are no filters do approx-knn search.
-    If there are (pre)-filters, do an exhaustive exact knn search using 'painless
-    scripting'.
-
-    Note that approximate knn search does not support pre-filtering.
-
-    Args:
-        query_embedding: Vector embedding to query.
-        k: Maximum number of results.
-        filters: Optional filters to apply before the search.
-            Supports filter-context queries documented at
-            https://opensearch.org/docs/latest/query-dsl/query-filter-context/
-
-    Returns:
-        Up to k docs closest to query_embedding
-    """
-    if filters is None:
-        search_query = _default_approximate_search_query(
-            query_embedding, k, vector_field=embedding_field
-        )
-    else:
-        pre_filter = _parse_filters(filters)
-        # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
-        search_query = _default_painless_scripting_query(
-            query_embedding,
-            k,
-            space_type="l2Squared",
-            pre_filter={"bool": {"filter": pre_filter}},
-            vector_field=embedding_field,
-        )
-
-    return search_query
-
-
-def _hybrid_search_query(
-    text_field: str,
-    query_str: str,
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    knn_query = _knn_search_query(embedding_field, query_embedding, k, filters)["query"]
-    lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
-
-    parsed_filters = _parse_filters(filters)
-    if len(parsed_filters) > 0:
-        lexical_query["filter"] = parsed_filters
-    return {
-        "size": k,
-        "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
-    }
-
-
-def __get_painless_scripting_source(
-    space_type: str, vector_field: str = "embedding"
-) -> str:
-    """For Painless Scripting, it returns the script source based on space type."""
-    source_value = f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
-    if space_type == "cosineSimilarity":
-        return source_value
-    else:
-        return f"1/{source_value}"
-
-
-def _default_painless_scripting_query(
-    query_vector: List[float],
-    k: int = 4,
-    space_type: str = "l2Squared",
-    pre_filter: Optional[Union[Dict, List]] = None,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Painless Scripting Search, this is the default query."""
-    if not pre_filter:
-        pre_filter = MATCH_ALL_QUERY
-
-    source = __get_painless_scripting_source(space_type, vector_field)
-    return {
-        "size": k,
-        "query": {
-            "script_score": {
-                "query": pre_filter,
-                "script": {
-                    "source": source,
-                    "params": {
-                        "field": vector_field,
-                        "query_value": query_vector,
-                    },
-                },
-            }
-        },
-    }
-
-
-def _is_aoss_enabled(http_auth: Any) -> bool:
-    """Check if the service is http_auth is set as `aoss`."""
-    if (
-        http_auth is not None
-        and hasattr(http_auth, "service")
-        and http_auth.service == "aoss"
-    ):
-        return True
-    return False
-
-
 class OpensearchVectorClient:
     """
     Object encapsulating an Opensearch index that has vector search enabled.
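
The headline change in 0.1.9 is a refactor: every module-level helper removed above (client construction, bulk ingestion, the k-NN/hybrid/painless query builders, and `_is_aoss_enabled`) is reintroduced as an instance method of `OpensearchVectorClient` in a later hunk. Code that imported these underscored helpers directly from `llama_index.vector_stores.opensearch.base` breaks on upgrade. A minimal before/after sketch, assuming a reachable cluster on localhost; the index name is hypothetical, and since these helpers are underscored (private), this is illustrative rather than supported API:

```python
from llama_index.vector_stores.opensearch import OpensearchVectorClient

# 0.1.8 (module-level function):
#   from llama_index.vector_stores.opensearch.base import _knn_search_query
#   body = _knn_search_query("embedding", [0.1] * 1536, k=4)

# 0.1.9 (method on the client instance):
client = OpensearchVectorClient("http://localhost:9200", "demo-index", 1536)
body = client._knn_search_query("embedding", [0.1] * 1536, k=4)
# -> {"size": 4, "query": {"knn": {"embedding": {"vector": [...], "k": 4}}}}
```
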
@@ -302,7 +88,7 @@ class OpensearchVectorClient:
 
         self._search_pipeline = search_pipeline
         http_auth = kwargs.get("http_auth")
-        self.is_aoss = _is_aoss_enabled(http_auth=http_auth)
+        self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
         # initialize mapping
         idx_conf = {
             "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
@@ -316,8 +102,8 @@ class OpensearchVectorClient:
                 }
             },
         }
-        self._os_client = _get_async_opensearch_client(self._endpoint, **kwargs)
-        not_found_error = _import_not_found_error()
+        self._os_client = self._get_async_opensearch_client(self._endpoint, **kwargs)
+        not_found_error = self._import_not_found_error()
 
         event_loop = asyncio.get_event_loop()
         try:
@@ -332,6 +118,217 @@ class OpensearchVectorClient:
                 self._os_client.indices.refresh(index=self._index)
             )
 
+    def _import_async_opensearch(self) -> Any:
+        """Import OpenSearch if available, otherwise raise error."""
+        return AsyncOpenSearch
+
+    def _import_async_bulk(self) -> Any:
+        """Import bulk if available, otherwise raise error."""
+        return async_bulk
+
+    def _import_not_found_error(self) -> Any:
+        """Import not found error if available, otherwise raise error."""
+        return NotFoundError
+
+    def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+        """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
+        try:
+            opensearch = self._import_async_opensearch()
+            client = opensearch(opensearch_url, **kwargs)
+
+        except ValueError as e:
+            raise ValueError(
+                f"AsyncOpenSearch client string provided is not in proper format. "
+                f"Got error: {e} "
+            )
+        return client
+
+    async def _bulk_ingest_embeddings(
+        self,
+        client: Any,
+        index_name: str,
+        embeddings: List[List[float]],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        vector_field: str = "embedding",
+        text_field: str = "content",
+        mapping: Optional[Dict] = None,
+        max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+        is_aoss: bool = False,
+    ) -> List[str]:
+        """Async Bulk Ingest Embeddings into given index."""
+        if not mapping:
+            mapping = {}
+
+        async_bulk = self._import_async_bulk()
+        not_found_error = self._import_not_found_error()
+        requests = []
+        return_ids = []
+        mapping = mapping
+
+        try:
+            await client.indices.get(index=index_name)
+        except not_found_error:
+            await client.indices.create(index=index_name, body=mapping)
+
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            _id = ids[i] if ids else str(uuid.uuid4())
+            request = {
+                "_op_type": "index",
+                "_index": index_name,
+                vector_field: embeddings[i],
+                text_field: text,
+                "metadata": metadata,
+            }
+            if is_aoss:
+                request["id"] = _id
+            else:
+                request["_id"] = _id
+            requests.append(request)
+            return_ids.append(_id)
+        await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+        if not is_aoss:
+            await client.indices.refresh(index=index_name)
+        return return_ids
+
+    def _default_approximate_search_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Approximate k-NN Search, this is the default query."""
+        return {
+            "size": k,
+            "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
+        }
+
+    def _parse_filters(self, filters: Optional[MetadataFilters]) -> Any:
+        pre_filter = []
+        if filters is not None:
+            for f in filters.legacy_filters():
+                pre_filter.append({f.key: json.loads(str(f.value))})
+
+        return pre_filter
+
+    def _knn_search_query(
+        self,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        """
+        Do knn search.
+
+        If there are no filters do approx-knn search.
+        If there are (pre)-filters, do an exhaustive exact knn search using 'painless
+        scripting'.
+
+        Note that approximate knn search does not support pre-filtering.
+
+        Args:
+            query_embedding: Vector embedding to query.
+            k: Maximum number of results.
+            filters: Optional filters to apply before the search.
+                Supports filter-context queries documented at
+                https://opensearch.org/docs/latest/query-dsl/query-filter-context/
+
+        Returns:
+            Up to k docs closest to query_embedding
+        """
+        if filters is None:
+            search_query = self._default_approximate_search_query(
+                query_embedding, k, vector_field=embedding_field
+            )
+        else:
+            pre_filter = self._parse_filters(filters)
+            # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+            search_query = self._default_painless_scripting_query(
+                query_embedding,
+                k,
+                space_type="l2Squared",
+                pre_filter={"bool": {"filter": pre_filter}},
+                vector_field=embedding_field,
+            )
+
+        return search_query
+
+    def _hybrid_search_query(
+        self,
+        text_field: str,
+        query_str: str,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        knn_query = self._knn_search_query(
+            embedding_field, query_embedding, k, filters
+        )["query"]
+        lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
+
+        parsed_filters = self._parse_filters(filters)
+        if len(parsed_filters) > 0:
+            lexical_query["filter"] = parsed_filters
+        return {
+            "size": k,
+            "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
+        }
+
+    def __get_painless_scripting_source(
+        self, space_type: str, vector_field: str = "embedding"
+    ) -> str:
+        """For Painless Scripting, it returns the script source based on space type."""
+        source_value = (
+            f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
+        )
+        if space_type == "cosineSimilarity":
+            return source_value
+        else:
+            return f"1/{source_value}"
+
+    def _default_painless_scripting_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        space_type: str = "l2Squared",
+        pre_filter: Optional[Union[Dict, List]] = None,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Painless Scripting Search, this is the default query."""
+        if not pre_filter:
+            pre_filter = MATCH_ALL_QUERY
+
+        source = self.__get_painless_scripting_source(space_type, vector_field)
+        return {
+            "size": k,
+            "query": {
+                "script_score": {
+                    "query": pre_filter,
+                    "script": {
+                        "source": source,
+                        "params": {
+                            "field": vector_field,
+                            "query_value": query_vector,
+                        },
+                    },
+                }
+            },
+        }
+
+    def _is_aoss_enabled(self, http_auth: Any) -> bool:
+        """Check if the service is http_auth is set as `aoss`."""
+        if (
+            http_auth is not None
+            and hasattr(http_auth, "service")
+            and http_auth.service == "aoss"
+        ):
+            return True
+        return False
+
     async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
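
As the `_knn_search_query` docstring above notes, the client picks one of two query shapes: approximate k-NN when there are no filters, and exact k-NN via Painless script scoring when pre-filters are present. For reference, a sketch of both bodies with illustrative values (the metadata filter `{"author": "alice"}` is hypothetical; the script source follows `__get_painless_scripting_source`, which wraps non-cosine space types in `1/(1.0 + ...)`):

```python
# No filters: approximate k-NN against the vector index.
approx_body = {
    "size": 4,
    "query": {"knn": {"embedding": {"vector": [0.1, 0.2], "k": 4}}},
}

# With filters: exact scoring over the pre-filtered document set.
exact_body = {
    "size": 4,
    "query": {
        "script_score": {
            "query": {"bool": {"filter": [{"author": "alice"}]}},
            "script": {
                "source": "1/(1.0 + l2Squared(params.query_value, doc['embedding']))",
                "params": {"field": "embedding", "query_value": [0.1, 0.2]},
            },
        }
    },
}
```
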
@@ -344,7 +341,7 @@ class OpensearchVectorClient:
             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
             metadatas.append(node_to_metadata_dict(node, remove_text=True))
 
-        return await _bulk_ingest_embeddings(
+        return await self._bulk_ingest_embeddings(
            self._os_client,
            self._index,
            embeddings,
@@ -358,12 +355,12 @@ class OpensearchVectorClient:
             is_aoss=self.is_aoss,
         )
 
-    async def delete_doc_id(self, doc_id: str) -> None:
+    async def delete_by_doc_id(self, doc_id: str) -> None:
         """
-        Delete a document.
+        Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
 
         Args:
-            doc_id (str): document id
+            doc_id (str): a LlamaIndex `Document` id
         """
         search_query = {
             "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
@@ -381,7 +378,7 @@ class OpensearchVectorClient:
         if query_mode == VectorStoreQueryMode.HYBRID:
             if query_str is None or self._search_pipeline is None:
                 raise ValueError(INVALID_HYBRID_QUERY_ERROR)
-            search_query = _hybrid_search_query(
+            search_query = self._hybrid_search_query(
                 self._text_field,
                 query_str,
                 self._embedding_field,
@@ -389,12 +386,15 @@ class OpensearchVectorClient:
                 k,
                 filters=filters,
             )
-            params = {"search_pipeline": self._search_pipeline}
+            params = {
+                "search_pipeline": self._search_pipeline,
+                "_source_excludes": ["embedding"],
+            }
         else:
-            search_query = _knn_search_query(
+            search_query = self._knn_search_query(
                 self._embedding_field, query_embedding, k, filters=filters
             )
-            params = None
+            params = {"_source_excludes": ["embedding"]}
 
         res = await self._os_client.search(
             index=self._index, body=search_query, params=params
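
Both branches now pass `_source_excludes: ["embedding"]`, so search hits come back without the stored vector; previously the pure k-NN path sent `params=None` and every hit carried its full embedding (several kilobytes per hit at 1536 dimensions). A sketch of the raw call the client now issues, inside an async context and using this file's default field names (the index name is hypothetical, and `_os_client` is a private attribute):

```python
# Roughly what aquery() sends on the non-hybrid path in 0.1.9:
res = await client._os_client.search(
    index="demo-index",
    body={"size": 4, "query": {"knn": {"embedding": {"vector": [0.1, 0.2], "k": 4}}}},
    params={"_source_excludes": ["embedding"]},
)
# Each hit's _source now carries only the text and metadata fields,
# e.g. {"content": "...", "metadata": {...}} -- no "embedding" array.
```
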
@@ -443,6 +443,35 @@ class OpensearchVectorStore(BasePydanticVectorStore):
     Args:
         client (OpensearchVectorClient): Vector index client to use
             for data insertion/querying.
+
+    Examples:
+        `pip install llama-index-vector-stores-opensearch`
+
+        ```python
+        from llama_index.vector_stores.opensearch import (
+            OpensearchVectorStore,
+            OpensearchVectorClient,
+        )
+
+        # http endpoint for your cluster (opensearch required for vector index usage)
+        endpoint = "http://localhost:9200"
+        # index to demonstrate the VectorStore impl
+        idx = "gpt-index-demo"
+
+        # OpensearchVectorClient stores text in this field by default
+        text_field = "content"
+        # OpensearchVectorClient stores embeddings in this field by default
+        embedding_field = "embedding"
+
+        # OpensearchVectorClient encapsulates logic for a
+        # single opensearch index with vector search enabled
+        client = OpensearchVectorClient(
+            endpoint, idx, 1536, embedding_field=embedding_field, text_field=text_field
+        )
+
+        # initialize vector store
+        vector_store = OpensearchVectorStore(client)
+        ```
+
     """
 
     stores_text: bool = True
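
The new `Examples` docstring stops after constructing the store. One step further, a hedged sketch of wiring it into an index (assumes `llama-index-core` is installed with an embedding model configured, and continues from the docstring's `vector_store`):

```python
from llama_index.core import Document, StorageContext, VectorStoreIndex

# Register the OpenSearch-backed store as the index's vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    [Document(text="hello world")], storage_context=storage_context
)
print(index.as_query_engine().query("What does the document say?"))
```
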
@@ -494,10 +523,10 @@ class OpensearchVectorStore(BasePydanticVectorStore):
 
     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Delete nodes using with ref_doc_id.
+        Delete nodes using a ref_doc_id.
 
         Args:
-            ref_doc_id (str): The doc_id of the document to delete.
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
 
         """
         asyncio.get_event_loop().run_until_complete(
@@ -506,13 +535,13 @@ class OpensearchVectorStore(BasePydanticVectorStore):
 
     async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Async delete nodes using with ref_doc_id.
+        Async delete nodes using a ref_doc_id.
 
         Args:
-            ref_doc_id (str): The doc_id of the document to delete.
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
 
         """
-        await self._client.delete_doc_id(ref_doc_id)
+        await self._client.delete_by_doc_id(ref_doc_id)
 
     def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
         """
llama_index_vector_stores_opensearch-0.1.9.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-index-vector-stores-opensearch
-Version: 0.1.8
+Version: 0.1.9
 Summary: llama-index vector_stores opensearch integration
 License: MIT
 Author: Your Name
llama_index_vector_stores_opensearch-0.1.9.dist-info/RECORD

@@ -0,0 +1,5 @@
+llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
+llama_index/vector_stores/opensearch/base.py,sha256=KoDUhRZXknBXBiODovw_PgvtFFAhbdkT0hU9NpMYl4o,19141
+llama_index_vector_stores_opensearch-0.1.9.dist-info/METADATA,sha256=r6OTnDiVcC9eU-mzDS_dxHjRd9Q7GM2mHNF7-oFuxOc,677
+llama_index_vector_stores_opensearch-0.1.9.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+llama_index_vector_stores_opensearch-0.1.9.dist-info/RECORD,,
llama_index_vector_stores_opensearch-0.1.8.dist-info/RECORD

@@ -1,5 +0,0 @@
-llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
-llama_index/vector_stores/opensearch/base.py,sha256=eHWcqPEeiYObJu8vIVfDPl9ryg8lOE__zEexvFV4lGA,16958
-llama_index_vector_stores_opensearch-0.1.8.dist-info/METADATA,sha256=Oq_NhlE5Mq_tpHFUiERyEgWdruzzUi0XKXdRYpDFMFk,677
-llama_index_vector_stores_opensearch-0.1.8.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-llama_index_vector_stores_opensearch-0.1.8.dist-info/RECORD,,