llama-index-vector-stores-opensearch 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of llama-index-vector-stores-opensearch has been flagged as possibly problematic.

@@ -32,220 +32,6 @@ INVALID_HYBRID_QUERY_ERROR = (
 MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict
 
 
-def _import_async_opensearch() -> Any:
-    """Import OpenSearch if available, otherwise raise error."""
-    return AsyncOpenSearch
-
-
-def _import_async_bulk() -> Any:
-    """Import bulk if available, otherwise raise error."""
-    return async_bulk
-
-
-def _import_not_found_error() -> Any:
-    """Import not found error if available, otherwise raise error."""
-    return NotFoundError
-
-
-def _get_async_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any:
-    """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
-    try:
-        opensearch = _import_async_opensearch()
-        client = opensearch(opensearch_url, **kwargs)
-
-    except ValueError as e:
-        raise ValueError(
-            f"AsyncOpenSearch client string provided is not in proper format. "
-            f"Got error: {e} "
-        )
-    return client
-
-
-async def _bulk_ingest_embeddings(
-    client: Any,
-    index_name: str,
-    embeddings: List[List[float]],
-    texts: Iterable[str],
-    metadatas: Optional[List[dict]] = None,
-    ids: Optional[List[str]] = None,
-    vector_field: str = "embedding",
-    text_field: str = "content",
-    mapping: Optional[Dict] = None,
-    max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
-    is_aoss: bool = False,
-) -> List[str]:
-    """Async Bulk Ingest Embeddings into given index."""
-    if not mapping:
-        mapping = {}
-
-    async_bulk = _import_async_bulk()
-    not_found_error = _import_not_found_error()
-    requests = []
-    return_ids = []
-    mapping = mapping
-
-    try:
-        await client.indices.get(index=index_name)
-    except not_found_error:
-        await client.indices.create(index=index_name, body=mapping)
-
-    for i, text in enumerate(texts):
-        metadata = metadatas[i] if metadatas else {}
-        _id = ids[i] if ids else str(uuid.uuid4())
-        request = {
-            "_op_type": "index",
-            "_index": index_name,
-            vector_field: embeddings[i],
-            text_field: text,
-            "metadata": metadata,
-        }
-        if is_aoss:
-            request["id"] = _id
-        else:
-            request["_id"] = _id
-        requests.append(request)
-        return_ids.append(_id)
-    await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
-    if not is_aoss:
-        await client.indices.refresh(index=index_name)
-    return return_ids
-
-
-def _default_approximate_search_query(
-    query_vector: List[float],
-    k: int = 4,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Approximate k-NN Search, this is the default query."""
-    return {
-        "size": k,
-        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
-    }
-
-
-def _parse_filters(filters: Optional[MetadataFilters]) -> Any:
-    pre_filter = []
-    if filters is not None:
-        for f in filters.legacy_filters():
-            pre_filter.append({f.key: json.loads(str(f.value))})
-
-    return pre_filter
-
-
-def _knn_search_query(
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    """
-    Do knn search.
-
-    If there are no filters do approx-knn search.
-    If there are (pre)-filters, do an exhaustive exact knn search using 'painless
-    scripting'.
-
-    Note that approximate knn search does not support pre-filtering.
-
-    Args:
-        query_embedding: Vector embedding to query.
-        k: Maximum number of results.
-        filters: Optional filters to apply before the search.
-            Supports filter-context queries documented at
-            https://opensearch.org/docs/latest/query-dsl/query-filter-context/
-
-    Returns:
-        Up to k docs closest to query_embedding
-    """
-    if filters is None:
-        search_query = _default_approximate_search_query(
-            query_embedding, k, vector_field=embedding_field
-        )
-    else:
-        pre_filter = _parse_filters(filters)
-        # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
-        search_query = _default_painless_scripting_query(
-            query_embedding,
-            k,
-            space_type="l2Squared",
-            pre_filter={"bool": {"filter": pre_filter}},
-            vector_field=embedding_field,
-        )
-
-    return search_query
-
-
-def _hybrid_search_query(
-    text_field: str,
-    query_str: str,
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    knn_query = _knn_search_query(embedding_field, query_embedding, k, filters)["query"]
-    lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
-
-    parsed_filters = _parse_filters(filters)
-    if len(parsed_filters) > 0:
-        lexical_query["filter"] = parsed_filters
-    return {
-        "size": k,
-        "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
-    }
-
-
-def __get_painless_scripting_source(
-    space_type: str, vector_field: str = "embedding"
-) -> str:
-    """For Painless Scripting, it returns the script source based on space type."""
-    source_value = f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
-    if space_type == "cosineSimilarity":
-        return source_value
-    else:
-        return f"1/{source_value}"
-
-
-def _default_painless_scripting_query(
-    query_vector: List[float],
-    k: int = 4,
-    space_type: str = "l2Squared",
-    pre_filter: Optional[Union[Dict, List]] = None,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Painless Scripting Search, this is the default query."""
-    if not pre_filter:
-        pre_filter = MATCH_ALL_QUERY
-
-    source = __get_painless_scripting_source(space_type, vector_field)
-    return {
-        "size": k,
-        "query": {
-            "script_score": {
-                "query": pre_filter,
-                "script": {
-                    "source": source,
-                    "params": {
-                        "field": vector_field,
-                        "query_value": query_vector,
-                    },
-                },
-            }
-        },
-    }
-
-
-def _is_aoss_enabled(http_auth: Any) -> bool:
-    """Check if the service is http_auth is set as `aoss`."""
-    if (
-        http_auth is not None
-        and hasattr(http_auth, "service")
-        and http_auth.service == "aoss"
-    ):
-        return True
-    return False
-
-
 class OpensearchVectorClient:
     """
     Object encapsulating an Opensearch index that has vector search enabled.
@@ -302,7 +88,7 @@ class OpensearchVectorClient:
 
         self._search_pipeline = search_pipeline
         http_auth = kwargs.get("http_auth")
-        self.is_aoss = _is_aoss_enabled(http_auth=http_auth)
+        self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
         # initialize mapping
         idx_conf = {
             "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
@@ -316,8 +102,8 @@ class OpensearchVectorClient:
                 }
             },
         }
-        self._os_client = _get_async_opensearch_client(self._endpoint, **kwargs)
-        not_found_error = _import_not_found_error()
+        self._os_client = self._get_async_opensearch_client(self._endpoint, **kwargs)
+        not_found_error = self._import_not_found_error()
 
         event_loop = asyncio.get_event_loop()
        try:
@@ -332,6 +118,217 @@ class OpensearchVectorClient:
                 self._os_client.indices.refresh(index=self._index)
             )
 
+    def _import_async_opensearch(self) -> Any:
+        """Import OpenSearch if available, otherwise raise error."""
+        return AsyncOpenSearch
+
+    def _import_async_bulk(self) -> Any:
+        """Import bulk if available, otherwise raise error."""
+        return async_bulk
+
+    def _import_not_found_error(self) -> Any:
+        """Import not found error if available, otherwise raise error."""
+        return NotFoundError
+
+    def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+        """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
+        try:
+            opensearch = self._import_async_opensearch()
+            client = opensearch(opensearch_url, **kwargs)
+
+        except ValueError as e:
+            raise ValueError(
+                f"AsyncOpenSearch client string provided is not in proper format. "
+                f"Got error: {e} "
+            )
+        return client
+
+    async def _bulk_ingest_embeddings(
+        self,
+        client: Any,
+        index_name: str,
+        embeddings: List[List[float]],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        vector_field: str = "embedding",
+        text_field: str = "content",
+        mapping: Optional[Dict] = None,
+        max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+        is_aoss: bool = False,
+    ) -> List[str]:
+        """Async Bulk Ingest Embeddings into given index."""
+        if not mapping:
+            mapping = {}
+
+        async_bulk = self._import_async_bulk()
+        not_found_error = self._import_not_found_error()
+        requests = []
+        return_ids = []
+        mapping = mapping
+
+        try:
+            await client.indices.get(index=index_name)
+        except not_found_error:
+            await client.indices.create(index=index_name, body=mapping)
+
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            _id = ids[i] if ids else str(uuid.uuid4())
+            request = {
+                "_op_type": "index",
+                "_index": index_name,
+                vector_field: embeddings[i],
+                text_field: text,
+                "metadata": metadata,
+            }
+            if is_aoss:
+                request["id"] = _id
+            else:
+                request["_id"] = _id
+            requests.append(request)
+            return_ids.append(_id)
+        await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+        if not is_aoss:
+            await client.indices.refresh(index=index_name)
+        return return_ids
+
+    def _default_approximate_search_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Approximate k-NN Search, this is the default query."""
+        return {
+            "size": k,
+            "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
+        }
+
+    def _parse_filters(self, filters: Optional[MetadataFilters]) -> Any:
+        pre_filter = []
+        if filters is not None:
+            for f in filters.legacy_filters():
+                pre_filter.append({f.key: json.loads(str(f.value))})
+
+        return pre_filter
+
+    def _knn_search_query(
+        self,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        """
+        Do knn search.
+
+        If there are no filters do approx-knn search.
+        If there are (pre)-filters, do an exhaustive exact knn search using 'painless
+        scripting'.
+
+        Note that approximate knn search does not support pre-filtering.
+
+        Args:
+            query_embedding: Vector embedding to query.
+            k: Maximum number of results.
+            filters: Optional filters to apply before the search.
+                Supports filter-context queries documented at
+                https://opensearch.org/docs/latest/query-dsl/query-filter-context/
+
+        Returns:
+            Up to k docs closest to query_embedding
+        """
+        pre_filter = self._parse_filters(filters)
+        if not pre_filter:
+            search_query = self._default_approximate_search_query(
+                query_embedding, k, vector_field=embedding_field
+            )
+        else:
+            # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+            search_query = self._default_painless_scripting_query(
+                query_embedding,
+                k,
+                space_type="l2Squared",
+                pre_filter={"bool": {"filter": pre_filter}},
+                vector_field=embedding_field,
+            )
+
+        return search_query
+
+    def _hybrid_search_query(
+        self,
+        text_field: str,
+        query_str: str,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        knn_query = self._knn_search_query(
+            embedding_field, query_embedding, k, filters
+        )["query"]
+        lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
+
+        parsed_filters = self._parse_filters(filters)
+        if len(parsed_filters) > 0:
+            lexical_query["filter"] = parsed_filters
+        return {
+            "size": k,
+            "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
+        }
+
+    def __get_painless_scripting_source(
+        self, space_type: str, vector_field: str = "embedding"
+    ) -> str:
+        """For Painless Scripting, it returns the script source based on space type."""
+        source_value = (
+            f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
+        )
+        if space_type == "cosineSimilarity":
+            return source_value
+        else:
+            return f"1/{source_value}"
+
+    def _default_painless_scripting_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        space_type: str = "l2Squared",
+        pre_filter: Optional[Union[Dict, List]] = None,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Painless Scripting Search, this is the default query."""
+        if not pre_filter:
+            pre_filter = MATCH_ALL_QUERY
+
+        source = self.__get_painless_scripting_source(space_type, vector_field)
+        return {
+            "size": k,
+            "query": {
+                "script_score": {
+                    "query": pre_filter,
+                    "script": {
+                        "source": source,
+                        "params": {
+                            "field": vector_field,
+                            "query_value": query_vector,
+                        },
+                    },
+                }
+            },
+        }
+
+    def _is_aoss_enabled(self, http_auth: Any) -> bool:
+        """Check if the service is http_auth is set as `aoss`."""
+        if (
+            http_auth is not None
+            and hasattr(http_auth, "service")
+            and http_auth.service == "aoss"
+        ):
+            return True
+        return False
+
     async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
@@ -344,7 +341,7 @@ class OpensearchVectorClient:
             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
             metadatas.append(node_to_metadata_dict(node, remove_text=True))
 
-        return await _bulk_ingest_embeddings(
+        return await self._bulk_ingest_embeddings(
             self._os_client,
             self._index,
             embeddings,
@@ -358,12 +355,12 @@ class OpensearchVectorClient:
             is_aoss=self.is_aoss,
         )
 
-    async def delete_doc_id(self, doc_id: str) -> None:
+    async def delete_by_doc_id(self, doc_id: str) -> None:
         """
-        Delete a document.
+        Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
 
         Args:
-            doc_id (str): document id
+            doc_id (str): a LlamaIndex `Document` id
         """
         search_query = {
             "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
@@ -381,7 +378,7 @@ class OpensearchVectorClient:
         if query_mode == VectorStoreQueryMode.HYBRID:
             if query_str is None or self._search_pipeline is None:
                 raise ValueError(INVALID_HYBRID_QUERY_ERROR)
-            search_query = _hybrid_search_query(
+            search_query = self._hybrid_search_query(
                 self._text_field,
                 query_str,
                 self._embedding_field,
@@ -389,9 +386,11 @@ class OpensearchVectorClient:
                 k,
                 filters=filters,
             )
-            params = {"search_pipeline": self._search_pipeline}
+            params = {
+                "search_pipeline": self._search_pipeline,
+            }
         else:
-            search_query = _knn_search_query(
+            search_query = self._knn_search_query(
                 self._embedding_field, query_embedding, k, filters=filters
             )
             params = None
@@ -399,6 +398,10 @@ class OpensearchVectorClient:
         res = await self._os_client.search(
             index=self._index, body=search_query, params=params
         )
+
+        return self._to_query_result(res)
+
+    def _to_query_result(self, res) -> VectorStoreQueryResult:
         nodes = []
         ids = []
         scores = []
@@ -433,6 +436,7 @@ class OpensearchVectorClient:
                 ids.append(node_id)
                 nodes.append(node)
                 scores.append(hit["_score"])
+
         return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)
 
 
@@ -443,6 +447,35 @@ class OpensearchVectorStore(BasePydanticVectorStore):
     Args:
         client (OpensearchVectorClient): Vector index client to use
         for data insertion/querying.
+
+    Examples:
+        `pip install llama-index-vector-stores-opensearch`
+
+        ```python
+        from llama_index.vector_stores.opensearch import (
+            OpensearchVectorStore,
+            OpensearchVectorClient,
+        )
+
+        # http endpoint for your cluster (opensearch required for vector index usage)
+        endpoint = "http://localhost:9200"
+        # index to demonstrate the VectorStore impl
+        idx = "gpt-index-demo"
+
+        # OpensearchVectorClient stores text in this field by default
+        text_field = "content"
+        # OpensearchVectorClient stores embeddings in this field by default
+        embedding_field = "embedding"
+
+        # OpensearchVectorClient encapsulates logic for a
+        # single opensearch index with vector search enabled
+        client = OpensearchVectorClient(
+            endpoint, idx, 1536, embedding_field=embedding_field, text_field=text_field
+        )
+
+        # initialize vector store
+        vector_store = OpensearchVectorStore(client)
+        ```
     """
 
     stores_text: bool = True
@@ -494,10 +527,10 @@ class OpensearchVectorStore(BasePydanticVectorStore):
 
     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Delete nodes using with ref_doc_id.
+        Delete nodes using a ref_doc_id.
 
         Args:
-            ref_doc_id (str): The doc_id of the document to delete.
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
 
         """
         asyncio.get_event_loop().run_until_complete(
@@ -506,13 +539,13 @@ class OpensearchVectorStore(BasePydanticVectorStore):
 
     async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Async delete nodes using with ref_doc_id.
+        Async delete nodes using a ref_doc_id.
 
         Args:
-            ref_doc_id (str): The doc_id of the document to delete.
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
 
         """
-        await self._client.delete_doc_id(ref_doc_id)
+        await self._client.delete_by_doc_id(ref_doc_id)
 
     def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-index-vector-stores-opensearch
-Version: 0.1.8
+Version: 0.1.10
 Summary: llama-index vector_stores opensearch integration
 License: MIT
 Author: Your Name
@@ -0,0 +1,5 @@
+llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
+llama_index/vector_stores/opensearch/base.py,sha256=LGXHRzcMVpvE-cb778I_W_I7wfGbLKgFQq9jR4OY6N8,19162
+llama_index_vector_stores_opensearch-0.1.10.dist-info/METADATA,sha256=_-b77ZvNRypAK2awVaSGwoKqL3VdyS9SgpfVmyvmH1o,678
+llama_index_vector_stores_opensearch-0.1.10.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+llama_index_vector_stores_opensearch-0.1.10.dist-info/RECORD,,
@@ -1,5 +0,0 @@
-llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
-llama_index/vector_stores/opensearch/base.py,sha256=eHWcqPEeiYObJu8vIVfDPl9ryg8lOE__zEexvFV4lGA,16958
-llama_index_vector_stores_opensearch-0.1.8.dist-info/METADATA,sha256=Oq_NhlE5Mq_tpHFUiERyEgWdruzzUi0XKXdRYpDFMFk,677
-llama_index_vector_stores_opensearch-0.1.8.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-llama_index_vector_stores_opensearch-0.1.8.dist-info/RECORD,,
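The one renamed entry point in this diff is `OpensearchVectorClient.delete_doc_id`, which becomes `delete_by_doc_id` in 0.1.10; the public `OpensearchVectorStore.delete`/`adelete` wrappers keep their signatures and simply call the new name. A minimal usage sketch against 0.1.10, mirroring the new class docstring example (assumes a reachable OpenSearch cluster at localhost:9200; the index name, 1536-dim setting, and document id below are illustrative, not part of the diff):

```python
from llama_index.vector_stores.opensearch import (
    OpensearchVectorClient,
    OpensearchVectorStore,
)

# Wiring as in the 0.1.10 class docstring example: one OpenSearch index
# with k-NN enabled, text in "content", vectors in "embedding".
client = OpensearchVectorClient(
    "http://localhost:9200",
    "gpt-index-demo",
    1536,
    embedding_field="embedding",
    text_field="content",
)
vector_store = OpensearchVectorStore(client)

# Removes every node indexed from the given LlamaIndex Document; internally
# this now awaits client.delete_by_doc_id(ref_doc_id), named delete_doc_id in 0.1.8.
vector_store.delete(ref_doc_id="my-document-id")  # illustrative id
```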