llama-index-vector-stores-opensearch 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama_index/py.typed ADDED
File without changes
llama_index/vector_stores/opensearch/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from llama_index.vector_stores.opensearch.base import (
+     OpensearchVectorStore,
+     OpensearchVectorClient,
+ )
+
+ __all__ = ["OpensearchVectorStore", "OpensearchVectorClient"]
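For reference, a minimal usage sketch of these two exports (the endpoint, index name, and dimension below are illustrative):

```python
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)

# Assumes an OpenSearch cluster is reachable at this endpoint.
client = OpensearchVectorClient("http://localhost:9200", "demo-index", dim=1536)
vector_store = OpensearchVectorStore(client)
```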
llama_index/vector_stores/opensearch/base.py ADDED
@@ -0,0 +1,1167 @@
+ """OpenSearch vector store."""
+
+ import asyncio
+ import uuid
+ from datetime import datetime
+ from typing import Any, Dict, Iterable, List, Optional, Union, cast
+
+ from llama_index.core.async_utils import asyncio_run
+ from llama_index.core.bridge.pydantic import PrivateAttr
+ from llama_index.core.schema import BaseNode, MetadataMode, TextNode
+ from llama_index.core.vector_stores.types import (
+     FilterCondition,
+     FilterOperator,
+     MetadataFilter,
+     MetadataFilters,
+     BasePydanticVectorStore,
+     VectorStoreQuery,
+     VectorStoreQueryMode,
+     VectorStoreQueryResult,
+ )
+ from llama_index.core.vector_stores.utils import (
+     metadata_dict_to_node,
+     node_to_metadata_dict,
+ )
+ from opensearchpy.client import Client as OSClient
+
+ IMPORT_OPENSEARCH_PY_ERROR = (
+     "Could not import OpenSearch. Please install it with `pip install opensearch-py`."
+ )
+ IMPORT_ASYNC_OPENSEARCH_PY_ERROR = "Could not import AsyncOpenSearch. Please install it with `pip install opensearch-py`."
+ INVALID_HYBRID_QUERY_ERROR = (
+     "Please specify the lexical_query and search_pipeline for hybrid search."
+ )
+ MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict
+
+
+ class OpensearchVectorClient:
+     """
+     Object encapsulating an OpenSearch index that has vector search enabled.
+
+     If the index does not yet exist, it is created during init.
+     Therefore, the underlying index is assumed to either:
+     1) not exist yet, or 2) have been created by previous usage of this class.
+
+     Args:
+         endpoint (str): URL (http/https) of the OpenSearch endpoint.
+         index (str): Name of the OpenSearch index.
+         dim (int): Dimension of the vector.
+         embedding_field (str): Name of the field in the index to store
+             the embedding array in.
+         text_field (str): Name of the field to grab text from.
+         method (Optional[dict]): OpenSearch "method" JSON obj for configuring
+             the KNN index. This includes engine, metric, and other config params.
+             Defaults to:
+             {"name": "hnsw", "space_type": "l2", "engine": "nmslib",
+             "parameters": {"ef_construction": 256, "m": 48}}
+         settings (Optional[dict]): Settings for the OpenSearch index creation. Defaults to:
+             {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
+         space_type (Optional[str]): Space type for distance metric calculation. Defaults to: l2
+         os_client (Optional[OSClient]): Custom synchronous client (see OpenSearch from opensearch-py)
+         os_async_client (Optional[OSClient]): Custom asynchronous client (see AsyncOpenSearch from opensearch-py)
+         excluded_source_fields (Optional[List[str]]): Optional list of document "source" fields to exclude from OpenSearch responses.
+         **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.
+
+     """
+
67
+     def __init__(
+         self,
+         endpoint: str,
+         index: str,
+         dim: int,
+         embedding_field: str = "embedding",
+         text_field: str = "content",
+         method: Optional[dict] = None,
+         settings: Optional[dict] = None,
+         engine: Optional[str] = "nmslib",
+         space_type: Optional[str] = "l2",
+         max_chunk_bytes: int = 1 * 1024 * 1024,
+         search_pipeline: Optional[str] = None,
+         os_client: Optional[OSClient] = None,
+         os_async_client: Optional[OSClient] = None,
+         excluded_source_fields: Optional[List[str]] = None,
+         **kwargs: Any,
+     ):
+         """Init params."""
+         if method is None:
+             method = {
+                 "name": "hnsw",
+                 "space_type": "l2",
+                 "engine": engine,
+                 "parameters": {"ef_construction": 256, "m": 48},
+             }
+         if settings is None:
+             settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
+         if embedding_field is None:
+             embedding_field = "embedding"
+
+         self._method = method
+         self._embedding_field = embedding_field
+         self._endpoint = endpoint
+         self._dim = dim
+         self._index = index
+         self._text_field = text_field
+         self._max_chunk_bytes = max_chunk_bytes
+         self._excluded_source_fields = excluded_source_fields
+
+         self._search_pipeline = search_pipeline
+         http_auth = kwargs.get("http_auth")
+         self.space_type = space_type
+         self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
+         # initialize mapping
+         idx_conf = {
+             "settings": settings,
+             "mappings": {
+                 "properties": {
+                     embedding_field: {
+                         "type": "knn_vector",
+                         "dimension": dim,
+                         "method": method,
+                     },
+                 }
+             },
+         }
+         self._os_client = os_client or self._get_opensearch_client(
+             self._endpoint, **kwargs
+         )
+         self._os_async_client = os_async_client or self._get_async_opensearch_client(
+             self._endpoint, **kwargs
+         )
+         self._efficient_filtering_enabled = self._is_efficient_filtering_enabled()
+         not_found_error = self._import_not_found_error()
+
+         try:
+             self._os_client.indices.get(index=self._index)
+         except TypeError:
+             # Probably using async so switch to async client
+             try:
+                 asyncio_run(self._os_async_client.indices.get(index=self._index))
+             except not_found_error:
+                 asyncio_run(
+                     self._os_async_client.indices.create(
+                         index=self._index, body=idx_conf
+                     )
+                 )
+                 if self.is_aoss:
+                     asyncio_run(self._os_async_client.indices.exists(index=self._index))
+                 else:
+                     asyncio_run(
+                         self._os_async_client.indices.refresh(index=self._index)
+                     )
+         except not_found_error:
+             self._os_client.indices.create(index=self._index, body=idx_conf)
+             if self.is_aoss:
+                 self._os_client.indices.exists(index=self._index)
+             else:
+                 self._os_client.indices.refresh(index=self._index)
+
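As a sketch of the `method` and `settings` parameters handled above (all values are illustrative; omitting them falls back to the nmslib/l2 defaults shown in the code):

```python
# Hypothetical client using the faiss engine, which also unlocks the
# efficient-kNN-filtering branch in _knn_search_query below (OpenSearch >= 2.9).
client = OpensearchVectorClient(
    endpoint="http://localhost:9200",  # illustrative endpoint
    index="my-vectors",                # illustrative index name
    dim=1536,
    method={
        "name": "hnsw",
        "space_type": "l2",
        "engine": "faiss",
        "parameters": {"ef_construction": 256, "m": 48},
    },
    settings={"index": {"knn": True, "knn.algo_param.ef_search": 100}},
)
```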
158
+     def _import_opensearch(self) -> Any:
+         """Import OpenSearch if available, otherwise raise error."""
+         try:
+             from opensearchpy import OpenSearch
+         except ImportError:
+             raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
+         return OpenSearch
+
+     def _import_async_opensearch(self) -> Any:
+         """Import AsyncOpenSearch if available, otherwise raise error."""
+         try:
+             from opensearchpy import AsyncOpenSearch
+         except ImportError:
+             raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
+         return AsyncOpenSearch
+
+     def _import_bulk(self) -> Any:
+         """Import bulk if available, otherwise raise error."""
+         try:
+             from opensearchpy.helpers import bulk
+         except ImportError:
+             raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
+         return bulk
+
+     def _import_async_bulk(self) -> Any:
+         """Import async_bulk if available, otherwise raise error."""
+         try:
+             from opensearchpy.helpers import async_bulk
+         except ImportError:
+             raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
+         return async_bulk
+
+     def _import_not_found_error(self) -> Any:
+         """Import NotFoundError if available, otherwise raise error."""
+         try:
+             from opensearchpy.exceptions import NotFoundError
+         except ImportError:
+             raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
+         return NotFoundError
+
+     def _get_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+         """Get OpenSearch client from the opensearch_url, otherwise raise error."""
+         try:
+             opensearch = self._import_opensearch()
+             client = opensearch(opensearch_url, **kwargs)
+         except ValueError as e:
+             # Raise ValueError (not ImportError), mirroring the async variant below.
+             raise ValueError(
+                 f"OpenSearch client string provided is not in proper format. "
+                 f"Got error: {e} "
+             )
+         return client
+
+     def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+         """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
+         try:
+             opensearch = self._import_async_opensearch()
+             client = opensearch(opensearch_url, **kwargs)
+         except ValueError as e:
+             raise ValueError(
+                 f"AsyncOpenSearch client string provided is not in proper format. "
+                 f"Got error: {e} "
+             )
+         return client
+
223
+     def _get_opensearch_version(self) -> str:
+         info = self._os_client.info()
+         return info["version"]["number"]
+
+     def _bulk_ingest_embeddings(
+         self,
+         client: Any,
+         index_name: str,
+         embeddings: List[List[float]],
+         texts: Iterable[str],
+         metadatas: Optional[List[dict]] = None,
+         ids: Optional[List[str]] = None,
+         vector_field: str = "embedding",
+         text_field: str = "content",
+         mapping: Optional[Dict] = None,
+         max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+         is_aoss: bool = False,
+     ) -> List[str]:
+         """Bulk Ingest Embeddings into given index."""
+         if not mapping:
+             mapping = {}
+
+         bulk = self._import_bulk()
+         not_found_error = self._import_not_found_error()
+         requests = []
+         return_ids = []
+
+         try:
+             client.indices.get(index=index_name)
+         except not_found_error:
+             client.indices.create(index=index_name, body=mapping)
+
+         for i, text in enumerate(texts):
+             metadata = metadatas[i] if metadatas else {}
+             _id = ids[i] if ids else str(uuid.uuid4())
+             request = {
+                 "_op_type": "index",
+                 "_index": index_name,
+                 vector_field: embeddings[i],
+                 text_field: text,
+                 "metadata": metadata,
+             }
+             if is_aoss:
+                 request["id"] = _id
+             else:
+                 request["_id"] = _id
+             requests.append(request)
+             return_ids.append(_id)
+
+         bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+         if not is_aoss:
+             client.indices.refresh(index=index_name)
+
+         return return_ids
+
278
+     async def _abulk_ingest_embeddings(
+         self,
+         client: Any,
+         index_name: str,
+         embeddings: List[List[float]],
+         texts: Iterable[str],
+         metadatas: Optional[List[dict]] = None,
+         ids: Optional[List[str]] = None,
+         vector_field: str = "embedding",
+         text_field: str = "content",
+         mapping: Optional[Dict] = None,
+         max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+         is_aoss: bool = False,
+     ) -> List[str]:
+         """Async Bulk Ingest Embeddings into given index."""
+         if not mapping:
+             mapping = {}
+
+         async_bulk = self._import_async_bulk()
+         not_found_error = self._import_not_found_error()
+         requests = []
+         return_ids = []
+
+         try:
+             await client.indices.get(index=index_name)
+         except not_found_error:
+             await client.indices.create(index=index_name, body=mapping)
+
+         for i, text in enumerate(texts):
+             metadata = metadatas[i] if metadatas else {}
+             _id = ids[i] if ids else str(uuid.uuid4())
+             request = {
+                 "_op_type": "index",
+                 "_index": index_name,
+                 vector_field: embeddings[i],
+                 text_field: text,
+                 "metadata": metadata,
+             }
+             if is_aoss:
+                 request["id"] = _id
+             else:
+                 request["_id"] = _id
+             requests.append(request)
+             return_ids.append(_id)
+
+         await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+         if not is_aoss:
+             await client.indices.refresh(index=index_name)
+
+         return return_ids
+
329
+     def _default_approximate_search_query(
+         self,
+         query_vector: List[float],
+         k: int = 4,
+         filters: Optional[Union[Dict, List]] = None,
+         vector_field: str = "embedding",
+         excluded_source_fields: Optional[List[str]] = None,
+     ) -> Dict:
+         """For Approximate k-NN Search, this is the default query."""
+         query = {
+             "size": k,
+             "query": {
+                 "knn": {
+                     vector_field: {
+                         "vector": query_vector,
+                         "k": k,
+                     }
+                 }
+             },
+         }
+
+         if filters:
+             # The filter key must be added only when filtering, to avoid a
+             # "filter doesn't support values of type: START_ARRAY" exception.
+             query["query"]["knn"][vector_field]["filter"] = filters
+         if excluded_source_fields:
+             query["_source"] = {"exclude": excluded_source_fields}
+         return query
+
+     def _is_text_field(self, value: Any) -> bool:
+         """
+         Check if value is a string on which keyword filtering needs to be performed.
+
+         Not applied to datetime strings.
+         """
+         if isinstance(value, str):
+             try:
+                 datetime.fromisoformat(value)
+                 return False
+             except ValueError:
+                 return True
+         else:
+             return False
+
372
+     def _parse_filter(self, filter: MetadataFilter) -> dict:
+         """
+         Parse a single MetadataFilter to the equivalent OpenSearch expression.
+
+         As OpenSearch does not differentiate between scalar/array keyword fields, IN and ANY are equivalent.
+         """
+         key = f"metadata.{filter.key}"
+         op = filter.operator
+
+         equality_postfix = ".keyword" if self._is_text_field(value=filter.value) else ""
+
+         if op == FilterOperator.EQ:
+             return {"term": {f"{key}{equality_postfix}": filter.value}}
+         elif op in [
+             FilterOperator.GT,
+             FilterOperator.GTE,
+             FilterOperator.LT,
+             FilterOperator.LTE,
+         ]:
+             return {"range": {key: {filter.operator.name.lower(): filter.value}}}
+         elif op == FilterOperator.NE:
+             return {
+                 "bool": {
+                     "must_not": {"term": {f"{key}{equality_postfix}": filter.value}}
+                 }
+             }
+         elif op in [FilterOperator.IN, FilterOperator.ANY]:
+             if isinstance(filter.value, list) and all(
+                 self._is_text_field(val) for val in filter.value
+             ):
+                 return {"terms": {f"{key}.keyword": filter.value}}
+             else:
+                 return {"terms": {key: filter.value}}
+         elif op == FilterOperator.NIN:
+             return {"bool": {"must_not": {"terms": {key: filter.value}}}}
+         elif op == FilterOperator.ALL:
+             return {
+                 "terms_set": {
+                     key: {
+                         "terms": filter.value,
+                         "minimum_should_match_script": {"source": "params.num_terms"},
+                     }
+                 }
+             }
+         elif op == FilterOperator.TEXT_MATCH:
+             return {"match": {key: {"query": filter.value, "fuzziness": "AUTO"}}}
+         elif op == FilterOperator.CONTAINS:
+             return {"wildcard": {key: f"*{filter.value}*"}}
+         elif op == FilterOperator.IS_EMPTY:
+             return {"bool": {"must_not": {"exists": {"field": key}}}}
+         else:
+             raise ValueError(f"Unsupported filter operator: {filter.operator}")
+
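A sketch of what `_parse_filter` produces for a couple of operators (the keys and values are illustrative; the DSL follows directly from the branches above):

```python
from llama_index.core.vector_stores.types import FilterOperator, MetadataFilter

eq_filter = MetadataFilter(key="author", value="alice", operator=FilterOperator.EQ)
# "alice" is a non-datetime string, so the term query targets the keyword sub-field:
# {"term": {"metadata.author.keyword": "alice"}}

range_filter = MetadataFilter(key="year", value=2020, operator=FilterOperator.GTE)
# {"range": {"metadata.year": {"gte": 2020}}}
```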
425
+     def _parse_filters_recursively(self, filters: MetadataFilters) -> dict:
+         """Parse (possibly nested) MetadataFilters to equivalent OpenSearch expression."""
+         condition_map = {FilterCondition.AND: "must", FilterCondition.OR: "should"}
+
+         bool_clause = condition_map[filters.condition]
+         bool_query: dict[str, dict[str, list[dict]]] = {"bool": {bool_clause: []}}
+
+         for filter_item in filters.filters:
+             if isinstance(filter_item, MetadataFilter):
+                 bool_query["bool"][bool_clause].append(self._parse_filter(filter_item))
+             elif isinstance(filter_item, MetadataFilters):
+                 bool_query["bool"][bool_clause].append(
+                     self._parse_filters_recursively(filter_item)
+                 )
+             else:
+                 raise ValueError(f"Unsupported filter type: {type(filter_item)}")
+
+         return bool_query
+
+     def _parse_filters(self, filters: Optional[MetadataFilters]) -> List[dict]:
+         """Parse MetadataFilters to equivalent OpenSearch expression."""
+         if filters is None:
+             return []
+         return [self._parse_filters_recursively(filters=filters)]
+
450
+     def _knn_search_query(
+         self,
+         embedding_field: str,
+         query_embedding: List[float],
+         k: int,
+         filters: Optional[MetadataFilters] = None,
+         search_method: str = "approximate",
+         excluded_source_fields: Optional[List[str]] = None,
+     ) -> Dict:
+         """
+         Build the query body for a k-Nearest Neighbors (kNN) search.
+
+         If the search method is "approximate" and the engine is "lucene" or "faiss",
+         use efficient kNN filtering. Otherwise, perform an exhaustive exact kNN
+         search using painless scripting, falling back to the knn_score scoring
+         script on AWS OpenSearch Serverless, which does not support painless
+         scripting.
+
+         Note:
+             - AWS OpenSearch Serverless does not support the painless scripting functionality at this time according to AWS.
+             - Approximate kNN search does not support pre-filtering.
+
+         Args:
+             embedding_field (str): Name of the index field holding the embeddings.
+             query_embedding (List[float]): Vector embedding to query.
+             k (int): Maximum number of results.
+             filters (Optional[MetadataFilters]): Optional filters to apply for the search.
+                 Supports filter-context queries documented at
+                 https://opensearch.org/docs/latest/query-dsl/query-filter-context/
+             excluded_source_fields: Optional list of document "source" fields to exclude from the response.
+
+         Returns:
+             Dict: Query body returning up to k documents closest to query_embedding.
+
+         """
+         filters = self._parse_filters(filters)
+
+         if not filters:
+             search_query = self._default_approximate_search_query(
+                 query_embedding,
+                 k,
+                 vector_field=embedding_field,
+                 excluded_source_fields=excluded_source_fields,
+             )
+         elif (
+             search_method == "approximate"
+             and self._method["engine"] in ["lucene", "faiss"]
+             and self._efficient_filtering_enabled
+         ):
+             # if engine is lucene or faiss, opensearch recommends efficient-kNN filtering.
+             search_query = self._default_approximate_search_query(
+                 query_embedding,
+                 k,
+                 filters={"bool": {"filter": filters}},
+                 vector_field=embedding_field,
+                 excluded_source_fields=excluded_source_fields,
+             )
+         else:
+             if self.is_aoss:
+                 # if is_aoss is set we are using the OpenSearch Serverless AWS offering, which cannot use
+                 # painless scripting, so the default scoring script returned is just the normal knn_score script
+                 search_query = self._default_scoring_script_query(
+                     query_embedding,
+                     k,
+                     space_type=self.space_type,
+                     pre_filter={"bool": {"filter": filters}},
+                     vector_field=embedding_field,
+                     excluded_source_fields=excluded_source_fields,
+                 )
+             else:
+                 # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+                 search_query = self._default_scoring_script_query(
+                     query_embedding,
+                     k,
+                     space_type="l2Squared",
+                     pre_filter={"bool": {"filter": filters}},
+                     vector_field=embedding_field,
+                     excluded_source_fields=excluded_source_fields,
+                 )
+         return search_query
+
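For the unfiltered branch above, the request body built by `_default_approximate_search_query` has this shape (a sketch with k=4 against the default embedding field):

```python
approximate_knn_body = {
    "size": 4,
    "query": {
        "knn": {
            "embedding": {
                "vector": query_embedding,  # the query vector, List[float]
                "k": 4,
            }
        }
    },
}
```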
532
+     def _hybrid_search_query(
+         self,
+         text_field: str,
+         query_str: str,
+         embedding_field: str,
+         query_embedding: List[float],
+         k: int,
+         filters: Optional[MetadataFilters] = None,
+         excluded_source_fields: Optional[List[str]] = None,
+     ) -> Dict:
+         knn_query = self._knn_search_query(embedding_field, query_embedding, k, filters)
+         lexical_query = self._lexical_search_query(text_field, query_str, k, filters)
+
+         query = {
+             "size": k,
+             "query": {
+                 "hybrid": {"queries": [lexical_query["query"], knn_query["query"]]}
+             },
+         }
+         if excluded_source_fields:
+             query["_source"] = {"exclude": excluded_source_fields}
+         return query
+
555
+     def _lexical_search_query(
+         self,
+         text_field: str,
+         query_str: str,
+         k: int,
+         filters: Optional[MetadataFilters] = None,
+         excluded_source_fields: Optional[List[str]] = None,
+     ) -> Dict:
+         lexical_query = {
+             "bool": {"must": {"match": {text_field: {"query": query_str}}}}
+         }
+
+         parsed_filters = self._parse_filters(filters)
+         if len(parsed_filters) > 0:
+             lexical_query["bool"]["filter"] = parsed_filters
+
+         query = {
+             "size": k,
+             "query": lexical_query,
+         }
+         if excluded_source_fields:
+             query["_source"] = {"exclude": excluded_source_fields}
+         return query
+
579
+     def __get_painless_scripting_source(
+         self, space_type: str, vector_field: str = "embedding"
+     ) -> str:
+         """
+         For Painless Scripting, return the script source based on space type.
+
+         This does not currently work with OpenSearch Serverless.
+         """
+         source_value = (
+             f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
+         )
+         if space_type == "cosineSimilarity":
+             return source_value
+         else:
+             return f"1/{source_value}"
+
+     def _get_knn_scoring_script(self, space_type, vector_field, query_vector):
+         """Default scoring script, which also works with AWS OpenSearch Serverless."""
+         return {
+             "source": "knn_score",
+             "lang": "knn",
+             "params": {
+                 "field": vector_field,
+                 "query_value": query_vector,
+                 "space_type": space_type,
+             },
+         }
+
+     def _get_painless_scoring_script(self, space_type, vector_field, query_vector):
+         source = self.__get_painless_scripting_source(space_type, vector_field)
+         return {
+             "source": source,
+             "params": {
+                 "field": vector_field,
+                 "query_value": query_vector,
+             },
+         }
+
616
+     def _default_scoring_script_query(
+         self,
+         query_vector: List[float],
+         k: int = 4,
+         space_type: str = "l2Squared",
+         pre_filter: Optional[Union[Dict, List]] = None,
+         vector_field: str = "embedding",
+         excluded_source_fields: Optional[List[str]] = None,
+     ) -> Dict:
+         """
+         For Scoring Script Search, this is the default query. It has to account for
+         Amazon OpenSearch Serverless, which does not support painless scripting
+         functions and therefore defaults to the knn_score script.
+         """
+         if not pre_filter:
+             pre_filter = MATCH_ALL_QUERY
+
+         # check if we can use painless scripting or have to use the default knn_score script
+         if self.is_aoss:
+             if space_type == "l2Squared":
+                 raise ValueError(
+                     "Unsupported space type for aoss. Can only use l1, l2, cosinesimil."
+                 )
+             script = self._get_knn_scoring_script(
+                 space_type, vector_field, query_vector
+             )
+         else:
+             script = self._get_painless_scoring_script(
+                 space_type, vector_field, query_vector
+             )
+         query = {
+             "size": k,
+             "query": {
+                 "script_score": {
+                     "query": pre_filter,
+                     "script": script,
+                 }
+             },
+         }
+         if excluded_source_fields:
+             query["_source"] = {"exclude": excluded_source_fields}
+         return query
+
658
+     def _is_aoss_enabled(self, http_auth: Any) -> bool:
+         """Check whether `http_auth` is configured for the `aoss` service (Amazon OpenSearch Serverless)."""
+         return (
+             http_auth is not None
+             and hasattr(http_auth, "service")
+             and http_auth.service == "aoss"
+         )
+
+     def _is_efficient_filtering_enabled(self) -> bool:
+         """Check if kNN with efficient filtering is enabled."""
+         # Technically, AOSS supports efficient filtering,
+         # but we can't check the version number using .info(); AOSS doesn't support 'GET /',
+         # so we must skip the check and disable it by default.
+         if self.is_aoss:
+             ef_enabled = False
+         else:
+             self._os_version = self._get_opensearch_version()
+             major, minor, patch = self._os_version.split(".")
+             ef_enabled = int(major) > 2 or (int(major) == 2 and int(minor) >= 9)
+         return ef_enabled
+
679
+     def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
+         """Store results in the index."""
+         embeddings: List[List[float]] = []
+         texts: List[str] = []
+         metadatas: List[dict] = []
+         ids: List[str] = []
+         for node in nodes:
+             ids.append(node.node_id)
+             embeddings.append(node.get_embedding())
+             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
+             metadatas.append(node_to_metadata_dict(node, remove_text=True))
+
+         return self._bulk_ingest_embeddings(
+             self._os_client,
+             self._index,
+             embeddings,
+             texts,
+             metadatas=metadatas,
+             ids=ids,
+             vector_field=self._embedding_field,
+             text_field=self._text_field,
+             mapping=None,
+             max_chunk_bytes=self._max_chunk_bytes,
+             is_aoss=self.is_aoss,
+         )
+
+     async def aindex_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
+         """Store results in the index."""
+         embeddings: List[List[float]] = []
+         texts: List[str] = []
+         metadatas: List[dict] = []
+         ids: List[str] = []
+         for node in nodes:
+             ids.append(node.node_id)
+             embeddings.append(node.get_embedding())
+             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
+             metadatas.append(node_to_metadata_dict(node, remove_text=True))
+
+         return await self._abulk_ingest_embeddings(
+             self._os_async_client,
+             self._index,
+             embeddings,
+             texts,
+             metadatas=metadatas,
+             ids=ids,
+             vector_field=self._embedding_field,
+             text_field=self._text_field,
+             mapping=None,
+             max_chunk_bytes=self._max_chunk_bytes,
+             is_aoss=self.is_aoss,
+         )
+
731
+     def delete_by_doc_id(self, doc_id: str) -> None:
+         """
+         Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
+
+         Args:
+             doc_id (str): a LlamaIndex `Document` id
+
+         """
+         search_query = {
+             "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
+         }
+         self._os_client.delete_by_query(
+             index=self._index, body=search_query, refresh=True
+         )
+
+     async def adelete_by_doc_id(self, doc_id: str) -> None:
+         """
+         Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
+
+         Args:
+             doc_id (str): a LlamaIndex `Document` id
+
+         """
+         search_query = {
+             "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
+         }
+         await self._os_async_client.delete_by_query(
+             index=self._index, body=search_query, refresh=True
+         )
+
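A usage sketch for the deletion helpers above (the id is whatever `doc_id` was stored in each node's metadata at ingest time):

```python
# Removes every chunk ingested from this source document via a
# delete_by_query on metadata.doc_id.keyword, as shown above.
client.delete_by_doc_id("my-document-id")  # hypothetical document id
```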
761
+     def delete_nodes(
+         self,
+         node_ids: Optional[List[str]] = None,
+         filters: Optional[MetadataFilters] = None,
+         **delete_kwargs: Any,
+     ) -> None:
+         """
+         Deletes nodes.
+
+         Args:
+             node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
+             filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
+
+         """
+         if not node_ids and not filters:
+             return
+
+         query = {"query": {"bool": {"filter": []}}}
+         if node_ids:
+             query["query"]["bool"]["filter"].append({"terms": {"_id": node_ids or []}})
+
+         if filters:
+             query["query"]["bool"]["filter"].extend(self._parse_filters(filters))
+
+         self._os_client.delete_by_query(index=self._index, body=query, refresh=True)
+
+     async def adelete_nodes(
+         self,
+         node_ids: Optional[List[str]] = None,
+         filters: Optional[MetadataFilters] = None,
+         **delete_kwargs: Any,
+     ) -> None:
+         """
+         Async deletes nodes.
+
+         Args:
+             node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
+             filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
+
+         """
+         if not node_ids and not filters:
+             return
+
+         query = {"query": {"bool": {"filter": []}}}
+         if node_ids:
+             query["query"]["bool"]["filter"].append({"terms": {"_id": node_ids or []}})
+
+         if filters:
+             query["query"]["bool"]["filter"].extend(self._parse_filters(filters))
+
+         await self._os_async_client.delete_by_query(
+             index=self._index, body=query, refresh=True
+         )
+
815
+     def clear(self) -> None:
+         """Clears index."""
+         query = {"query": {"bool": {"filter": []}}}
+         self._os_client.delete_by_query(index=self._index, body=query, refresh=True)
+
+     async def aclear(self) -> None:
+         """Clears index."""
+         query = {"query": {"bool": {"filter": []}}}
+         await self._os_async_client.delete_by_query(
+             index=self._index, body=query, refresh=True
+         )
+
827
+     def close(self) -> None:
+         """Close the OpenSearch clients and release resources."""
+         self._os_client.close()
+         try:
+             loop = asyncio.get_running_loop()
+         except RuntimeError:
+             # No running loop: run async close directly
+             asyncio.run(self._os_async_client.close())
+         else:
+             # Running loop: schedule async close
+             loop.create_task(self._os_async_client.close())
+
+     async def aclose(self) -> None:
+         """Asynchronously close the OpenSearch clients and release resources."""
+         self._os_client.close()
+         await self._os_async_client.close()
+
844
+     def query(
+         self,
+         query_mode: VectorStoreQueryMode,
+         query_str: Optional[str],
+         query_embedding: List[float],
+         k: int,
+         filters: Optional[MetadataFilters] = None,
+     ) -> VectorStoreQueryResult:
+         if query_mode == VectorStoreQueryMode.HYBRID:
+             if query_str is None or self._search_pipeline is None:
+                 raise ValueError(INVALID_HYBRID_QUERY_ERROR)
+             search_query = self._hybrid_search_query(
+                 self._text_field,
+                 query_str,
+                 self._embedding_field,
+                 query_embedding,
+                 k,
+                 filters=filters,
+                 excluded_source_fields=self._excluded_source_fields,
+             )
+             params = {
+                 "search_pipeline": self._search_pipeline,
+             }
+         elif query_mode == VectorStoreQueryMode.TEXT_SEARCH:
+             search_query = self._lexical_search_query(
+                 self._text_field,
+                 query_str,
+                 k,
+                 filters=filters,
+                 excluded_source_fields=self._excluded_source_fields,
+             )
+             params = None
+         else:
+             search_query = self._knn_search_query(
+                 self._embedding_field,
+                 query_embedding,
+                 k,
+                 filters=filters,
+                 excluded_source_fields=self._excluded_source_fields,
+             )
+             params = None
+
+         res = self._os_client.search(
+             index=self._index, body=search_query, params=params
+         )
+
+         return self._to_query_result(res)
+
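Hybrid mode requires both a `query_str` and a `search_pipeline`; a sketch of a hybrid query through the client (the pipeline name is an assumption and must already exist on the cluster as a score normalization/combination search pipeline):

```python
from llama_index.core.vector_stores.types import VectorStoreQueryMode

client = OpensearchVectorClient(
    "http://localhost:9200",
    "demo-index",
    dim=1536,
    search_pipeline="hybrid-pipeline",  # hypothetical, pre-created pipeline
)
result = client.query(
    VectorStoreQueryMode.HYBRID,
    query_str="how do I reset my password",
    query_embedding=query_embedding,  # hypothetical List[float] embedding
    k=5,
)
```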
892
+     async def aquery(
+         self,
+         query_mode: VectorStoreQueryMode,
+         query_str: Optional[str],
+         query_embedding: List[float],
+         k: int,
+         filters: Optional[MetadataFilters] = None,
+     ) -> VectorStoreQueryResult:
+         if query_mode == VectorStoreQueryMode.HYBRID:
+             if query_str is None or self._search_pipeline is None:
+                 raise ValueError(INVALID_HYBRID_QUERY_ERROR)
+             search_query = self._hybrid_search_query(
+                 self._text_field,
+                 query_str,
+                 self._embedding_field,
+                 query_embedding,
+                 k,
+                 filters=filters,
+                 excluded_source_fields=self._excluded_source_fields,
+             )
+             params = {
+                 "search_pipeline": self._search_pipeline,
+             }
+         elif query_mode == VectorStoreQueryMode.TEXT_SEARCH:
+             search_query = self._lexical_search_query(
+                 self._text_field,
+                 query_str,
+                 k,
+                 filters=filters,
+                 excluded_source_fields=self._excluded_source_fields,
+             )
+             params = None
+         else:
+             search_query = self._knn_search_query(
+                 self._embedding_field,
+                 query_embedding,
+                 k,
+                 filters=filters,
+                 excluded_source_fields=self._excluded_source_fields,
+             )
+             params = None
+
+         res = await self._os_async_client.search(
+             index=self._index, body=search_query, params=params
+         )
+
+         return self._to_query_result(res)
+
940
+     def _to_query_result(self, res) -> VectorStoreQueryResult:
+         nodes = []
+         ids = []
+         scores = []
+         for hit in res["hits"]["hits"]:
+             source = hit["_source"]
+             node_id = hit["_id"]
+             text = source[self._text_field]
+             metadata = source.get("metadata", None)
+
+             try:
+                 node = metadata_dict_to_node(metadata)
+                 node.text = text
+             except Exception:
+                 # TODO: Legacy support for old nodes
+                 node_info = source.get("node_info")
+                 relationships = source.get("relationships") or {}
+                 start_char_idx = None
+                 end_char_idx = None
+                 if isinstance(node_info, dict):
+                     start_char_idx = node_info.get("start", None)
+                     end_char_idx = node_info.get("end", None)
+
+                 node = TextNode(
+                     text=text,
+                     metadata=metadata,
+                     id_=node_id,
+                     start_char_idx=start_char_idx,
+                     end_char_idx=end_char_idx,
+                     relationships=relationships,
+                 )
+             ids.append(node_id)
+             nodes.append(node)
+             scores.append(hit["_score"])
+
+         return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)
+
+
978
+ class OpensearchVectorStore(BasePydanticVectorStore):
+     """
+     OpenSearch vector store.
+
+     Args:
+         client (OpensearchVectorClient): Vector index client to use
+             for data insertion/querying.
+
+     Examples:
+         `pip install llama-index-vector-stores-opensearch`
+
+         ```python
+         from llama_index.vector_stores.opensearch import (
+             OpensearchVectorStore,
+             OpensearchVectorClient,
+         )
+
+         # http endpoint for your cluster (opensearch required for vector index usage)
+         endpoint = "http://localhost:9200"
+         # index to demonstrate the VectorStore impl
+         idx = "gpt-index-demo"
+
+         # OpensearchVectorClient stores text in this field by default
+         text_field = "content"
+         # OpensearchVectorClient stores embeddings in this field by default
+         embedding_field = "embedding"
+
+         # OpensearchVectorClient encapsulates logic for a
+         # single opensearch index with vector search enabled
+         client = OpensearchVectorClient(
+             endpoint, idx, 1536, embedding_field=embedding_field, text_field=text_field
+         )
+
+         # initialize vector store
+         vector_store = OpensearchVectorStore(client)
+         ```
+
+     """
+
1017
+     stores_text: bool = True
+     _client: OpensearchVectorClient = PrivateAttr(default=None)
+
+     def __init__(
+         self,
+         client: OpensearchVectorClient,
+     ) -> None:
+         """Initialize params."""
+         super().__init__()
+         self._client = client
+
+     @property
+     def client(self) -> Any:
+         """Get client."""
+         return self._client
+
+     def add(
+         self,
+         nodes: List[BaseNode],
+         **add_kwargs: Any,
+     ) -> List[str]:
+         """
+         Add nodes to index.
+
+         Args:
+             nodes (List[BaseNode]): list of nodes with embeddings.
+
+         """
+         self._client.index_results(nodes)
+         return [result.node_id for result in nodes]
+
+     async def async_add(
+         self,
+         nodes: List[BaseNode],
+         **add_kwargs: Any,
+     ) -> List[str]:
+         """
+         Async add nodes to index.
+
+         Args:
+             nodes (List[BaseNode]): list of nodes with embeddings.
+
+         """
+         await self._client.aindex_results(nodes)
+         return [result.node_id for result in nodes]
+
1063
+     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+         """
+         Delete nodes with the given ref_doc_id.
+
+         Args:
+             ref_doc_id (str): The doc_id of the document to delete.
+
+         """
+         self._client.delete_by_doc_id(ref_doc_id)
+
+     async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+         """
+         Async delete nodes with the given ref_doc_id.
+
+         Args:
+             ref_doc_id (str): The doc_id of the document to delete.
+
+         """
+         await self._client.adelete_by_doc_id(ref_doc_id)
+
+     def delete_nodes(
+         self,
+         node_ids: Optional[List[str]] = None,
+         filters: Optional[MetadataFilters] = None,
+         **delete_kwargs: Any,
+     ) -> None:
+         """
+         Deletes nodes.
+
+         Args:
+             node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
+             filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
+
+         """
+         self._client.delete_nodes(node_ids, filters, **delete_kwargs)
+
+     async def adelete_nodes(
+         self,
+         node_ids: Optional[List[str]] = None,
+         filters: Optional[MetadataFilters] = None,
+         **delete_kwargs: Any,
+     ) -> None:
+         """
+         Async deletes nodes.
+
+         Args:
+             node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
+             filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
+
+         """
+         await self._client.adelete_nodes(node_ids, filters, **delete_kwargs)
+
1115
+     def clear(self) -> None:
+         """Clears index."""
+         self._client.clear()
+
+     async def aclear(self) -> None:
+         """Async clears index."""
+         await self._client.aclear()
+
+     def close(self) -> None:
+         """Close the vector store and release resources."""
+         self._client.close()
+
+     async def aclose(self) -> None:
+         """Asynchronously close the vector store and release resources."""
+         await self._client.aclose()
+
+     def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
+         """
+         Query index for top k most similar nodes.
+
+         Args:
+             query (VectorStoreQuery): Store query object.
+
+         """
+         query_embedding = cast(List[float], query.query_embedding)
+
+         return self._client.query(
+             query.mode,
+             query.query_str,
+             query_embedding,
+             query.similarity_top_k,
+             filters=query.filters,
+         )
+
+     async def aquery(
+         self, query: VectorStoreQuery, **kwargs: Any
+     ) -> VectorStoreQueryResult:
+         """
+         Async query index for top k most similar nodes.
+
+         Args:
+             query (VectorStoreQuery): Store query object.
+
+         """
+         query_embedding = cast(List[float], query.query_embedding)
+
+         return await self._client.aquery(
+             query.mode,
+             query.query_str,
+             query_embedding,
+             query.similarity_top_k,
+             filters=query.filters,
+         )
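Extending the docstring example above, a sketch of querying the store directly with a `VectorStoreQuery` (the zero vector is a placeholder; in normal use LlamaIndex supplies the real query embedding):

```python
from llama_index.core.vector_stores.types import VectorStoreQuery

query = VectorStoreQuery(query_embedding=[0.0] * 1536, similarity_top_k=3)
result = vector_store.query(query)
for node, score in zip(result.nodes, result.similarities):
    print(score, node.get_content()[:80])
```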
llama_index_vector_stores_opensearch-0.6.3.dist-info/METADATA ADDED
@@ -0,0 +1,13 @@
+ Metadata-Version: 2.4
+ Name: llama-index-vector-stores-opensearch
+ Version: 0.6.3
+ Summary: llama-index vector_stores opensearch integration
+ Author-email: Your Name <you@example.com>
+ License-Expression: MIT
+ License-File: LICENSE
+ Requires-Python: <4.0,>=3.9
+ Requires-Dist: llama-index-core<0.15,>=0.13.0
+ Requires-Dist: opensearch-py[async]<3,>=2.4.2
+ Description-Content-Type: text/markdown
+
+ # LlamaIndex Vector_Stores Integration: Opensearch
llama_index_vector_stores_opensearch-0.6.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
+ llama_index/vector_stores/opensearch/base.py,sha256=AwmvJiHNSp8XhQ0sSuNig54dj5VD165gYgpcSiIM6lk,41329
+ llama_index_vector_stores_opensearch-0.6.3.dist-info/METADATA,sha256=0Bd4D3AzMxQWdsyHl8LjUTMnHIFxSWnROZsYyF6Ni0c,438
+ llama_index_vector_stores_opensearch-0.6.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ llama_index_vector_stores_opensearch-0.6.3.dist-info/licenses/LICENSE,sha256=JPQLUZD9rKvCTdu192Nk0V5PAwklIg6jANii3UmTyMs,1065
+ llama_index_vector_stores_opensearch-0.6.3.dist-info/RECORD,,
llama_index_vector_stores_opensearch-0.6.3.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
llama_index_vector_stores_opensearch-0.6.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License
+
+ Copyright (c) Jerry Liu
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.