llama-index-vector-stores-opensearch 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_index/py.typed +0 -0
- llama_index/vector_stores/opensearch/__init__.py +6 -0
- llama_index/vector_stores/opensearch/base.py +1167 -0
- llama_index_vector_stores_opensearch-0.6.3.dist-info/METADATA +13 -0
- llama_index_vector_stores_opensearch-0.6.3.dist-info/RECORD +7 -0
- llama_index_vector_stores_opensearch-0.6.3.dist-info/WHEEL +4 -0
- llama_index_vector_stores_opensearch-0.6.3.dist-info/licenses/LICENSE +21 -0
llama_index/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,1167 @@
|
|
|
1
|
+
"""Elasticsearch/Opensearch vector store."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import uuid
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any, Dict, Iterable, List, Optional, Union, cast
|
|
7
|
+
|
|
8
|
+
from llama_index.core.async_utils import asyncio_run
|
|
9
|
+
from llama_index.core.bridge.pydantic import PrivateAttr
|
|
10
|
+
from llama_index.core.schema import BaseNode, MetadataMode, TextNode
|
|
11
|
+
from llama_index.core.vector_stores.types import (
|
|
12
|
+
FilterCondition,
|
|
13
|
+
FilterOperator,
|
|
14
|
+
MetadataFilter,
|
|
15
|
+
MetadataFilters,
|
|
16
|
+
BasePydanticVectorStore,
|
|
17
|
+
VectorStoreQuery,
|
|
18
|
+
VectorStoreQueryMode,
|
|
19
|
+
VectorStoreQueryResult,
|
|
20
|
+
)
|
|
21
|
+
from llama_index.core.vector_stores.utils import (
|
|
22
|
+
metadata_dict_to_node,
|
|
23
|
+
node_to_metadata_dict,
|
|
24
|
+
)
|
|
25
|
+
from opensearchpy.client import Client as OSClient
|
|
26
|
+
|
|
27
|
+
# Raised (as ImportError text) when the synchronous opensearch-py pieces cannot be imported.
IMPORT_OPENSEARCH_PY_ERROR = "Could not import OpenSearch. Please install it with `pip install opensearch-py`."

# Raised (as ImportError text) when the asynchronous opensearch-py pieces cannot be imported.
IMPORT_ASYNC_OPENSEARCH_PY_ERROR = (
    "Could not import AsyncOpenSearch. "
    "Please install it with `pip install opensearch-py`."
)

# Message for hybrid queries that lack a lexical query or a search pipeline.
INVALID_HYBRID_QUERY_ERROR = "Please specify the lexical_query and search_pipeline for hybrid search."

# Query clause that matches every document; used as the default pre-filter.
MATCH_ALL_QUERY: Dict = {"match_all": {}}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class OpensearchVectorClient:
|
|
38
|
+
"""
|
|
39
|
+
Object encapsulating an Opensearch index that has vector search enabled.
|
|
40
|
+
|
|
41
|
+
If the index does not yet exist, it is created during init.
|
|
42
|
+
Therefore, the underlying index is assumed to either:
|
|
43
|
+
1) not exist yet or 2) be created due to previous usage of this class.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
endpoint (str): URL (http/https) of elasticsearch endpoint
|
|
47
|
+
index (str): Name of the elasticsearch index
|
|
48
|
+
dim (int): Dimension of the vector
|
|
49
|
+
embedding_field (str): Name of the field in the index to store
|
|
50
|
+
embedding array in.
|
|
51
|
+
text_field (str): Name of the field to grab text from
|
|
52
|
+
method (Optional[dict]): Opensearch "method" JSON obj for configuring
|
|
53
|
+
the KNN index.
|
|
54
|
+
This includes engine, metric, and other config params. Defaults to:
|
|
55
|
+
{"name": "hnsw", "space_type": "l2", "engine": "nmslib",
|
|
56
|
+
"parameters": {"ef_construction": 256, "m": 48}}
|
|
57
|
+
settings (Optional[dict]): Settings for the Opensearch index creation. Defaults to:
|
|
58
|
+
{"index": {"knn": True, "knn.algo_param.ef_search": 100}}
|
|
59
|
+
space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2
|
|
60
|
+
os_client (Optional[OSClient]): Custom synchronous client (see OpenSearch from opensearch-py)
|
|
61
|
+
os_async_client (Optional[OSClient]): Custom asynchronous client (see AsyncOpenSearch from opensearch-py)
|
|
62
|
+
excluded_source_fields (Optional[List[str]]): Optional list of document "source" fields to exclude from OpenSearch responses.
|
|
63
|
+
**kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.
|
|
64
|
+
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
def __init__(
    self,
    endpoint: str,
    index: str,
    dim: int,
    embedding_field: str = "embedding",
    text_field: str = "content",
    method: Optional[dict] = None,
    settings: Optional[dict] = None,
    engine: Optional[str] = "nmslib",
    space_type: Optional[str] = "l2",
    max_chunk_bytes: int = 1 * 1024 * 1024,
    search_pipeline: Optional[str] = None,
    os_client: Optional[OSClient] = None,
    os_async_client: Optional[OSClient] = None,
    excluded_source_fields: Optional[List[str]] = None,
    **kwargs: Any,
):
    """
    Record the configuration, build the clients and ensure the index exists.

    The index is looked up with the synchronous client first. If that lookup
    raises ``TypeError`` the asynchronous client is used instead. When the
    index is missing it is created with a ``knn_vector`` mapping for
    ``embedding_field``; afterwards the index is refreshed, or, when AOSS is
    detected from ``http_auth``, only checked for existence.
    """
    if embedding_field is None:
        embedding_field = "embedding"
    knn_method = (
        method
        if method is not None
        else {
            "name": "hnsw",
            "space_type": "l2",
            "engine": engine,
            "parameters": {"ef_construction": 256, "m": 48},
        }
    )
    index_settings = (
        settings
        if settings is not None
        else {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
    )

    self._method = knn_method
    self._embedding_field = embedding_field
    self._endpoint = endpoint
    self._dim = dim
    self._index = index
    self._text_field = text_field
    self._max_chunk_bytes = max_chunk_bytes
    self._excluded_source_fields = excluded_source_fields
    self._search_pipeline = search_pipeline

    self.space_type = space_type
    self.is_aoss = self._is_aoss_enabled(http_auth=kwargs.get("http_auth"))

    # Body used only if the index has to be created.
    vector_mapping = {"type": "knn_vector", "dimension": dim, "method": knn_method}
    index_body = {
        "settings": index_settings,
        "mappings": {"properties": {embedding_field: vector_mapping}},
    }

    # Caller-supplied clients win; otherwise build one from the endpoint.
    if os_client:
        self._os_client = os_client
    else:
        self._os_client = self._get_opensearch_client(self._endpoint, **kwargs)
    if os_async_client:
        self._os_async_client = os_async_client
    else:
        self._os_async_client = self._get_async_opensearch_client(
            self._endpoint, **kwargs
        )

    self._efficient_filtering_enabled = self._is_efficient_filtering_enabled()
    index_missing = self._import_not_found_error()

    try:
        self._os_client.indices.get(index=self._index)
    except TypeError:
        # Probably using async so switch to async client
        async_indices = self._os_async_client.indices
        try:
            asyncio_run(async_indices.get(index=self._index))
        except index_missing:
            asyncio_run(async_indices.create(index=self._index, body=index_body))
            if self.is_aoss:
                asyncio_run(async_indices.exists(index=self._index))
            else:
                asyncio_run(async_indices.refresh(index=self._index))
    except index_missing:
        self._os_client.indices.create(index=self._index, body=index_body)
        if self.is_aoss:
            self._os_client.indices.exists(index=self._index)
        else:
            self._os_client.indices.refresh(index=self._index)
|
|
157
|
+
|
|
158
|
+
def _import_opensearch(self) -> Any:
    """
    Return the ``OpenSearch`` client class from opensearch-py.

    Raises:
        ImportError: if ``opensearchpy`` cannot be imported.
    """
    try:
        from opensearchpy import OpenSearch
    except ImportError:
        raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
    else:
        return OpenSearch
|
|
165
|
+
|
|
166
|
+
def _import_async_opensearch(self) -> Any:
    """
    Return the ``AsyncOpenSearch`` client class from opensearch-py.

    Raises:
        ImportError: if ``opensearchpy`` cannot be imported.
    """
    try:
        from opensearchpy import AsyncOpenSearch
    except ImportError:
        raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
    else:
        return AsyncOpenSearch
|
|
173
|
+
|
|
174
|
+
def _import_bulk(self) -> Any:
    """
    Return the synchronous ``bulk`` helper from ``opensearchpy.helpers``.

    Raises:
        ImportError: if the helper cannot be imported.
    """
    try:
        from opensearchpy.helpers import bulk
    except ImportError:
        raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
    else:
        return bulk
|
|
181
|
+
|
|
182
|
+
def _import_async_bulk(self) -> Any:
    """
    Return the ``async_bulk`` helper from ``opensearchpy.helpers``.

    Raises:
        ImportError: if the helper cannot be imported.
    """
    try:
        from opensearchpy.helpers import async_bulk
    except ImportError:
        raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
    else:
        return async_bulk
|
|
189
|
+
|
|
190
|
+
def _import_not_found_error(self) -> Any:
    """
    Return the ``NotFoundError`` exception class from ``opensearchpy.exceptions``.

    Raises:
        ImportError: if the exception class cannot be imported.
    """
    try:
        from opensearchpy.exceptions import NotFoundError
    except ImportError:
        raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
    else:
        return NotFoundError
|
|
197
|
+
|
|
198
|
+
def _get_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
|
|
199
|
+
"""Get OpenSearch client from the opensearch_url, otherwise raise error."""
|
|
200
|
+
try:
|
|
201
|
+
opensearch = self._import_opensearch()
|
|
202
|
+
client = opensearch(opensearch_url, **kwargs)
|
|
203
|
+
except ValueError as e:
|
|
204
|
+
raise ImportError(
|
|
205
|
+
f"OpenSearch client string provided is not in proper format. "
|
|
206
|
+
f"Got error: {e} "
|
|
207
|
+
)
|
|
208
|
+
return client
|
|
209
|
+
|
|
210
|
+
def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
|
|
211
|
+
"""Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
|
|
212
|
+
try:
|
|
213
|
+
opensearch = self._import_async_opensearch()
|
|
214
|
+
client = opensearch(opensearch_url, **kwargs)
|
|
215
|
+
|
|
216
|
+
except ValueError as e:
|
|
217
|
+
raise ValueError(
|
|
218
|
+
f"AsyncOpenSearch client string provided is not in proper format. "
|
|
219
|
+
f"Got error: {e} "
|
|
220
|
+
)
|
|
221
|
+
return client
|
|
222
|
+
|
|
223
|
+
def _get_opensearch_version(self) -> str:
|
|
224
|
+
info = self._os_client.info()
|
|
225
|
+
return info["version"]["number"]
|
|
226
|
+
|
|
227
|
+
def _bulk_ingest_embeddings(
|
|
228
|
+
self,
|
|
229
|
+
client: Any,
|
|
230
|
+
index_name: str,
|
|
231
|
+
embeddings: List[List[float]],
|
|
232
|
+
texts: Iterable[str],
|
|
233
|
+
metadatas: Optional[List[dict]] = None,
|
|
234
|
+
ids: Optional[List[str]] = None,
|
|
235
|
+
vector_field: str = "embedding",
|
|
236
|
+
text_field: str = "content",
|
|
237
|
+
mapping: Optional[Dict] = None,
|
|
238
|
+
max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
|
|
239
|
+
is_aoss: bool = False,
|
|
240
|
+
) -> List[str]:
|
|
241
|
+
"""Bulk Ingest Embeddings into given index."""
|
|
242
|
+
if not mapping:
|
|
243
|
+
mapping = {}
|
|
244
|
+
|
|
245
|
+
bulk = self._import_bulk()
|
|
246
|
+
not_found_error = self._import_not_found_error()
|
|
247
|
+
requests = []
|
|
248
|
+
return_ids = []
|
|
249
|
+
|
|
250
|
+
try:
|
|
251
|
+
client.indices.get(index=index_name)
|
|
252
|
+
except not_found_error:
|
|
253
|
+
client.indices.create(index=index_name, body=mapping)
|
|
254
|
+
|
|
255
|
+
for i, text in enumerate(texts):
|
|
256
|
+
metadata = metadatas[i] if metadatas else {}
|
|
257
|
+
_id = ids[i] if ids else str(uuid.uuid4())
|
|
258
|
+
request = {
|
|
259
|
+
"_op_type": "index",
|
|
260
|
+
"_index": index_name,
|
|
261
|
+
vector_field: embeddings[i],
|
|
262
|
+
text_field: text,
|
|
263
|
+
"metadata": metadata,
|
|
264
|
+
}
|
|
265
|
+
if is_aoss:
|
|
266
|
+
request["id"] = _id
|
|
267
|
+
else:
|
|
268
|
+
request["_id"] = _id
|
|
269
|
+
requests.append(request)
|
|
270
|
+
return_ids.append(_id)
|
|
271
|
+
|
|
272
|
+
bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
|
|
273
|
+
if not is_aoss:
|
|
274
|
+
client.indices.refresh(index=index_name)
|
|
275
|
+
|
|
276
|
+
return return_ids
|
|
277
|
+
|
|
278
|
+
async def _abulk_ingest_embeddings(
|
|
279
|
+
self,
|
|
280
|
+
client: Any,
|
|
281
|
+
index_name: str,
|
|
282
|
+
embeddings: List[List[float]],
|
|
283
|
+
texts: Iterable[str],
|
|
284
|
+
metadatas: Optional[List[dict]] = None,
|
|
285
|
+
ids: Optional[List[str]] = None,
|
|
286
|
+
vector_field: str = "embedding",
|
|
287
|
+
text_field: str = "content",
|
|
288
|
+
mapping: Optional[Dict] = None,
|
|
289
|
+
max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
|
|
290
|
+
is_aoss: bool = False,
|
|
291
|
+
) -> List[str]:
|
|
292
|
+
"""Async Bulk Ingest Embeddings into given index."""
|
|
293
|
+
if not mapping:
|
|
294
|
+
mapping = {}
|
|
295
|
+
|
|
296
|
+
async_bulk = self._import_async_bulk()
|
|
297
|
+
not_found_error = self._import_not_found_error()
|
|
298
|
+
requests = []
|
|
299
|
+
return_ids = []
|
|
300
|
+
|
|
301
|
+
try:
|
|
302
|
+
await client.indices.get(index=index_name)
|
|
303
|
+
except not_found_error:
|
|
304
|
+
await client.indices.create(index=index_name, body=mapping)
|
|
305
|
+
|
|
306
|
+
for i, text in enumerate(texts):
|
|
307
|
+
metadata = metadatas[i] if metadatas else {}
|
|
308
|
+
_id = ids[i] if ids else str(uuid.uuid4())
|
|
309
|
+
request = {
|
|
310
|
+
"_op_type": "index",
|
|
311
|
+
"_index": index_name,
|
|
312
|
+
vector_field: embeddings[i],
|
|
313
|
+
text_field: text,
|
|
314
|
+
"metadata": metadata,
|
|
315
|
+
}
|
|
316
|
+
if is_aoss:
|
|
317
|
+
request["id"] = _id
|
|
318
|
+
else:
|
|
319
|
+
request["_id"] = _id
|
|
320
|
+
requests.append(request)
|
|
321
|
+
return_ids.append(_id)
|
|
322
|
+
|
|
323
|
+
await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
|
|
324
|
+
if not is_aoss:
|
|
325
|
+
await client.indices.refresh(index=index_name)
|
|
326
|
+
|
|
327
|
+
return return_ids
|
|
328
|
+
|
|
329
|
+
def _default_approximate_search_query(
|
|
330
|
+
self,
|
|
331
|
+
query_vector: List[float],
|
|
332
|
+
k: int = 4,
|
|
333
|
+
filters: Optional[Union[Dict, List]] = None,
|
|
334
|
+
vector_field: str = "embedding",
|
|
335
|
+
excluded_source_fields: Optional[List[str]] = None,
|
|
336
|
+
) -> Dict:
|
|
337
|
+
"""For Approximate k-NN Search, this is the default query."""
|
|
338
|
+
query = {
|
|
339
|
+
"size": k,
|
|
340
|
+
"query": {
|
|
341
|
+
"knn": {
|
|
342
|
+
vector_field: {
|
|
343
|
+
"vector": query_vector,
|
|
344
|
+
"k": k,
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
},
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
if filters:
|
|
351
|
+
# filter key must be added only when filtering to avoid "filter doesn't support values of type: START_ARRAY" exception
|
|
352
|
+
query["query"]["knn"][vector_field]["filter"] = filters
|
|
353
|
+
if excluded_source_fields:
|
|
354
|
+
query["_source"] = {"exclude": excluded_source_fields}
|
|
355
|
+
return query
|
|
356
|
+
|
|
357
|
+
def _is_text_field(self, value: Any) -> bool:
|
|
358
|
+
"""
|
|
359
|
+
Check if value is a string and keyword filtering needs to be performed.
|
|
360
|
+
|
|
361
|
+
Not applied to datetime strings.
|
|
362
|
+
"""
|
|
363
|
+
if isinstance(value, str):
|
|
364
|
+
try:
|
|
365
|
+
datetime.fromisoformat(value)
|
|
366
|
+
return False
|
|
367
|
+
except ValueError as e:
|
|
368
|
+
return True
|
|
369
|
+
else:
|
|
370
|
+
return False
|
|
371
|
+
|
|
372
|
+
def _parse_filter(self, filter: MetadataFilter) -> dict:
    """
    Parse a single MetadataFilter to equivalent OpenSearch expression.

    As Opensearch does not differentiate between scalar/array keyword fields, IN and ANY are equivalent.

    String values that are not ISO datetimes are matched against the
    ``.keyword`` sub-field for EQ, NE, IN, ANY and NIN, so that exact-value
    comparisons are not run against the analyzed field. NIN follows the same
    rule as IN/ANY.

    Args:
        filter (MetadataFilter): The filter to translate; its key is
            looked up under the ``metadata.`` prefix.

    Returns:
        dict: The OpenSearch query clause.

    Raises:
        ValueError: if the operator is not supported.
    """
    key = f"metadata.{filter.key}"
    op = filter.operator
    value = filter.value

    equality_postfix = ".keyword" if self._is_text_field(value=value) else ""

    if op == FilterOperator.EQ:
        return {"term": {f"{key}{equality_postfix}": value}}
    if op in [
        FilterOperator.GT,
        FilterOperator.GTE,
        FilterOperator.LT,
        FilterOperator.LTE,
    ]:
        # The enum member name, lower-cased, is the range bound (gt/gte/lt/lte).
        return {"range": {key: {op.name.lower(): value}}}
    if op == FilterOperator.NE:
        return {"bool": {"must_not": {"term": {f"{key}{equality_postfix}": value}}}}
    if op in [FilterOperator.IN, FilterOperator.ANY, FilterOperator.NIN]:
        # A list made only of text values must be compared on the keyword
        # sub-field; anything else is compared on the field itself.
        all_text = isinstance(value, list) and all(
            self._is_text_field(val) for val in value
        )
        terms_clause = {"terms": {f"{key}.keyword" if all_text else key: value}}
        if op == FilterOperator.NIN:
            return {"bool": {"must_not": terms_clause}}
        return terms_clause
    if op == FilterOperator.ALL:
        return {
            "terms_set": {
                key: {
                    "terms": value,
                    "minimum_should_match_script": {"source": "params.num_terms"},
                }
            }
        }
    if op == FilterOperator.TEXT_MATCH:
        return {"match": {key: {"query": value, "fuzziness": "AUTO"}}}
    if op == FilterOperator.CONTAINS:
        return {"wildcard": {key: f"*{value}*"}}
    if op == FilterOperator.IS_EMPTY:
        return {"bool": {"must_not": {"exists": {"field": key}}}}
    raise ValueError(f"Unsupported filter operator: {filter.operator}")
|
|
424
|
+
|
|
425
|
+
def _parse_filters_recursively(self, filters: MetadataFilters) -> dict:
    """
    Translate a (possibly nested) MetadataFilters into a ``bool`` query.

    AND maps to a ``must`` clause and OR to a ``should`` clause. Nested
    MetadataFilters are translated recursively.

    Raises:
        KeyError: if ``filters.condition`` is neither AND nor OR.
        ValueError: if an item is neither MetadataFilter nor MetadataFilters.
    """
    clause_name = {FilterCondition.AND: "must", FilterCondition.OR: "should"}[
        filters.condition
    ]

    clauses: List[dict] = []
    for item in filters.filters:
        if isinstance(item, MetadataFilter):
            clauses.append(self._parse_filter(item))
        elif isinstance(item, MetadataFilters):
            clauses.append(self._parse_filters_recursively(item))
        else:
            raise ValueError(f"Unsupported filter type: {type(item)}")

    return {"bool": {clause_name: clauses}}
|
|
443
|
+
|
|
444
|
+
def _parse_filters(self, filters: Optional[MetadataFilters]) -> List[dict]:
|
|
445
|
+
"""Parse MetadataFilters to equivalent OpenSearch expression."""
|
|
446
|
+
if filters is None:
|
|
447
|
+
return []
|
|
448
|
+
return [self._parse_filters_recursively(filters=filters)]
|
|
449
|
+
|
|
450
|
+
def _knn_search_query(
|
|
451
|
+
self,
|
|
452
|
+
embedding_field: str,
|
|
453
|
+
query_embedding: List[float],
|
|
454
|
+
k: int,
|
|
455
|
+
filters: Optional[MetadataFilters] = None,
|
|
456
|
+
search_method="approximate",
|
|
457
|
+
excluded_source_fields: Optional[List[str]] = None,
|
|
458
|
+
) -> Dict:
|
|
459
|
+
"""
|
|
460
|
+
Perform a k-Nearest Neighbors (kNN) search.
|
|
461
|
+
|
|
462
|
+
If the search method is "approximate" and the engine is "lucene" or "faiss", use efficient kNN filtering.
|
|
463
|
+
Otherwise, perform an exhaustive exact kNN search using "painless scripting" if the version of
|
|
464
|
+
OpenSearch supports it. If the OpenSearch version does not support it, use scoring script search.
|
|
465
|
+
|
|
466
|
+
Note:
|
|
467
|
+
- AWS OpenSearch Serverless does not support the painless scripting functionality at this time according to AWS.
|
|
468
|
+
- Approximate kNN search does not support pre-filtering.
|
|
469
|
+
|
|
470
|
+
Args:
|
|
471
|
+
query_embedding (List[float]): Vector embedding to query.
|
|
472
|
+
k (int): Maximum number of results.
|
|
473
|
+
filters (Optional[MetadataFilters]): Optional filters to apply for the search.
|
|
474
|
+
Supports filter-context queries documented at
|
|
475
|
+
https://opensearch.org/docs/latest/query-dsl/query-filter-context/
|
|
476
|
+
excluded_source_fields: Optional list of document "source" fields to exclude from the response.
|
|
477
|
+
|
|
478
|
+
Returns:
|
|
479
|
+
Dict: Up to k documents closest to query_embedding.
|
|
480
|
+
|
|
481
|
+
"""
|
|
482
|
+
filters = self._parse_filters(filters)
|
|
483
|
+
|
|
484
|
+
if not filters:
|
|
485
|
+
search_query = self._default_approximate_search_query(
|
|
486
|
+
query_embedding,
|
|
487
|
+
k,
|
|
488
|
+
vector_field=embedding_field,
|
|
489
|
+
excluded_source_fields=excluded_source_fields,
|
|
490
|
+
)
|
|
491
|
+
elif (
|
|
492
|
+
search_method == "approximate"
|
|
493
|
+
and self._method["engine"]
|
|
494
|
+
in [
|
|
495
|
+
"lucene",
|
|
496
|
+
"faiss",
|
|
497
|
+
]
|
|
498
|
+
and self._efficient_filtering_enabled
|
|
499
|
+
):
|
|
500
|
+
# if engine is lucene or faiss, opensearch recommends efficient-kNN filtering.
|
|
501
|
+
search_query = self._default_approximate_search_query(
|
|
502
|
+
query_embedding,
|
|
503
|
+
k,
|
|
504
|
+
filters={"bool": {"filter": filters}},
|
|
505
|
+
vector_field=embedding_field,
|
|
506
|
+
excluded_source_fields=excluded_source_fields,
|
|
507
|
+
)
|
|
508
|
+
else:
|
|
509
|
+
if self.is_aoss:
|
|
510
|
+
# if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
|
|
511
|
+
# painless scripting so default scoring script returned will be just normal knn_score script
|
|
512
|
+
search_query = self._default_scoring_script_query(
|
|
513
|
+
query_embedding,
|
|
514
|
+
k,
|
|
515
|
+
space_type=self.space_type,
|
|
516
|
+
pre_filter={"bool": {"filter": filters}},
|
|
517
|
+
vector_field=embedding_field,
|
|
518
|
+
excluded_source_fields=excluded_source_fields,
|
|
519
|
+
)
|
|
520
|
+
else:
|
|
521
|
+
# https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
|
|
522
|
+
search_query = self._default_scoring_script_query(
|
|
523
|
+
query_embedding,
|
|
524
|
+
k,
|
|
525
|
+
space_type="l2Squared",
|
|
526
|
+
pre_filter={"bool": {"filter": filters}},
|
|
527
|
+
vector_field=embedding_field,
|
|
528
|
+
excluded_source_fields=excluded_source_fields,
|
|
529
|
+
)
|
|
530
|
+
return search_query
|
|
531
|
+
|
|
532
|
+
def _hybrid_search_query(
|
|
533
|
+
self,
|
|
534
|
+
text_field: str,
|
|
535
|
+
query_str: str,
|
|
536
|
+
embedding_field: str,
|
|
537
|
+
query_embedding: List[float],
|
|
538
|
+
k: int,
|
|
539
|
+
filters: Optional[MetadataFilters] = None,
|
|
540
|
+
excluded_source_fields: Optional[List[str]] = None,
|
|
541
|
+
) -> Dict:
|
|
542
|
+
knn_query = self._knn_search_query(embedding_field, query_embedding, k, filters)
|
|
543
|
+
lexical_query = self._lexical_search_query(text_field, query_str, k, filters)
|
|
544
|
+
|
|
545
|
+
query = {
|
|
546
|
+
"size": k,
|
|
547
|
+
"query": {
|
|
548
|
+
"hybrid": {"queries": [lexical_query["query"], knn_query["query"]]}
|
|
549
|
+
},
|
|
550
|
+
}
|
|
551
|
+
if excluded_source_fields:
|
|
552
|
+
query["_source"] = {"exclude": excluded_source_fields}
|
|
553
|
+
return query
|
|
554
|
+
|
|
555
|
+
def _lexical_search_query(
|
|
556
|
+
self,
|
|
557
|
+
text_field: str,
|
|
558
|
+
query_str: str,
|
|
559
|
+
k: int,
|
|
560
|
+
filters: Optional[MetadataFilters] = None,
|
|
561
|
+
excluded_source_fields: Optional[List[str]] = None,
|
|
562
|
+
) -> Dict:
|
|
563
|
+
lexical_query = {
|
|
564
|
+
"bool": {"must": {"match": {text_field: {"query": query_str}}}}
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
parsed_filters = self._parse_filters(filters)
|
|
568
|
+
if len(parsed_filters) > 0:
|
|
569
|
+
lexical_query["bool"]["filter"] = parsed_filters
|
|
570
|
+
|
|
571
|
+
query = {
|
|
572
|
+
"size": k,
|
|
573
|
+
"query": lexical_query,
|
|
574
|
+
}
|
|
575
|
+
if excluded_source_fields:
|
|
576
|
+
query["_source"] = {"exclude": excluded_source_fields}
|
|
577
|
+
return query
|
|
578
|
+
|
|
579
|
+
def __get_painless_scripting_source(
|
|
580
|
+
self, space_type: str, vector_field: str = "embedding"
|
|
581
|
+
) -> str:
|
|
582
|
+
"""
|
|
583
|
+
For Painless Scripting, it returns the script source based on space type.
|
|
584
|
+
This does not work with Opensearch Serverless currently.
|
|
585
|
+
"""
|
|
586
|
+
source_value = (
|
|
587
|
+
f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
|
|
588
|
+
)
|
|
589
|
+
if space_type == "cosineSimilarity":
|
|
590
|
+
return source_value
|
|
591
|
+
else:
|
|
592
|
+
return f"1/{source_value}"
|
|
593
|
+
|
|
594
|
+
def _get_knn_scoring_script(self, space_type, vector_field, query_vector):
|
|
595
|
+
"""Default scoring script that will work with AWS Opensearch Serverless."""
|
|
596
|
+
return {
|
|
597
|
+
"source": "knn_score",
|
|
598
|
+
"lang": "knn",
|
|
599
|
+
"params": {
|
|
600
|
+
"field": vector_field,
|
|
601
|
+
"query_value": query_vector,
|
|
602
|
+
"space_type": space_type,
|
|
603
|
+
},
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
def _get_painless_scoring_script(self, space_type, vector_field, query_vector):
|
|
607
|
+
source = self.__get_painless_scripting_source(space_type, vector_field)
|
|
608
|
+
return {
|
|
609
|
+
"source": source,
|
|
610
|
+
"params": {
|
|
611
|
+
"field": vector_field,
|
|
612
|
+
"query_value": query_vector,
|
|
613
|
+
},
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
def _default_scoring_script_query(
|
|
617
|
+
self,
|
|
618
|
+
query_vector: List[float],
|
|
619
|
+
k: int = 4,
|
|
620
|
+
space_type: str = "l2Squared",
|
|
621
|
+
pre_filter: Optional[Union[Dict, List]] = None,
|
|
622
|
+
vector_field: str = "embedding",
|
|
623
|
+
excluded_source_fields: Optional[List[str]] = None,
|
|
624
|
+
) -> Dict:
|
|
625
|
+
"""
|
|
626
|
+
For Scoring Script Search, this is the default query. Has to account for Opensearch Service
|
|
627
|
+
Serverless which does not support painless scripting functions so defaults to knn_score.
|
|
628
|
+
"""
|
|
629
|
+
if not pre_filter:
|
|
630
|
+
pre_filter = MATCH_ALL_QUERY
|
|
631
|
+
|
|
632
|
+
# check if we can use painless scripting or have to use default knn_score script
|
|
633
|
+
if self.is_aoss:
|
|
634
|
+
if space_type == "l2Squared":
|
|
635
|
+
raise ValueError(
|
|
636
|
+
"Unsupported space type for aoss. Can only use l1, l2, cosinesimil."
|
|
637
|
+
)
|
|
638
|
+
script = self._get_knn_scoring_script(
|
|
639
|
+
space_type, vector_field, query_vector
|
|
640
|
+
)
|
|
641
|
+
else:
|
|
642
|
+
script = self._get_painless_scoring_script(
|
|
643
|
+
space_type, vector_field, query_vector
|
|
644
|
+
)
|
|
645
|
+
query = {
|
|
646
|
+
"size": k,
|
|
647
|
+
"query": {
|
|
648
|
+
"script_score": {
|
|
649
|
+
"query": pre_filter,
|
|
650
|
+
"script": script,
|
|
651
|
+
}
|
|
652
|
+
},
|
|
653
|
+
}
|
|
654
|
+
if excluded_source_fields:
|
|
655
|
+
query["_source"] = {"exclude": excluded_source_fields}
|
|
656
|
+
return query
|
|
657
|
+
|
|
658
|
+
def _is_aoss_enabled(self, http_auth: Any) -> bool:
|
|
659
|
+
"""Check if the service is http_auth is set as `aoss`."""
|
|
660
|
+
return (
|
|
661
|
+
http_auth is not None
|
|
662
|
+
and hasattr(http_auth, "service")
|
|
663
|
+
and http_auth.service == "aoss"
|
|
664
|
+
)
|
|
665
|
+
|
|
666
|
+
def _is_efficient_filtering_enabled(self) -> bool:
|
|
667
|
+
"""Check if kNN with efficient filtering is enabled."""
|
|
668
|
+
# Technically, AOSS supports efficient filtering,
|
|
669
|
+
# but we can't check the version number using .info(); AOSS doesn't support 'GET /'
|
|
670
|
+
# so we must skip and disable by default.
|
|
671
|
+
if self.is_aoss:
|
|
672
|
+
ef_enabled = False
|
|
673
|
+
else:
|
|
674
|
+
self._os_version = self._get_opensearch_version()
|
|
675
|
+
major, minor, patch = self._os_version.split(".")
|
|
676
|
+
ef_enabled = int(major) > 2 or (int(major) == 2 and int(minor) >= 9)
|
|
677
|
+
return ef_enabled
|
|
678
|
+
|
|
679
|
+
def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
    """
    Store the given nodes in the index using the synchronous client.

    For every node its id, embedding, text content (without metadata) and
    metadata dict (with the text removed) are collected and bulk-ingested.

    Returns:
        List[str]: The ids returned by the bulk ingestion.
    """
    # One tuple per node keeps the per-node extraction order intact.
    rows = [
        (
            node.node_id,
            node.get_embedding(),
            node.get_content(metadata_mode=MetadataMode.NONE),
            node_to_metadata_dict(node, remove_text=True),
        )
        for node in nodes
    ]
    ids: List[str] = [row[0] for row in rows]
    embeddings: List[List[float]] = [row[1] for row in rows]
    texts: List[str] = [row[2] for row in rows]
    metadatas: List[dict] = [row[3] for row in rows]

    return self._bulk_ingest_embeddings(
        self._os_client,
        self._index,
        embeddings,
        texts,
        metadatas=metadatas,
        ids=ids,
        vector_field=self._embedding_field,
        text_field=self._text_field,
        mapping=None,
        max_chunk_bytes=self._max_chunk_bytes,
        is_aoss=self.is_aoss,
    )
|
|
704
|
+
|
|
705
|
+
async def aindex_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
    """
    Store the given nodes in the index using the asynchronous client.

    For every node its id, embedding, text content (without metadata) and
    metadata dict (with the text removed) are collected and bulk-ingested.

    Returns:
        List[str]: The ids returned by the bulk ingestion.
    """
    # One tuple per node keeps the per-node extraction order intact.
    rows = [
        (
            node.node_id,
            node.get_embedding(),
            node.get_content(metadata_mode=MetadataMode.NONE),
            node_to_metadata_dict(node, remove_text=True),
        )
        for node in nodes
    ]
    ids: List[str] = [row[0] for row in rows]
    embeddings: List[List[float]] = [row[1] for row in rows]
    texts: List[str] = [row[2] for row in rows]
    metadatas: List[dict] = [row[3] for row in rows]

    return await self._abulk_ingest_embeddings(
        self._os_async_client,
        self._index,
        embeddings,
        texts,
        metadatas=metadatas,
        ids=ids,
        vector_field=self._embedding_field,
        text_field=self._text_field,
        mapping=None,
        max_chunk_bytes=self._max_chunk_bytes,
        is_aoss=self.is_aoss,
    )
|
|
730
|
+
|
|
731
|
+
def delete_by_doc_id(self, doc_id: str) -> None:
    """
    Delete every OpenSearch document whose ``metadata.doc_id`` equals ``doc_id``.

    Args:
        doc_id (str): a LlamaIndex `Document` id

    """
    term_clause = {"metadata.doc_id.keyword": {"value": doc_id}}
    self._os_client.delete_by_query(
        index=self._index,
        body={"query": {"term": term_clause}},
        refresh=True,
    )
|
|
745
|
+
|
|
746
|
+
async def adelete_by_doc_id(self, doc_id: str) -> None:
    """
    Asynchronously delete every OpenSearch document whose ``metadata.doc_id`` equals ``doc_id``.

    Args:
        doc_id (str): a LlamaIndex `Document` id

    """
    term_clause = {"metadata.doc_id.keyword": {"value": doc_id}}
    await self._os_async_client.delete_by_query(
        index=self._index,
        body={"query": {"term": term_clause}},
        refresh=True,
    )
|
|
760
|
+
|
|
761
|
+
def delete_nodes(
    self,
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
    **delete_kwargs: Any,
) -> None:
    """
    Delete the nodes matching the given ids and/or metadata filters.

    Nothing is sent when both ``node_ids`` and ``filters`` are empty or None.

    Args:
        node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
        filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.

    """
    if not (node_ids or filters):
        return

    filter_clauses: List[dict] = []
    if node_ids:
        filter_clauses.append({"terms": {"_id": node_ids}})
    if filters:
        filter_clauses.extend(self._parse_filters(filters))

    body = {"query": {"bool": {"filter": filter_clauses}}}
    self._os_client.delete_by_query(index=self._index, body=body, refresh=True)
|
|
786
|
+
|
|
787
|
+
async def adelete_nodes(
    self,
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
    **delete_kwargs: Any,
) -> None:
    """
    Asynchronously delete nodes selected by ID and/or metadata filters.

    When both `node_ids` and `filters` are given, a document must satisfy
    both to be deleted (the clauses share one `bool.filter` list). The call
    is a no-op when nothing is selected: either both arguments are empty,
    or the supplied filters parse to zero clauses.

    Args:
        node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
        filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
        **delete_kwargs (Any): Accepted for interface compatibility; not used
            by this method.

    """
    if not node_ids and not filters:
        return

    clauses: List[Dict[str, Any]] = []
    if node_ids:
        clauses.append({"terms": {"_id": node_ids}})

    if filters:
        clauses.extend(self._parse_filters(filters))

    if not clauses:
        # A `filters` object can be truthy yet yield no clauses. Sending an
        # empty `bool.filter` list is exactly the request `aclear()` uses to
        # wipe the index, so bail out instead of deleting everything.
        return

    query = {"query": {"bool": {"filter": clauses}}}
    await self._os_async_client.delete_by_query(
        index=self._index, body=query, refresh=True
    )
|
|
814
|
+
|
|
815
|
+
def clear(self) -> None:
    """
    Clears index.

    Runs a delete-by-query on this client's index whose criteria is a
    `bool` query with an empty `filter` list, passing `refresh=True`.

    """
    unrestricted = {"bool": {"filter": []}}
    self._os_client.delete_by_query(
        index=self._index,
        body={"query": unrestricted},
        refresh=True,
    )
|
|
819
|
+
|
|
820
|
+
async def aclear(self) -> None:
    """
    Clears index.

    Asynchronous counterpart of `clear`: awaits a delete-by-query on this
    client's index whose criteria is a `bool` query with an empty `filter`
    list, passing `refresh=True`.

    """
    unrestricted = {"bool": {"filter": []}}
    await self._os_async_client.delete_by_query(
        index=self._index,
        body={"query": unrestricted},
        refresh=True,
    )
|
|
826
|
+
|
|
827
|
+
def close(self) -> None:
    """
    Close the OpenSearch clients and release resources.

    The synchronous client is closed first. The asynchronous client is then
    closed even if that first close raised, so one failure does not leak the
    other connection. How the async close runs depends on the calling
    context:

    * No running event loop: the close coroutine is run to completion with
      `asyncio.run` before this method returns.
    * Running event loop: the close is scheduled as a task on that loop and
      this method returns without waiting for it. The task is kept on
      `self._pending_close_task` so callers may await it.

    """
    try:
        self._os_client.close()
    finally:
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: run async close directly
            asyncio.run(self._os_async_client.close())
        else:
            # Running loop: schedule async close. The event loop only holds
            # weak references to tasks, so keep a strong one; otherwise the
            # task may be garbage-collected before the close ever executes.
            self._pending_close_task = loop.create_task(
                self._os_async_client.close()
            )
|
|
838
|
+
|
|
839
|
+
async def aclose(self) -> None:
    """
    Asynchronously close the OpenSearch clients and release resources.

    The synchronous client is closed first; the asynchronous client is
    awaited afterwards even when the first close raises, so a failure in one
    does not leave the other open. Any exception from the synchronous close
    still propagates to the caller.

    """
    try:
        self._os_client.close()
    finally:
        await self._os_async_client.close()
|
|
843
|
+
|
|
844
|
+
def query(
    self,
    query_mode: VectorStoreQueryMode,
    query_str: Optional[str],
    query_embedding: List[float],
    k: int,
    filters: Optional[MetadataFilters] = None,
) -> VectorStoreQueryResult:
    """
    Build a search request for the given mode, run it, and convert the hits.

    Mode handling:
        * `HYBRID`: uses the hybrid query builder and sends the configured
          search pipeline as a request parameter.
        * `TEXT_SEARCH`: uses the lexical query builder.
        * any other mode: uses the k-NN query builder.

    Args:
        query_mode (VectorStoreQueryMode): Selects which query builder is used.
        query_str (Optional[str]): Query text; required for `HYBRID`.
        query_embedding (List[float]): Query vector for hybrid and k-NN search.
        k (int): Forwarded to the query builder (presumably the number of
            hits requested).
        filters (Optional[MetadataFilters], optional): Metadata filters
            forwarded to the query builder. Defaults to None.

    Returns:
        VectorStoreQueryResult: Nodes, IDs and scores built from the response.

    Raises:
        ValueError: In `HYBRID` mode when `query_str` is None or no search
            pipeline is configured on this client.

    """
    # Only hybrid search sends extra request parameters.
    request_params = None

    if query_mode == VectorStoreQueryMode.HYBRID:
        if query_str is None or self._search_pipeline is None:
            raise ValueError(INVALID_HYBRID_QUERY_ERROR)
        request_body = self._hybrid_search_query(
            self._text_field,
            query_str,
            self._embedding_field,
            query_embedding,
            k,
            filters=filters,
            excluded_source_fields=self._excluded_source_fields,
        )
        request_params = {"search_pipeline": self._search_pipeline}
    elif query_mode == VectorStoreQueryMode.TEXT_SEARCH:
        request_body = self._lexical_search_query(
            self._text_field,
            query_str,
            k,
            filters=filters,
            excluded_source_fields=self._excluded_source_fields,
        )
    else:
        request_body = self._knn_search_query(
            self._embedding_field,
            query_embedding,
            k,
            filters=filters,
            excluded_source_fields=self._excluded_source_fields,
        )

    response = self._os_client.search(
        index=self._index, body=request_body, params=request_params
    )
    return self._to_query_result(response)
|
|
891
|
+
|
|
892
|
+
async def aquery(
    self,
    query_mode: VectorStoreQueryMode,
    query_str: Optional[str],
    query_embedding: List[float],
    k: int,
    filters: Optional[MetadataFilters] = None,
) -> VectorStoreQueryResult:
    """
    Build a search request for the given mode, await it, and convert the hits.

    Mode handling:
        * `HYBRID`: uses the hybrid query builder and sends the configured
          search pipeline as a request parameter.
        * `TEXT_SEARCH`: uses the lexical query builder.
        * any other mode: uses the k-NN query builder.

    Args:
        query_mode (VectorStoreQueryMode): Selects which query builder is used.
        query_str (Optional[str]): Query text; required for `HYBRID`.
        query_embedding (List[float]): Query vector for hybrid and k-NN search.
        k (int): Forwarded to the query builder (presumably the number of
            hits requested).
        filters (Optional[MetadataFilters], optional): Metadata filters
            forwarded to the query builder. Defaults to None.

    Returns:
        VectorStoreQueryResult: Nodes, IDs and scores built from the response.

    Raises:
        ValueError: In `HYBRID` mode when `query_str` is None or no search
            pipeline is configured on this client.

    """
    # Only hybrid search sends extra request parameters.
    request_params = None

    if query_mode == VectorStoreQueryMode.HYBRID:
        if query_str is None or self._search_pipeline is None:
            raise ValueError(INVALID_HYBRID_QUERY_ERROR)
        request_body = self._hybrid_search_query(
            self._text_field,
            query_str,
            self._embedding_field,
            query_embedding,
            k,
            filters=filters,
            excluded_source_fields=self._excluded_source_fields,
        )
        request_params = {"search_pipeline": self._search_pipeline}
    elif query_mode == VectorStoreQueryMode.TEXT_SEARCH:
        request_body = self._lexical_search_query(
            self._text_field,
            query_str,
            k,
            filters=filters,
            excluded_source_fields=self._excluded_source_fields,
        )
    else:
        request_body = self._knn_search_query(
            self._embedding_field,
            query_embedding,
            k,
            filters=filters,
            excluded_source_fields=self._excluded_source_fields,
        )

    response = await self._os_async_client.search(
        index=self._index, body=request_body, params=request_params
    )
    return self._to_query_result(response)
|
|
939
|
+
|
|
940
|
+
def _to_query_result(self, res) -> VectorStoreQueryResult:
    """
    Convert a raw OpenSearch search response into a `VectorStoreQueryResult`.

    For each hit the node is rebuilt from the stored `metadata` via
    `metadata_dict_to_node`, with its text set from the configured text
    field. If that fails for any reason, a plain `TextNode` is assembled
    from the legacy `node_info` / `relationships` fields instead.

    Args:
        res: Search response; `res["hits"]["hits"]` is iterated and each hit
            must provide `_source`, `_id` and `_score`.

    Returns:
        VectorStoreQueryResult: Parallel lists of nodes, IDs and scores in
        the order the hits were returned.

    Raises:
        KeyError: If a hit's `_source` lacks the configured text field.

    """
    nodes = []
    ids = []
    scores = []
    for hit in res["hits"]["hits"]:
        source = hit["_source"]
        node_id = hit["_id"]
        text = source[self._text_field]
        metadata = source.get("metadata", None)

        try:
            node = metadata_dict_to_node(metadata)
            node.text = text
        except Exception:
            # TODO: Legacy support for old nodes
            node_info = source.get("node_info")
            relationships = source.get("relationships") or {}
            start_char_idx = None
            end_char_idx = None
            if isinstance(node_info, dict):
                start_char_idx = node_info.get("start", None)
                end_char_idx = node_info.get("end", None)

            node = TextNode(
                text=text,
                # A hit without a "metadata" key yields None here; TextNode
                # expects a dict for metadata, so fall back to an empty one
                # rather than letting the legacy path itself fail.
                metadata=metadata or {},
                id_=node_id,
                start_char_idx=start_char_idx,
                end_char_idx=end_char_idx,
                relationships=relationships,
            )
        ids.append(node_id)
        nodes.append(node)
        scores.append(hit["_score"])

    return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)
|
|
976
|
+
|
|
977
|
+
|
|
978
|
+
class OpensearchVectorStore(BasePydanticVectorStore):
    """
    Elasticsearch/Opensearch vector store.

    Adapter that exposes an `OpensearchVectorClient` through the
    `BasePydanticVectorStore` interface; every operation below is delegated
    to the wrapped client.

    Args:
        client (OpensearchVectorClient): Vector index client to use
            for data insertion/querying.

    Examples:
        `pip install llama-index-vector-stores-opensearch`

        ```python
        from llama_index.vector_stores.opensearch import (
            OpensearchVectorStore,
            OpensearchVectorClient,
        )

        # http endpoint for your cluster (opensearch required for vector index usage)
        endpoint = "http://localhost:9200"
        # index to demonstrate the VectorStore impl
        idx = "gpt-index-demo"

        # OpensearchVectorClient stores text in this field by default
        text_field = "content"
        # OpensearchVectorClient stores embeddings in this field by default
        embedding_field = "embedding"

        # OpensearchVectorClient encapsulates logic for a
        # single opensearch index with vector search enabled
        client = OpensearchVectorClient(
            endpoint, idx, 1536, embedding_field=embedding_field, text_field=text_field
        )

        # initialize vector store
        vector_store = OpensearchVectorStore(client)
        ```

    """

    stores_text: bool = True
    _client: OpensearchVectorClient = PrivateAttr(default=None)

    def __init__(self, client: OpensearchVectorClient) -> None:
        """Initialize the store around an existing vector client."""
        super().__init__()
        self._client = client

    @property
    def client(self) -> Any:
        """Return the wrapped `OpensearchVectorClient`."""
        return self._client

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """
        Index the given nodes and return their IDs.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings.

        Returns:
            List[str]: `node_id` of every input node, in input order.

        """
        self._client.index_results(nodes)
        return [node.node_id for node in nodes]

    async def async_add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """
        Asynchronously index the given nodes and return their IDs.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings.

        Returns:
            List[str]: `node_id` of every input node, in input order.

        """
        await self._client.aindex_results(nodes)
        return [node.node_id for node in nodes]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete the nodes that belong to the given ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        self._client.delete_by_doc_id(ref_doc_id)

    async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Asynchronously delete the nodes that belong to the given ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        await self._client.adelete_by_doc_id(ref_doc_id)

    def delete_nodes(
        self,
        node_ids: Optional[List[str]] = None,
        filters: Optional[MetadataFilters] = None,
        **delete_kwargs: Any,
    ) -> None:
        """
        Delete nodes by ID and/or metadata filters (synchronous).

        Args:
            node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
            filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
            **delete_kwargs (Any): Forwarded unchanged to the client.

        """
        self._client.delete_nodes(node_ids, filters, **delete_kwargs)

    async def adelete_nodes(
        self,
        node_ids: Optional[List[str]] = None,
        filters: Optional[MetadataFilters] = None,
        **delete_kwargs: Any,
    ) -> None:
        """
        Delete nodes by ID and/or metadata filters (asynchronous).

        Args:
            node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
            filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
            **delete_kwargs (Any): Forwarded unchanged to the client.

        """
        await self._client.adelete_nodes(node_ids, filters, **delete_kwargs)

    def clear(self) -> None:
        """Clear the index via the client."""
        self._client.clear()

    async def aclear(self) -> None:
        """Asynchronously clear the index via the client."""
        await self._client.aclear()

    def close(self) -> None:
        """Close the vector store by closing the wrapped client."""
        self._client.close()

    async def aclose(self) -> None:
        """Asynchronously close the vector store by closing the wrapped client."""
        await self._client.aclose()

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """
        Query index for top k most similar nodes.

        Unpacks the query object and forwards its mode, query string,
        embedding, `similarity_top_k` and filters to the client.

        Args:
            query (VectorStoreQuery): Store query object.

        Returns:
            VectorStoreQueryResult: Result produced by the client.

        """
        return self._client.query(
            query.mode,
            query.query_str,
            # `cast` only informs type checkers; the value is passed as-is.
            cast(List[float], query.query_embedding),
            query.similarity_top_k,
            filters=query.filters,
        )

    async def aquery(
        self, query: VectorStoreQuery, **kwargs: Any
    ) -> VectorStoreQueryResult:
        """
        Async query index for top k most similar nodes.

        Unpacks the query object and forwards its mode, query string,
        embedding, `similarity_top_k` and filters to the client.

        Args:
            query (VectorStoreQuery): Store query object.

        Returns:
            VectorStoreQueryResult: Result produced by the client.

        """
        return await self._client.aquery(
            query.mode,
            query.query_str,
            # `cast` only informs type checkers; the value is passed as-is.
            cast(List[float], query.query_embedding),
            query.similarity_top_k,
            filters=query.filters,
        )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llama-index-vector-stores-opensearch
|
|
3
|
+
Version: 0.6.3
|
|
4
|
+
Summary: llama-index vector_stores opensearch integration
|
|
5
|
+
Author-email: Your Name <you@example.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: <4.0,>=3.9
|
|
9
|
+
Requires-Dist: llama-index-core<0.15,>=0.13.0
|
|
10
|
+
Requires-Dist: opensearch-py[async]<3,>=2.4.2
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# LlamaIndex Vector_Stores Integration: Opensearch
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
llama_index/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
|
|
3
|
+
llama_index/vector_stores/opensearch/base.py,sha256=AwmvJiHNSp8XhQ0sSuNig54dj5VD165gYgpcSiIM6lk,41329
|
|
4
|
+
llama_index_vector_stores_opensearch-0.6.3.dist-info/METADATA,sha256=0Bd4D3AzMxQWdsyHl8LjUTMnHIFxSWnROZsYyF6Ni0c,438
|
|
5
|
+
llama_index_vector_stores_opensearch-0.6.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
llama_index_vector_stores_opensearch-0.6.3.dist-info/licenses/LICENSE,sha256=JPQLUZD9rKvCTdu192Nk0V5PAwklIg6jANii3UmTyMs,1065
|
|
7
|
+
llama_index_vector_stores_opensearch-0.6.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Jerry Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|