llama-index-vector-stores-opensearch 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Potentially problematic release. This version of llama-index-vector-stores-opensearch might be problematic.
- llama_index/vector_stores/opensearch/base.py +262 -229
- {llama_index_vector_stores_opensearch-0.1.8.dist-info → llama_index_vector_stores_opensearch-0.1.10.dist-info}/METADATA +1 -1
- llama_index_vector_stores_opensearch-0.1.10.dist-info/RECORD +5 -0
- llama_index_vector_stores_opensearch-0.1.8.dist-info/RECORD +0 -5
- {llama_index_vector_stores_opensearch-0.1.8.dist-info → llama_index_vector_stores_opensearch-0.1.10.dist-info}/WHEEL +0 -0
llama_index/vector_stores/opensearch/base.py

@@ -32,220 +32,6 @@ INVALID_HYBRID_QUERY_ERROR = (
 MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict
 
 
-def _import_async_opensearch() -> Any:
-    """Import OpenSearch if available, otherwise raise error."""
-    return AsyncOpenSearch
-
-
-def _import_async_bulk() -> Any:
-    """Import bulk if available, otherwise raise error."""
-    return async_bulk
-
-
-def _import_not_found_error() -> Any:
-    """Import not found error if available, otherwise raise error."""
-    return NotFoundError
-
-
-def _get_async_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any:
-    """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
-    try:
-        opensearch = _import_async_opensearch()
-        client = opensearch(opensearch_url, **kwargs)
-
-    except ValueError as e:
-        raise ValueError(
-            f"AsyncOpenSearch client string provided is not in proper format. "
-            f"Got error: {e} "
-        )
-    return client
-
-
-async def _bulk_ingest_embeddings(
-    client: Any,
-    index_name: str,
-    embeddings: List[List[float]],
-    texts: Iterable[str],
-    metadatas: Optional[List[dict]] = None,
-    ids: Optional[List[str]] = None,
-    vector_field: str = "embedding",
-    text_field: str = "content",
-    mapping: Optional[Dict] = None,
-    max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
-    is_aoss: bool = False,
-) -> List[str]:
-    """Async Bulk Ingest Embeddings into given index."""
-    if not mapping:
-        mapping = {}
-
-    async_bulk = _import_async_bulk()
-    not_found_error = _import_not_found_error()
-    requests = []
-    return_ids = []
-    mapping = mapping
-
-    try:
-        await client.indices.get(index=index_name)
-    except not_found_error:
-        await client.indices.create(index=index_name, body=mapping)
-
-    for i, text in enumerate(texts):
-        metadata = metadatas[i] if metadatas else {}
-        _id = ids[i] if ids else str(uuid.uuid4())
-        request = {
-            "_op_type": "index",
-            "_index": index_name,
-            vector_field: embeddings[i],
-            text_field: text,
-            "metadata": metadata,
-        }
-        if is_aoss:
-            request["id"] = _id
-        else:
-            request["_id"] = _id
-        requests.append(request)
-        return_ids.append(_id)
-    await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
-    if not is_aoss:
-        await client.indices.refresh(index=index_name)
-    return return_ids
-
-
-def _default_approximate_search_query(
-    query_vector: List[float],
-    k: int = 4,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Approximate k-NN Search, this is the default query."""
-    return {
-        "size": k,
-        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
-    }
-
-
-def _parse_filters(filters: Optional[MetadataFilters]) -> Any:
-    pre_filter = []
-    if filters is not None:
-        for f in filters.legacy_filters():
-            pre_filter.append({f.key: json.loads(str(f.value))})
-
-    return pre_filter
-
-
-def _knn_search_query(
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    """
-    Do knn search.
-
-    If there are no filters do approx-knn search.
-    If there are (pre)-filters, do an exhaustive exact knn search using 'painless
-    scripting'.
-
-    Note that approximate knn search does not support pre-filtering.
-
-    Args:
-        query_embedding: Vector embedding to query.
-        k: Maximum number of results.
-        filters: Optional filters to apply before the search.
-            Supports filter-context queries documented at
-            https://opensearch.org/docs/latest/query-dsl/query-filter-context/
-
-    Returns:
-        Up to k docs closest to query_embedding
-    """
-    if filters is None:
-        search_query = _default_approximate_search_query(
-            query_embedding, k, vector_field=embedding_field
-        )
-    else:
-        pre_filter = _parse_filters(filters)
-        # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
-        search_query = _default_painless_scripting_query(
-            query_embedding,
-            k,
-            space_type="l2Squared",
-            pre_filter={"bool": {"filter": pre_filter}},
-            vector_field=embedding_field,
-        )
-
-    return search_query
-
-
-def _hybrid_search_query(
-    text_field: str,
-    query_str: str,
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    knn_query = _knn_search_query(embedding_field, query_embedding, k, filters)["query"]
-    lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
-
-    parsed_filters = _parse_filters(filters)
-    if len(parsed_filters) > 0:
-        lexical_query["filter"] = parsed_filters
-    return {
-        "size": k,
-        "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
-    }
-
-
-def __get_painless_scripting_source(
-    space_type: str, vector_field: str = "embedding"
-) -> str:
-    """For Painless Scripting, it returns the script source based on space type."""
-    source_value = f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
-    if space_type == "cosineSimilarity":
-        return source_value
-    else:
-        return f"1/{source_value}"
-
-
-def _default_painless_scripting_query(
-    query_vector: List[float],
-    k: int = 4,
-    space_type: str = "l2Squared",
-    pre_filter: Optional[Union[Dict, List]] = None,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Painless Scripting Search, this is the default query."""
-    if not pre_filter:
-        pre_filter = MATCH_ALL_QUERY
-
-    source = __get_painless_scripting_source(space_type, vector_field)
-    return {
-        "size": k,
-        "query": {
-            "script_score": {
-                "query": pre_filter,
-                "script": {
-                    "source": source,
-                    "params": {
-                        "field": vector_field,
-                        "query_value": query_vector,
-                    },
-                },
-            }
-        },
-    }
-
-
-def _is_aoss_enabled(http_auth: Any) -> bool:
-    """Check if the service is http_auth is set as `aoss`."""
-    if (
-        http_auth is not None
-        and hasattr(http_auth, "service")
-        and http_auth.service == "aoss"
-    ):
-        return True
-    return False
-
-
 class OpensearchVectorClient:
     """
     Object encapsulating an Opensearch index that has vector search enabled.
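Every helper deleted in this hunk reappears further down as a method on `OpensearchVectorClient`, so the change is a relocation rather than a removal. For orientation, here is a minimal, self-contained re-derivation of the `script_score` body that `_default_painless_scripting_query` produces for the default `l2Squared` space with no pre-filter (the vector values below are illustrative, not taken from the diff):

```python
from typing import Dict, List

def painless_body(query_vector: List[float], k: int = 4,
                  vector_field: str = "embedding") -> Dict:
    # l2Squared is not cosineSimilarity, so the script source is inverted:
    # 1/(1.0 + distance), turning a distance into a descending score.
    source = f"1/(1.0 + l2Squared(params.query_value, doc['{vector_field}']))"
    return {
        "size": k,
        "query": {
            "script_score": {
                "query": {"match_all": {}},  # MATCH_ALL_QUERY when no pre_filter
                "script": {
                    "source": source,
                    "params": {"field": vector_field, "query_value": query_vector},
                },
            }
        },
    }

print(painless_body([0.1, 0.2, 0.3], k=2))
```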
@@ -302,7 +88,7 @@ class OpensearchVectorClient:
 
         self._search_pipeline = search_pipeline
         http_auth = kwargs.get("http_auth")
-        self.is_aoss = _is_aoss_enabled(http_auth=http_auth)
+        self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
         # initialize mapping
         idx_conf = {
             "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
@@ -316,8 +102,8 @@ class OpensearchVectorClient:
                 }
             },
         }
-        self._os_client = _get_async_opensearch_client(self._endpoint, **kwargs)
-        not_found_error = _import_not_found_error()
+        self._os_client = self._get_async_opensearch_client(self._endpoint, **kwargs)
+        not_found_error = self._import_not_found_error()
 
         event_loop = asyncio.get_event_loop()
         try:
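The `is_aoss` switch above keys off an `http_auth` object whose `service` attribute equals `"aoss"`. A hedged sketch of how such an object is typically produced for Amazon OpenSearch Serverless, assuming boto3 credentials and an opensearch-py version that ships `AWSV4SignerAsyncAuth` (the region, endpoint, and index name are placeholders):

```python
import boto3
from opensearchpy import AWSV4SignerAsyncAuth
from llama_index.vector_stores.opensearch import OpensearchVectorClient

credentials = boto3.Session().get_credentials()
region = "us-east-1"  # placeholder: use your collection's region

# "aoss" tags the signer for OpenSearch Serverless; _is_aoss_enabled reads
# exactly this attribute (http_auth.service == "aoss").
http_auth = AWSV4SignerAsyncAuth(credentials, region, "aoss")

client = OpensearchVectorClient(
    "https://my-collection.us-east-1.aoss.amazonaws.com",  # placeholder endpoint
    "my-index",  # placeholder index name
    1536,
    http_auth=http_auth,  # forwarded via **kwargs to the async client
)
```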
@@ -332,6 +118,217 @@ class OpensearchVectorClient:
                 self._os_client.indices.refresh(index=self._index)
             )
 
+    def _import_async_opensearch(self) -> Any:
+        """Import OpenSearch if available, otherwise raise error."""
+        return AsyncOpenSearch
+
+    def _import_async_bulk(self) -> Any:
+        """Import bulk if available, otherwise raise error."""
+        return async_bulk
+
+    def _import_not_found_error(self) -> Any:
+        """Import not found error if available, otherwise raise error."""
+        return NotFoundError
+
+    def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+        """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
+        try:
+            opensearch = self._import_async_opensearch()
+            client = opensearch(opensearch_url, **kwargs)
+
+        except ValueError as e:
+            raise ValueError(
+                f"AsyncOpenSearch client string provided is not in proper format. "
+                f"Got error: {e} "
+            )
+        return client
+
+    async def _bulk_ingest_embeddings(
+        self,
+        client: Any,
+        index_name: str,
+        embeddings: List[List[float]],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        vector_field: str = "embedding",
+        text_field: str = "content",
+        mapping: Optional[Dict] = None,
+        max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+        is_aoss: bool = False,
+    ) -> List[str]:
+        """Async Bulk Ingest Embeddings into given index."""
+        if not mapping:
+            mapping = {}
+
+        async_bulk = self._import_async_bulk()
+        not_found_error = self._import_not_found_error()
+        requests = []
+        return_ids = []
+        mapping = mapping
+
+        try:
+            await client.indices.get(index=index_name)
+        except not_found_error:
+            await client.indices.create(index=index_name, body=mapping)
+
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            _id = ids[i] if ids else str(uuid.uuid4())
+            request = {
+                "_op_type": "index",
+                "_index": index_name,
+                vector_field: embeddings[i],
+                text_field: text,
+                "metadata": metadata,
+            }
+            if is_aoss:
+                request["id"] = _id
+            else:
+                request["_id"] = _id
+            requests.append(request)
+            return_ids.append(_id)
+        await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+        if not is_aoss:
+            await client.indices.refresh(index=index_name)
+        return return_ids
+
+    def _default_approximate_search_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Approximate k-NN Search, this is the default query."""
+        return {
+            "size": k,
+            "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
+        }
+
+    def _parse_filters(self, filters: Optional[MetadataFilters]) -> Any:
+        pre_filter = []
+        if filters is not None:
+            for f in filters.legacy_filters():
+                pre_filter.append({f.key: json.loads(str(f.value))})
+
+        return pre_filter
+
+    def _knn_search_query(
+        self,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        """
+        Do knn search.
+
+        If there are no filters do approx-knn search.
+        If there are (pre)-filters, do an exhaustive exact knn search using 'painless
+        scripting'.
+
+        Note that approximate knn search does not support pre-filtering.
+
+        Args:
+            query_embedding: Vector embedding to query.
+            k: Maximum number of results.
+            filters: Optional filters to apply before the search.
+                Supports filter-context queries documented at
+                https://opensearch.org/docs/latest/query-dsl/query-filter-context/
+
+        Returns:
+            Up to k docs closest to query_embedding
+        """
+        pre_filter = self._parse_filters(filters)
+        if not pre_filter:
+            search_query = self._default_approximate_search_query(
+                query_embedding, k, vector_field=embedding_field
+            )
+        else:
+            # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+            search_query = self._default_painless_scripting_query(
+                query_embedding,
+                k,
+                space_type="l2Squared",
+                pre_filter={"bool": {"filter": pre_filter}},
+                vector_field=embedding_field,
+            )
+
+        return search_query
+
+    def _hybrid_search_query(
+        self,
+        text_field: str,
+        query_str: str,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        knn_query = self._knn_search_query(
+            embedding_field, query_embedding, k, filters
+        )["query"]
+        lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
+
+        parsed_filters = self._parse_filters(filters)
+        if len(parsed_filters) > 0:
+            lexical_query["filter"] = parsed_filters
+        return {
+            "size": k,
+            "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
+        }
+
+    def __get_painless_scripting_source(
+        self, space_type: str, vector_field: str = "embedding"
+    ) -> str:
+        """For Painless Scripting, it returns the script source based on space type."""
+        source_value = (
+            f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
+        )
+        if space_type == "cosineSimilarity":
+            return source_value
+        else:
+            return f"1/{source_value}"
+
+    def _default_painless_scripting_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        space_type: str = "l2Squared",
+        pre_filter: Optional[Union[Dict, List]] = None,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Painless Scripting Search, this is the default query."""
+        if not pre_filter:
+            pre_filter = MATCH_ALL_QUERY
+
+        source = self.__get_painless_scripting_source(space_type, vector_field)
+        return {
+            "size": k,
+            "query": {
+                "script_score": {
+                    "query": pre_filter,
+                    "script": {
+                        "source": source,
+                        "params": {
+                            "field": vector_field,
+                            "query_value": query_vector,
+                        },
+                    },
+                }
+            },
+        }
+
+    def _is_aoss_enabled(self, http_auth: Any) -> bool:
+        """Check if the service is http_auth is set as `aoss`."""
+        if (
+            http_auth is not None
+            and hasattr(http_auth, "service")
+            and http_auth.service == "aoss"
+        ):
+            return True
+        return False
+
     async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
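Since `index_results` (context above) funnels everything through `_bulk_ingest_embeddings`, it may help to see the shape of a single bulk action that method builds. A sketch with made-up values; note the `id` vs `_id` key switch, which the code performs for Serverless (AOSS) collections:

```python
import uuid

index_name = "gpt-index-demo"   # illustrative
text = "hello world"            # illustrative
embedding = [0.1, 0.2, 0.3]     # illustrative
is_aoss = False

action = {
    "_op_type": "index",
    "_index": index_name,
    "embedding": embedding,  # vector_field default
    "content": text,         # text_field default
    "metadata": {},
}
# For AOSS the document ID goes under "id" rather than the usual "_id":
action["id" if is_aoss else "_id"] = str(uuid.uuid4())
print(action)
```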
@@ -344,7 +341,7 @@ class OpensearchVectorClient:
             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
             metadatas.append(node_to_metadata_dict(node, remove_text=True))
 
-        return await _bulk_ingest_embeddings(
+        return await self._bulk_ingest_embeddings(
             self._os_client,
             self._index,
             embeddings,
@@ -358,12 +355,12 @@ class OpensearchVectorClient:
             is_aoss=self.is_aoss,
         )
 
-    async def
+    async def delete_by_doc_id(self, doc_id: str) -> None:
         """
-
+        Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
 
         Args:
-            doc_id (str):
+            doc_id (str): a LlamaIndex `Document` id
         """
         search_query = {
             "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
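The new `delete_by_doc_id` pairs with `adelete` further down (it is what the truncated `await self._client.` call in 0.1.8 becomes), matching documents on the `metadata.doc_id.keyword` term shown above. A hedged usage sketch, where `vector_store` is assumed to be an `OpensearchVectorStore` wired to this client:

```python
import asyncio

async def remove_source_document(vector_store, ref_doc_id: str) -> None:
    # Deletes every node that was ingested from the given LlamaIndex Document;
    # adelete forwards to client.delete_by_doc_id under the hood.
    await vector_store.adelete(ref_doc_id)

# asyncio.run(remove_source_document(vector_store, "my-ref-doc-id"))
```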
@@ -381,7 +378,7 @@ class OpensearchVectorClient:
         if query_mode == VectorStoreQueryMode.HYBRID:
             if query_str is None or self._search_pipeline is None:
                 raise ValueError(INVALID_HYBRID_QUERY_ERROR)
-            search_query = _hybrid_search_query(
+            search_query = self._hybrid_search_query(
                 self._text_field,
                 query_str,
                 self._embedding_field,
@@ -389,9 +386,11 @@ class OpensearchVectorClient:
                 k,
                 filters=filters,
             )
-            params = {
+            params = {
+                "search_pipeline": self._search_pipeline,
+            }
         else:
-            search_query = _knn_search_query(
+            search_query = self._knn_search_query(
                 self._embedding_field, query_embedding, k, filters=filters
             )
             params = None
@@ -399,6 +398,10 @@ class OpensearchVectorClient:
         res = await self._os_client.search(
             index=self._index, body=search_query, params=params
         )
+
+        return self._to_query_result(res)
+
+    def _to_query_result(self, res) -> VectorStoreQueryResult:
         nodes = []
         ids = []
         scores = []
@@ -433,6 +436,7 @@ class OpensearchVectorClient:
             ids.append(node_id)
             nodes.append(node)
             scores.append(hit["_score"])
+
         return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)
 
 
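Putting the hunks above together: in hybrid mode the request body sent by the query path has the shape below, hand-expanded from `_hybrid_search_query` with the default field names and no filters (the vector and text are illustrative). It is submitted with `params={"search_pipeline": ...}`, which is why a configured pipeline is required:

```python
k = 3
query_str = "what is opensearch?"   # illustrative
query_embedding = [0.1, 0.2, 0.3]   # illustrative

body = {
    "size": k,
    "query": {
        "hybrid": {
            "queries": [
                # lexical leg
                {"bool": {"must": {"match": {"content": {"query": query_str}}}}},
                # knn leg (the "query" value pulled from _knn_search_query)
                {"knn": {"embedding": {"vector": query_embedding, "k": k}}},
            ]
        }
    },
}
```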
@@ -443,6 +447,35 @@ class OpensearchVectorStore(BasePydanticVectorStore):
     Args:
         client (OpensearchVectorClient): Vector index client to use
             for data insertion/querying.
+
+    Examples:
+        `pip install llama-index-vector-stores-opensearch`
+
+        ```python
+        from llama_index.vector_stores.opensearch import (
+            OpensearchVectorStore,
+            OpensearchVectorClient,
+        )
+
+        # http endpoint for your cluster (opensearch required for vector index usage)
+        endpoint = "http://localhost:9200"
+        # index to demonstrate the VectorStore impl
+        idx = "gpt-index-demo"
+
+        # OpensearchVectorClient stores text in this field by default
+        text_field = "content"
+        # OpensearchVectorClient stores embeddings in this field by default
+        embedding_field = "embedding"
+
+        # OpensearchVectorClient encapsulates logic for a
+        # single opensearch index with vector search enabled
+        client = OpensearchVectorClient(
+            endpoint, idx, 1536, embedding_field=embedding_field, text_field=text_field
+        )
+
+        # initialize vector store
+        vector_store = OpensearchVectorStore(client)
+        ```
     """
 
     stores_text: bool = True
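As a follow-up to the docstring example added above, a common next step is wiring the store into a LlamaIndex index. A sketch assuming `llama-index-core` is installed, `vector_store` is the store from the example, and `documents` has already been loaded:

```python
from llama_index.core import StorageContext, VectorStoreIndex

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

response = index.as_query_engine().query("What does the document say?")
```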
@@ -494,10 +527,10 @@ class OpensearchVectorStore(BasePydanticVectorStore):
 
     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Delete nodes using
+        Delete nodes using a ref_doc_id.
 
         Args:
-            ref_doc_id (str): The doc_id of the document
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
 
         """
         asyncio.get_event_loop().run_until_complete(
@@ -506,13 +539,13 @@ class OpensearchVectorStore(BasePydanticVectorStore):
 
     async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Async delete nodes using
+        Async delete nodes using a ref_doc_id.
 
         Args:
-            ref_doc_id (str): The doc_id of the document
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
 
         """
-        await self._client.
+        await self._client.delete_by_doc_id(ref_doc_id)
 
     def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
         """
llama_index_vector_stores_opensearch-0.1.10.dist-info/RECORD

@@ -0,0 +1,5 @@
+llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
+llama_index/vector_stores/opensearch/base.py,sha256=LGXHRzcMVpvE-cb778I_W_I7wfGbLKgFQq9jR4OY6N8,19162
+llama_index_vector_stores_opensearch-0.1.10.dist-info/METADATA,sha256=_-b77ZvNRypAK2awVaSGwoKqL3VdyS9SgpfVmyvmH1o,678
+llama_index_vector_stores_opensearch-0.1.10.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+llama_index_vector_stores_opensearch-0.1.10.dist-info/RECORD,,

llama_index_vector_stores_opensearch-0.1.8.dist-info/RECORD

@@ -1,5 +0,0 @@
-llama_index/vector_stores/opensearch/__init__.py,sha256=U1_XAkZb6zcskOk4s10NB8Tjs9AZRGdRQLzOGpbWdBA,176
-llama_index/vector_stores/opensearch/base.py,sha256=eHWcqPEeiYObJu8vIVfDPl9ryg8lOE__zEexvFV4lGA,16958
-llama_index_vector_stores_opensearch-0.1.8.dist-info/METADATA,sha256=Oq_NhlE5Mq_tpHFUiERyEgWdruzzUi0XKXdRYpDFMFk,677
-llama_index_vector_stores_opensearch-0.1.8.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-llama_index_vector_stores_opensearch-0.1.8.dist-info/RECORD,,

{llama_index_vector_stores_opensearch-0.1.8.dist-info → llama_index_vector_stores_opensearch-0.1.10.dist-info}/WHEEL: file without changes