llama-index-vector-stores-opensearch 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of llama-index-vector-stores-opensearch has been flagged as a potentially problematic release.
- {llama_index_vector_stores_opensearch-0.1.7 → llama_index_vector_stores_opensearch-0.1.9}/PKG-INFO +1 -1
- {llama_index_vector_stores_opensearch-0.1.7 → llama_index_vector_stores_opensearch-0.1.9}/llama_index/vector_stores/opensearch/base.py +263 -231
- {llama_index_vector_stores_opensearch-0.1.7 → llama_index_vector_stores_opensearch-0.1.9}/pyproject.toml +1 -1
- {llama_index_vector_stores_opensearch-0.1.7 → llama_index_vector_stores_opensearch-0.1.9}/README.md +0 -0
- {llama_index_vector_stores_opensearch-0.1.7 → llama_index_vector_stores_opensearch-0.1.9}/llama_index/vector_stores/opensearch/__init__.py +0 -0
@@ -32,220 +32,6 @@ INVALID_HYBRID_QUERY_ERROR = (
 MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict


-def _import_async_opensearch() -> Any:
-    """Import OpenSearch if available, otherwise raise error."""
-    return AsyncOpenSearch
-
-
-def _import_async_bulk() -> Any:
-    """Import bulk if available, otherwise raise error."""
-    return async_bulk
-
-
-def _import_not_found_error() -> Any:
-    """Import not found error if available, otherwise raise error."""
-    return NotFoundError
-
-
-def _get_async_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any:
-    """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
-    try:
-        opensearch = _import_async_opensearch()
-        client = opensearch(opensearch_url, **kwargs)
-
-    except ValueError as e:
-        raise ValueError(
-            f"AsyncOpenSearch client string provided is not in proper format. "
-            f"Got error: {e} "
-        )
-    return client
-
-
-async def _bulk_ingest_embeddings(
-    client: Any,
-    index_name: str,
-    embeddings: List[List[float]],
-    texts: Iterable[str],
-    metadatas: Optional[List[dict]] = None,
-    ids: Optional[List[str]] = None,
-    vector_field: str = "embedding",
-    text_field: str = "content",
-    mapping: Optional[Dict] = None,
-    max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
-    is_aoss: bool = False,
-) -> List[str]:
-    """Async Bulk Ingest Embeddings into given index."""
-    if not mapping:
-        mapping = {}
-
-    async_bulk = _import_async_bulk()
-    not_found_error = _import_not_found_error()
-    requests = []
-    return_ids = []
-    mapping = mapping
-
-    try:
-        await client.indices.get(index=index_name)
-    except not_found_error:
-        await client.indices.create(index=index_name, body=mapping)
-
-    for i, text in enumerate(texts):
-        metadata = metadatas[i] if metadatas else {}
-        _id = ids[i] if ids else str(uuid.uuid4())
-        request = {
-            "_op_type": "index",
-            "_index": index_name,
-            vector_field: embeddings[i],
-            text_field: text,
-            "metadata": metadata,
-        }
-        if is_aoss:
-            request["id"] = _id
-        else:
-            request["_id"] = _id
-        requests.append(request)
-        return_ids.append(_id)
-    await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
-    if not is_aoss:
-        await client.indices.refresh(index=index_name)
-    return return_ids
-
-
-def _default_approximate_search_query(
-    query_vector: List[float],
-    k: int = 4,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Approximate k-NN Search, this is the default query."""
-    return {
-        "size": k,
-        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
-    }
-
-
-def _parse_filters(filters: Optional[MetadataFilters]) -> Any:
-    pre_filter = []
-    if filters is not None:
-        for f in filters.legacy_filters():
-            pre_filter.append({f.key: json.loads(str(f.value))})
-
-    return pre_filter
-
-
-def _knn_search_query(
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    """
-    Do knn search.
-
-    If there are no filters do approx-knn search.
-    If there are (pre)-filters, do an exhaustive exact knn search using 'painless
-    scripting'.
-
-    Note that approximate knn search does not support pre-filtering.
-
-    Args:
-        query_embedding: Vector embedding to query.
-        k: Maximum number of results.
-        filters: Optional filters to apply before the search.
-            Supports filter-context queries documented at
-            https://opensearch.org/docs/latest/query-dsl/query-filter-context/
-
-    Returns:
-        Up to k docs closest to query_embedding
-    """
-    if filters is None:
-        search_query = _default_approximate_search_query(
-            query_embedding, k, vector_field=embedding_field
-        )
-    else:
-        pre_filter = _parse_filters(filters)
-        # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
-        search_query = _default_painless_scripting_query(
-            query_embedding,
-            k,
-            space_type="l2Squared",
-            pre_filter={"bool": {"filter": pre_filter}},
-            vector_field=embedding_field,
-        )
-
-    return search_query
-
-
-def _hybrid_search_query(
-    text_field: str,
-    query_str: str,
-    embedding_field: str,
-    query_embedding: List[float],
-    k: int,
-    filters: Optional[MetadataFilters] = None,
-) -> Dict:
-    knn_query = _knn_search_query(embedding_field, query_embedding, k, filters)["query"]
-    lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
-
-    parsed_filters = _parse_filters(filters)
-    if len(parsed_filters) > 0:
-        lexical_query["filter"] = parsed_filters
-    return {
-        "size": k,
-        "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
-    }
-
-
-def __get_painless_scripting_source(
-    space_type: str, vector_field: str = "embedding"
-) -> str:
-    """For Painless Scripting, it returns the script source based on space type."""
-    source_value = f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
-    if space_type == "cosineSimilarity":
-        return source_value
-    else:
-        return f"1/{source_value}"
-
-
-def _default_painless_scripting_query(
-    query_vector: List[float],
-    k: int = 4,
-    space_type: str = "l2Squared",
-    pre_filter: Optional[Union[Dict, List]] = None,
-    vector_field: str = "embedding",
-) -> Dict:
-    """For Painless Scripting Search, this is the default query."""
-    if not pre_filter:
-        pre_filter = MATCH_ALL_QUERY
-
-    source = __get_painless_scripting_source(space_type, vector_field)
-    return {
-        "size": k,
-        "query": {
-            "script_score": {
-                "query": pre_filter,
-                "script": {
-                    "source": source,
-                    "params": {
-                        "field": vector_field,
-                        "query_value": query_vector,
-                    },
-                },
-            }
-        },
-    }
-
-
-def _is_aoss_enabled(http_auth: Any) -> bool:
-    """Check if the service is http_auth is set as `aoss`."""
-    if (
-        http_auth is not None
-        and hasattr(http_auth, "service")
-        and http_auth.service == "aoss"
-    ):
-        return True
-    return False
-
-
 class OpensearchVectorClient:
     """
     Object encapsulating an Opensearch index that has vector search enabled.
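Note: 0.1.9 removes these module-level helpers and re-adds them as methods on `OpensearchVectorClient` (see the hunks below). For orientation, here is a minimal sketch, not part of the package, of the two request bodies the query builders produce: the approximate k-NN query used when no filters are given, and the `script_score` (Painless) query used when pre-filters are present. The `embedding` field name, the toy 3-dim vector, and the `author` filter are illustrative.

```python
# Sketch (not package code): the two query bodies built by the helpers above.
query_vector = [0.1, 0.2, 0.3]  # stands in for a real embedding
k = 4

# No filters -> approximate k-NN (_default_approximate_search_query):
approx_query = {
    "size": k,
    "query": {"knn": {"embedding": {"vector": query_vector, "k": k}}},
}

# With pre-filters -> exact k-NN via Painless scripting
# (_default_painless_scripting_query with space_type="l2Squared"):
script_query = {
    "size": k,
    "query": {
        "script_score": {
            "query": {"bool": {"filter": [{"author": "alice"}]}},
            "script": {
                # 1/(1.0 + l2Squared(...)) turns distance into a similarity
                # score, as __get_painless_scripting_source does.
                "source": "1/(1.0 + l2Squared(params.query_value, doc['embedding']))",
                "params": {"field": "embedding", "query_value": query_vector},
            },
        }
    },
}

print(approx_query)
print(script_query)
```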
@@ -302,7 +88,7 @@ class OpensearchVectorClient:

         self._search_pipeline = search_pipeline
         http_auth = kwargs.get("http_auth")
-        self.is_aoss = _is_aoss_enabled(http_auth=http_auth)
+        self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
         # initialize mapping
         idx_conf = {
             "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
@@ -316,8 +102,8 @@ class OpensearchVectorClient:
                 }
             },
         }
-        self._os_client = _get_async_opensearch_client(self._endpoint, **kwargs)
-        not_found_error = _import_not_found_error()
+        self._os_client = self._get_async_opensearch_client(self._endpoint, **kwargs)
+        not_found_error = self._import_not_found_error()

         event_loop = asyncio.get_event_loop()
         try:
@@ -332,6 +118,217 @@ class OpensearchVectorClient:
                 self._os_client.indices.refresh(index=self._index)
             )

+    def _import_async_opensearch(self) -> Any:
+        """Import OpenSearch if available, otherwise raise error."""
+        return AsyncOpenSearch
+
+    def _import_async_bulk(self) -> Any:
+        """Import bulk if available, otherwise raise error."""
+        return async_bulk
+
+    def _import_not_found_error(self) -> Any:
+        """Import not found error if available, otherwise raise error."""
+        return NotFoundError
+
+    def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+        """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
+        try:
+            opensearch = self._import_async_opensearch()
+            client = opensearch(opensearch_url, **kwargs)
+
+        except ValueError as e:
+            raise ValueError(
+                f"AsyncOpenSearch client string provided is not in proper format. "
+                f"Got error: {e} "
+            )
+        return client
+
+    async def _bulk_ingest_embeddings(
+        self,
+        client: Any,
+        index_name: str,
+        embeddings: List[List[float]],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        vector_field: str = "embedding",
+        text_field: str = "content",
+        mapping: Optional[Dict] = None,
+        max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+        is_aoss: bool = False,
+    ) -> List[str]:
+        """Async Bulk Ingest Embeddings into given index."""
+        if not mapping:
+            mapping = {}
+
+        async_bulk = self._import_async_bulk()
+        not_found_error = self._import_not_found_error()
+        requests = []
+        return_ids = []
+        mapping = mapping
+
+        try:
+            await client.indices.get(index=index_name)
+        except not_found_error:
+            await client.indices.create(index=index_name, body=mapping)
+
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            _id = ids[i] if ids else str(uuid.uuid4())
+            request = {
+                "_op_type": "index",
+                "_index": index_name,
+                vector_field: embeddings[i],
+                text_field: text,
+                "metadata": metadata,
+            }
+            if is_aoss:
+                request["id"] = _id
+            else:
+                request["_id"] = _id
+            requests.append(request)
+            return_ids.append(_id)
+        await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+        if not is_aoss:
+            await client.indices.refresh(index=index_name)
+        return return_ids
+
+    def _default_approximate_search_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Approximate k-NN Search, this is the default query."""
+        return {
+            "size": k,
+            "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
+        }
+
+    def _parse_filters(self, filters: Optional[MetadataFilters]) -> Any:
+        pre_filter = []
+        if filters is not None:
+            for f in filters.legacy_filters():
+                pre_filter.append({f.key: json.loads(str(f.value))})
+
+        return pre_filter
+
+    def _knn_search_query(
+        self,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        """
+        Do knn search.
+
+        If there are no filters do approx-knn search.
+        If there are (pre)-filters, do an exhaustive exact knn search using 'painless
+        scripting'.
+
+        Note that approximate knn search does not support pre-filtering.
+
+        Args:
+            query_embedding: Vector embedding to query.
+            k: Maximum number of results.
+            filters: Optional filters to apply before the search.
+                Supports filter-context queries documented at
+                https://opensearch.org/docs/latest/query-dsl/query-filter-context/
+
+        Returns:
+            Up to k docs closest to query_embedding
+        """
+        if filters is None:
+            search_query = self._default_approximate_search_query(
+                query_embedding, k, vector_field=embedding_field
+            )
+        else:
+            pre_filter = self._parse_filters(filters)
+            # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
+            search_query = self._default_painless_scripting_query(
+                query_embedding,
+                k,
+                space_type="l2Squared",
+                pre_filter={"bool": {"filter": pre_filter}},
+                vector_field=embedding_field,
+            )
+
+        return search_query
+
+    def _hybrid_search_query(
+        self,
+        text_field: str,
+        query_str: str,
+        embedding_field: str,
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> Dict:
+        knn_query = self._knn_search_query(
+            embedding_field, query_embedding, k, filters
+        )["query"]
+        lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}
+
+        parsed_filters = self._parse_filters(filters)
+        if len(parsed_filters) > 0:
+            lexical_query["filter"] = parsed_filters
+        return {
+            "size": k,
+            "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
+        }
+
+    def __get_painless_scripting_source(
+        self, space_type: str, vector_field: str = "embedding"
+    ) -> str:
+        """For Painless Scripting, it returns the script source based on space type."""
+        source_value = (
+            f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
+        )
+        if space_type == "cosineSimilarity":
+            return source_value
+        else:
+            return f"1/{source_value}"
+
+    def _default_painless_scripting_query(
+        self,
+        query_vector: List[float],
+        k: int = 4,
+        space_type: str = "l2Squared",
+        pre_filter: Optional[Union[Dict, List]] = None,
+        vector_field: str = "embedding",
+    ) -> Dict:
+        """For Painless Scripting Search, this is the default query."""
+        if not pre_filter:
+            pre_filter = MATCH_ALL_QUERY
+
+        source = self.__get_painless_scripting_source(space_type, vector_field)
+        return {
+            "size": k,
+            "query": {
+                "script_score": {
+                    "query": pre_filter,
+                    "script": {
+                        "source": source,
+                        "params": {
+                            "field": vector_field,
+                            "query_value": query_vector,
+                        },
+                    },
+                }
+            },
+        }
+
+    def _is_aoss_enabled(self, http_auth: Any) -> bool:
+        """Check if the service is http_auth is set as `aoss`."""
+        if (
+            http_auth is not None
+            and hasattr(http_auth, "service")
+            and http_auth.service == "aoss"
+        ):
+            return True
+        return False
+
     async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
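Because the query builders are now instance methods rather than module functions, they can be customized per client without patching module globals. A minimal sketch of that, with a hypothetical subclass name and an illustrative tweak, assuming the 0.1.9 method signatures above:

```python
# Sketch (hypothetical subclass, not package code).
from typing import Dict, List

from llama_index.vector_stores.opensearch import OpensearchVectorClient


class MyVectorClient(OpensearchVectorClient):
    def _default_approximate_search_query(
        self,
        query_vector: List[float],
        k: int = 4,
        vector_field: str = "embedding",
    ) -> Dict:
        query = super()._default_approximate_search_query(
            query_vector, k, vector_field
        )
        # Illustrative tweak: ask the k-NN plugin for more candidates than
        # the final result size, while "size" still caps the hits returned.
        query["query"]["knn"][vector_field]["k"] = 2 * k
        return query
```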
@@ -344,7 +341,7 @@ class OpensearchVectorClient:
             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
             metadatas.append(node_to_metadata_dict(node, remove_text=True))

-        return await _bulk_ingest_embeddings(
+        return await self._bulk_ingest_embeddings(
             self._os_client,
             self._index,
             embeddings,
@@ -358,14 +355,17 @@ class OpensearchVectorClient:
             is_aoss=self.is_aoss,
         )

-    async def
+    async def delete_by_doc_id(self, doc_id: str) -> None:
         """
-
+        Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.

         Args:
-            doc_id (str):
+            doc_id (str): a LlamaIndex `Document` id
         """
-
+        search_query = {
+            "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
+        }
+        await self._os_client.delete_by_query(index=self._index, body=search_query)

     async def aquery(
         self,
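The new `delete_by_doc_id` issues a `delete_by_query` against the `metadata.doc_id.keyword` sub-field, so it removes every node indexed from a source document rather than a single OpenSearch document by `_id`. A runnable sketch of the equivalent raw call with opensearch-py's `AsyncOpenSearch`; the endpoint, index name, and doc id are illustrative:

```python
# Sketch: the raw opensearch-py equivalent of delete_by_doc_id.
import asyncio

from opensearchpy import AsyncOpenSearch


async def main() -> None:
    client = AsyncOpenSearch("http://localhost:9200")
    # Same body delete_by_doc_id builds, with an illustrative id.
    body = {"query": {"term": {"metadata.doc_id.keyword": {"value": "my-doc-id"}}}}
    await client.delete_by_query(index="gpt-index-demo", body=body)
    await client.close()


asyncio.run(main())
```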
@@ -378,7 +378,7 @@ class OpensearchVectorClient:
         if query_mode == VectorStoreQueryMode.HYBRID:
             if query_str is None or self._search_pipeline is None:
                 raise ValueError(INVALID_HYBRID_QUERY_ERROR)
-            search_query = _hybrid_search_query(
+            search_query = self._hybrid_search_query(
                 self._text_field,
                 query_str,
                 self._embedding_field,
@@ -386,12 +386,15 @@ class OpensearchVectorClient:
                 k,
                 filters=filters,
             )
-            params = {
+            params = {
+                "search_pipeline": self._search_pipeline,
+                "_source_excludes": ["embedding"],
+            }
         else:
-            search_query = _knn_search_query(
+            search_query = self._knn_search_query(
                 self._embedding_field, query_embedding, k, filters=filters
             )
-            params =
+            params = {"_source_excludes": ["embedding"]}

         res = await self._os_client.search(
             index=self._index, body=search_query, params=params
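Also visible in this hunk: both query paths now send `_source_excludes: ["embedding"]`, so hits come back without the stored vectors, trimming response payloads; the hybrid path additionally routes through the configured search pipeline. A runnable sketch of the same exclusion with a raw opensearch-py client; the endpoint, index, and vector are illustrative:

```python
# Sketch: excluding the embedding field from each hit's _source.
import asyncio

from opensearchpy import AsyncOpenSearch


async def main() -> None:
    client = AsyncOpenSearch("http://localhost:9200")
    search_query = {
        "size": 2,
        "query": {"knn": {"embedding": {"vector": [0.1, 0.2, 0.3], "k": 2}}},
    }
    res = await client.search(
        index="gpt-index-demo",
        body=search_query,
        params={"_source_excludes": ["embedding"]},
    )
    for hit in res["hits"]["hits"]:
        print(hit["_id"], list(hit["_source"].keys()))  # no "embedding" key
    await client.close()


asyncio.run(main())
```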
@@ -440,6 +443,35 @@ class OpensearchVectorStore(BasePydanticVectorStore):
     Args:
         client (OpensearchVectorClient): Vector index client to use
             for data insertion/querying.
+
+    Examples:
+        `pip install llama-index-vector-stores-opensearch`
+
+        ```python
+        from llama_index.vector_stores.opensearch import (
+            OpensearchVectorStore,
+            OpensearchVectorClient,
+        )
+
+        # http endpoint for your cluster (opensearch required for vector index usage)
+        endpoint = "http://localhost:9200"
+        # index to demonstrate the VectorStore impl
+        idx = "gpt-index-demo"
+
+        # OpensearchVectorClient stores text in this field by default
+        text_field = "content"
+        # OpensearchVectorClient stores embeddings in this field by default
+        embedding_field = "embedding"
+
+        # OpensearchVectorClient encapsulates logic for a
+        # single opensearch index with vector search enabled
+        client = OpensearchVectorClient(
+            endpoint, idx, 1536, embedding_field=embedding_field, text_field=text_field
+        )
+
+        # initialize vector store
+        vector_store = OpensearchVectorStore(client)
+        ```
     """

     stores_text: bool = True
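As a follow-up to the new docstring example, a minimal sketch of querying the store it builds, assuming llama-index-core is installed; the repeated-value vector stands in for a real 1536-dim query embedding:

```python
# Sketch: querying the vector_store built in the docstring example above.
from llama_index.core.vector_stores.types import VectorStoreQuery

query = VectorStoreQuery(query_embedding=[0.1] * 1536, similarity_top_k=2)
result = vector_store.query(query)  # vector_store from the example above
print(result.ids, result.similarities)
```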
@@ -491,10 +523,10 @@ class OpensearchVectorStore(BasePydanticVectorStore):

     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Delete nodes using
+        Delete nodes using a ref_doc_id.

         Args:
-            ref_doc_id (str): The doc_id of the document
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.

         """
         asyncio.get_event_loop().run_until_complete(
@@ -503,13 +535,13 @@ class OpensearchVectorStore(BasePydanticVectorStore):

     async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Async delete nodes using
+        Async delete nodes using a ref_doc_id.

         Args:
-            ref_doc_id (str): The doc_id of the document
+            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.

         """
-        await self._client.
+        await self._client.delete_by_doc_id(ref_doc_id)

     def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
         """
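Taken together, the 0.1.9 delete path is `OpensearchVectorStore.adelete` -> `OpensearchVectorClient.delete_by_doc_id` -> `delete_by_query`. A one-line sketch of invoking it through the store, with an illustrative id:

```python
# Sketch: remove every node ingested from one source document.
vector_store.delete(ref_doc_id="my-doc-id")  # vector_store from the docstring example
```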
{llama_index_vector_stores_opensearch-0.1.7 → llama_index_vector_stores_opensearch-0.1.9}/README.md RENAMED (file without changes)