elasticsearch-haystack 2.0.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- {elasticsearch_haystack-2.0.0.dist-info → elasticsearch_haystack-3.0.0.dist-info}/METADATA +4 -4
- {elasticsearch_haystack-2.0.0.dist-info → elasticsearch_haystack-3.0.0.dist-info}/RECORD +7 -7
- haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +24 -1
- haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +30 -4
- haystack_integrations/document_stores/elasticsearch/document_store.py +309 -89
- {elasticsearch_haystack-2.0.0.dist-info → elasticsearch_haystack-3.0.0.dist-info}/WHEEL +0 -0
- {elasticsearch_haystack-2.0.0.dist-info → elasticsearch_haystack-3.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 3.0.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -11,15 +11,15 @@ License-File: LICENSE
|
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
13
|
Classifier: Programming Language :: Python
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.9
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
18
17
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
19
18
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
20
|
-
Requires-Python: >=3.8
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Requires-Dist: aiohttp
|
|
21
21
|
Requires-Dist: elasticsearch<9,>=8
|
|
22
|
-
Requires-Dist: haystack-ai
|
|
22
|
+
Requires-Dist: haystack-ai>=2.11.0
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
|
|
25
25
|
[](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml)
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
haystack_integrations/components/retrievers/elasticsearch/__init__.py,sha256=cSJBsYjz_T4kK-M-auAHVUnYIcgUqqwwQe_hsF0_IG4,307
|
|
2
|
-
haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha256=
|
|
3
|
-
haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=
|
|
2
|
+
haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha256=ISHc6elYXoDXDvC62_3bMMCk_Dv67jvZIgQBCZ1ZHdw,7012
|
|
3
|
+
haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=jHDLMeecpf-DhvbRM1AAq2kIJn7xMNTR9vkm-FhHH7k,7332
|
|
4
4
|
haystack_integrations/document_stores/elasticsearch/__init__.py,sha256=YTfu94dtVUBogbJFr1aJrKuaI6-Bw9VuHfPoyU7M8os,207
|
|
5
|
-
haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=
|
|
5
|
+
haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=pZ0pPyOCPTCKNYD4q5YbLrslSGTIbVPj60U18-BImX8,27406
|
|
6
6
|
haystack_integrations/document_stores/elasticsearch/filters.py,sha256=Umip-PP4uFjuWeB1JWkKhaKClQ0VpiykoDlDu99wIV0,9759
|
|
7
|
-
elasticsearch_haystack-
|
|
8
|
-
elasticsearch_haystack-
|
|
9
|
-
elasticsearch_haystack-
|
|
10
|
-
elasticsearch_haystack-
|
|
7
|
+
elasticsearch_haystack-3.0.0.dist-info/METADATA,sha256=E0ClBwzkNrT0g2L39vhe1jpnFNOvYQzeR50jCcHfw5c,2149
|
|
8
|
+
elasticsearch_haystack-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
9
|
+
elasticsearch_haystack-3.0.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
|
|
10
|
+
elasticsearch_haystack-3.0.0.dist-info/RECORD,,
|
|
@@ -120,7 +120,7 @@ class ElasticsearchBM25Retriever:
|
|
|
120
120
|
"""
|
|
121
121
|
Retrieve documents using the BM25 keyword-based algorithm.
|
|
122
122
|
|
|
123
|
-
:param query: String to search in `Document`s
|
|
123
|
+
:param query: String to search in the `Document`s text.
|
|
124
124
|
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
125
125
|
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
126
126
|
details.
|
|
@@ -137,3 +137,26 @@ class ElasticsearchBM25Retriever:
|
|
|
137
137
|
scale_score=self._scale_score,
|
|
138
138
|
)
|
|
139
139
|
return {"documents": docs}
|
|
140
|
+
|
|
141
|
+
@component.output_types(documents=List[Document])
|
|
142
|
+
async def run_async(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
|
|
143
|
+
"""
|
|
144
|
+
Asynchronously retrieve documents using the BM25 keyword-based algorithm.
|
|
145
|
+
|
|
146
|
+
:param query: String to search in the `Document` text.
|
|
147
|
+
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
148
|
+
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
149
|
+
details.
|
|
150
|
+
:param top_k: Maximum number of `Document` to return.
|
|
151
|
+
:returns: A dictionary with the following keys:
|
|
152
|
+
- `documents`: List of `Document`s that match the query.
|
|
153
|
+
"""
|
|
154
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
155
|
+
docs = await self._document_store._bm25_retrieval_async(
|
|
156
|
+
query=query,
|
|
157
|
+
filters=filters,
|
|
158
|
+
fuzziness=self._fuzziness,
|
|
159
|
+
top_k=top_k or self._top_k,
|
|
160
|
+
scale_score=self._scale_score,
|
|
161
|
+
)
|
|
162
|
+
return {"documents": docs}
|
|
@@ -119,10 +119,11 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
119
119
|
Retrieve documents using a vector similarity metric.
|
|
120
120
|
|
|
121
121
|
:param query_embedding: Embedding of the query.
|
|
122
|
-
:param filters: Filters applied
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
122
|
+
:param filters: Filters applied when fetching documents from the Document Store.
|
|
123
|
+
Filters are applied during the approximate kNN search to ensure the Retriever returns
|
|
124
|
+
`top_k` matching documents.
|
|
125
|
+
The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
|
|
126
|
+
:param top_k: Maximum number of documents to return.
|
|
126
127
|
:returns: A dictionary with the following keys:
|
|
127
128
|
- `documents`: List of `Document`s most similar to the given `query_embedding`
|
|
128
129
|
"""
|
|
@@ -134,3 +135,28 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
134
135
|
num_candidates=self._num_candidates,
|
|
135
136
|
)
|
|
136
137
|
return {"documents": docs}
|
|
138
|
+
|
|
139
|
+
@component.output_types(documents=List[Document])
|
|
140
|
+
async def run_async(
|
|
141
|
+
self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
|
|
142
|
+
):
|
|
143
|
+
"""
|
|
144
|
+
Asynchronously retrieve documents using a vector similarity metric.
|
|
145
|
+
|
|
146
|
+
:param query_embedding: Embedding of the query.
|
|
147
|
+
:param filters: Filters applied when fetching documents from the Document Store.
|
|
148
|
+
Filters are applied during the approximate kNN search to ensure the Retriever returns
|
|
149
|
+
`top_k` matching documents.
|
|
150
|
+
The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
|
|
151
|
+
:param top_k: Maximum number of documents to return.
|
|
152
|
+
:returns: A dictionary with the following keys:
|
|
153
|
+
- `documents`: List of `Document`s that match the query.
|
|
154
|
+
"""
|
|
155
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
156
|
+
docs = await self._document_store._embedding_retrieval_async(
|
|
157
|
+
query_embedding=query_embedding,
|
|
158
|
+
filters=filters,
|
|
159
|
+
top_k=top_k or self._top_k,
|
|
160
|
+
num_candidates=self._num_candidates,
|
|
161
|
+
)
|
|
162
|
+
return {"documents": docs}
|
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
import logging
|
|
5
|
-
from typing import Any, Dict, List, Literal, Mapping, Optional, Union
|
|
4
|
+
from collections.abc import Mapping
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
|
|
9
9
|
# There are no import stubs for elastic_transport and elasticsearch so mypy fails
|
|
10
10
|
from elastic_transport import NodeConfig # type: ignore[import-not-found]
|
|
11
|
-
from haystack import default_from_dict, default_to_dict
|
|
11
|
+
from haystack import default_from_dict, default_to_dict, logging
|
|
12
12
|
from haystack.dataclasses import Document
|
|
13
13
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
14
14
|
from haystack.document_stores.types import DuplicatePolicy
|
|
15
15
|
from haystack.version import __version__ as haystack_version
|
|
16
16
|
|
|
17
|
-
from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found]
|
|
17
|
+
from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers # type: ignore[import-not-found]
|
|
18
18
|
|
|
19
19
|
from .filters import _normalize_filters
|
|
20
20
|
|
|
@@ -30,11 +30,12 @@ Hosts = Union[str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]]]
|
|
|
30
30
|
# Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly
|
|
31
31
|
# all be mapped to scores ~1.
|
|
32
32
|
BM25_SCALING_FACTOR = 8
|
|
33
|
+
DOC_ALREADY_EXISTS = 409
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class ElasticsearchDocumentStore:
|
|
36
37
|
"""
|
|
37
|
-
ElasticsearchDocumentStore
|
|
38
|
+
An ElasticsearchDocumentStore instance that works with Elastic Cloud or your own
|
|
38
39
|
Elasticsearch cluster.
|
|
39
40
|
|
|
40
41
|
Usage example (Elastic Cloud):
|
|
@@ -93,28 +94,39 @@ class ElasticsearchDocumentStore:
|
|
|
93
94
|
"""
|
|
94
95
|
self._hosts = hosts
|
|
95
96
|
self._client = None
|
|
97
|
+
self._async_client = None
|
|
96
98
|
self._index = index
|
|
97
99
|
self._embedding_similarity_function = embedding_similarity_function
|
|
98
100
|
self._custom_mapping = custom_mapping
|
|
99
101
|
self._kwargs = kwargs
|
|
102
|
+
self._initialized = False
|
|
100
103
|
|
|
101
104
|
if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
|
|
102
105
|
msg = "custom_mapping must be a dictionary"
|
|
103
106
|
raise ValueError(msg)
|
|
104
107
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
+
def _ensure_initialized(self):
|
|
109
|
+
"""
|
|
110
|
+
Ensures both sync and async clients are initialized and the index exists.
|
|
111
|
+
"""
|
|
112
|
+
if not self._initialized:
|
|
108
113
|
headers = self._kwargs.pop("headers", {})
|
|
109
114
|
headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
|
|
110
115
|
|
|
111
|
-
|
|
116
|
+
# Initialize both sync and async clients
|
|
117
|
+
self._client = Elasticsearch(
|
|
112
118
|
self._hosts,
|
|
113
119
|
headers=headers,
|
|
114
120
|
**self._kwargs,
|
|
115
121
|
)
|
|
122
|
+
self._async_client = AsyncElasticsearch(
|
|
123
|
+
self._hosts,
|
|
124
|
+
headers=headers,
|
|
125
|
+
**self._kwargs,
|
|
126
|
+
)
|
|
127
|
+
|
|
116
128
|
# Check client connection, this will raise if not connected
|
|
117
|
-
|
|
129
|
+
self._client.info()
|
|
118
130
|
|
|
119
131
|
if self._custom_mapping:
|
|
120
132
|
mappings = self._custom_mapping
|
|
@@ -143,13 +155,27 @@ class ElasticsearchDocumentStore:
|
|
|
143
155
|
}
|
|
144
156
|
|
|
145
157
|
# Create the index if it doesn't exist
|
|
146
|
-
if not
|
|
147
|
-
|
|
158
|
+
if not self._client.indices.exists(index=self._index):
|
|
159
|
+
self._client.indices.create(index=self._index, mappings=mappings)
|
|
148
160
|
|
|
149
|
-
self.
|
|
161
|
+
self._initialized = True
|
|
150
162
|
|
|
163
|
+
@property
|
|
164
|
+
def client(self) -> Elasticsearch:
|
|
165
|
+
"""
|
|
166
|
+
Returns the synchronous Elasticsearch client, initializing it if necessary.
|
|
167
|
+
"""
|
|
168
|
+
self._ensure_initialized()
|
|
151
169
|
return self._client
|
|
152
170
|
|
|
171
|
+
@property
|
|
172
|
+
def async_client(self) -> AsyncElasticsearch:
|
|
173
|
+
"""
|
|
174
|
+
Returns the asynchronous Elasticsearch client, initializing it if necessary.
|
|
175
|
+
"""
|
|
176
|
+
self._ensure_initialized()
|
|
177
|
+
return self._async_client
|
|
178
|
+
|
|
153
179
|
def to_dict(self) -> Dict[str, Any]:
|
|
154
180
|
"""
|
|
155
181
|
Serializes the component to a dictionary.
|
|
@@ -184,15 +210,26 @@ class ElasticsearchDocumentStore:
|
|
|
184
210
|
def count_documents(self) -> int:
|
|
185
211
|
"""
|
|
186
212
|
Returns how many documents are present in the document store.
|
|
187
|
-
|
|
213
|
+
|
|
214
|
+
:returns:
|
|
215
|
+
Number of documents in the document store.
|
|
188
216
|
"""
|
|
217
|
+
self._ensure_initialized()
|
|
189
218
|
return self.client.count(index=self._index)["count"]
|
|
190
219
|
|
|
220
|
+
async def count_documents_async(self) -> int:
|
|
221
|
+
"""
|
|
222
|
+
Asynchronously returns how many documents are present in the document store.
|
|
223
|
+
:returns: Number of documents in the document store.
|
|
224
|
+
"""
|
|
225
|
+
self._ensure_initialized()
|
|
226
|
+
result = await self._async_client.count(index=self._index) # type: ignore
|
|
227
|
+
return result["count"]
|
|
228
|
+
|
|
191
229
|
def _search_documents(self, **kwargs) -> List[Document]:
|
|
192
230
|
"""
|
|
193
231
|
Calls the Elasticsearch client's search method and handles pagination.
|
|
194
232
|
"""
|
|
195
|
-
|
|
196
233
|
top_k = kwargs.get("size")
|
|
197
234
|
if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
|
|
198
235
|
top_k = kwargs["knn"]["k"]
|
|
@@ -207,7 +244,7 @@ class ElasticsearchDocumentStore:
|
|
|
207
244
|
**kwargs,
|
|
208
245
|
)
|
|
209
246
|
|
|
210
|
-
documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
|
|
247
|
+
documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
|
|
211
248
|
from_ = len(documents)
|
|
212
249
|
|
|
213
250
|
if top_k is not None and from_ >= top_k:
|
|
@@ -216,6 +253,31 @@ class ElasticsearchDocumentStore:
|
|
|
216
253
|
break
|
|
217
254
|
return documents
|
|
218
255
|
|
|
256
|
+
async def _search_documents_async(self, **kwargs) -> List[Document]:
|
|
257
|
+
"""
|
|
258
|
+
Asynchronously calls the Elasticsearch client's search method and handles pagination.
|
|
259
|
+
"""
|
|
260
|
+
top_k = kwargs.get("size")
|
|
261
|
+
if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
|
|
262
|
+
top_k = kwargs["knn"]["k"]
|
|
263
|
+
|
|
264
|
+
documents: List[Document] = []
|
|
265
|
+
from_ = 0
|
|
266
|
+
|
|
267
|
+
# handle pagination
|
|
268
|
+
while True:
|
|
269
|
+
res = await self._async_client.search(index=self._index, from_=from_, **kwargs) # type: ignore
|
|
270
|
+
documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
|
|
271
|
+
from_ = len(documents)
|
|
272
|
+
|
|
273
|
+
if top_k is not None and from_ >= top_k:
|
|
274
|
+
break
|
|
275
|
+
|
|
276
|
+
if from_ >= res["hits"]["total"]["value"]:
|
|
277
|
+
break
|
|
278
|
+
|
|
279
|
+
return documents
|
|
280
|
+
|
|
219
281
|
def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
|
|
220
282
|
"""
|
|
221
283
|
The main query method for the document store. It retrieves all documents that match the filters.
|
|
@@ -229,10 +291,45 @@ class ElasticsearchDocumentStore:
|
|
|
229
291
|
msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
|
230
292
|
raise ValueError(msg)
|
|
231
293
|
|
|
294
|
+
self._ensure_initialized()
|
|
232
295
|
query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
|
|
233
296
|
documents = self._search_documents(query=query)
|
|
234
297
|
return documents
|
|
235
298
|
|
|
299
|
+
async def filter_documents_async(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
|
|
300
|
+
"""
|
|
301
|
+
Asynchronously retrieves all documents that match the filters.
|
|
302
|
+
|
|
303
|
+
:param filters: A dictionary of filters to apply. For more information on the structure of the filters,
|
|
304
|
+
see the official Elasticsearch
|
|
305
|
+
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
|
|
306
|
+
:returns: List of `Document`s that match the filters.
|
|
307
|
+
"""
|
|
308
|
+
if filters and "operator" not in filters and "conditions" not in filters:
|
|
309
|
+
msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
|
310
|
+
raise ValueError(msg)
|
|
311
|
+
|
|
312
|
+
self._ensure_initialized()
|
|
313
|
+
query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
|
|
314
|
+
documents = await self._search_documents_async(query=query)
|
|
315
|
+
return documents
|
|
316
|
+
|
|
317
|
+
@staticmethod
|
|
318
|
+
def _deserialize_document(hit: Dict[str, Any]) -> Document:
|
|
319
|
+
"""
|
|
320
|
+
Creates a `Document` from the search hit provided.
|
|
321
|
+
This is mostly useful in self.filter_documents().
|
|
322
|
+
:param hit: A search hit from Elasticsearch.
|
|
323
|
+
:returns: `Document` created from the search hit.
|
|
324
|
+
"""
|
|
325
|
+
data = hit["_source"]
|
|
326
|
+
|
|
327
|
+
if "highlight" in hit:
|
|
328
|
+
data["metadata"]["highlighted"] = hit["highlight"]
|
|
329
|
+
data["score"] = hit["_score"]
|
|
330
|
+
|
|
331
|
+
return Document.from_dict(data)
|
|
332
|
+
|
|
236
333
|
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
|
|
237
334
|
"""
|
|
238
335
|
Writes `Document`s to Elasticsearch.
|
|
@@ -258,23 +355,15 @@ class ElasticsearchDocumentStore:
|
|
|
258
355
|
elasticsearch_actions = []
|
|
259
356
|
for doc in documents:
|
|
260
357
|
doc_dict = doc.to_dict()
|
|
261
|
-
|
|
262
|
-
dataframe = doc_dict.pop("dataframe")
|
|
263
|
-
if dataframe:
|
|
264
|
-
logger.warning(
|
|
265
|
-
"Document %s has the `dataframe` field set,"
|
|
266
|
-
"ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
|
|
267
|
-
"The `dataframe` field will soon be removed from Haystack Document.",
|
|
268
|
-
doc.id,
|
|
269
|
-
)
|
|
358
|
+
|
|
270
359
|
if "sparse_embedding" in doc_dict:
|
|
271
360
|
sparse_embedding = doc_dict.pop("sparse_embedding", None)
|
|
272
361
|
if sparse_embedding:
|
|
273
362
|
logger.warning(
|
|
274
|
-
"Document %s has the `sparse_embedding` field set,"
|
|
363
|
+
"Document {doc_id} has the `sparse_embedding` field set,"
|
|
275
364
|
"but storing sparse embeddings in Elasticsearch is not currently supported."
|
|
276
365
|
"The `sparse_embedding` field will be ignored.",
|
|
277
|
-
doc.id,
|
|
366
|
+
doc_id=doc.id,
|
|
278
367
|
)
|
|
279
368
|
elasticsearch_actions.append(
|
|
280
369
|
{
|
|
@@ -315,40 +404,78 @@ class ElasticsearchDocumentStore:
|
|
|
315
404
|
|
|
316
405
|
return documents_written
|
|
317
406
|
|
|
318
|
-
|
|
319
|
-
|
|
407
|
+
async def write_documents_async(
|
|
408
|
+
self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE
|
|
409
|
+
) -> int:
|
|
320
410
|
"""
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
This is mostly useful in self.filter_documents().
|
|
411
|
+
Asynchronously writes `Document`s to Elasticsearch.
|
|
324
412
|
|
|
325
|
-
:param
|
|
326
|
-
:
|
|
413
|
+
:param documents: List of Documents to write to the document store.
|
|
414
|
+
:param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
|
|
415
|
+
:raises ValueError: If `documents` is not a list of `Document`s.
|
|
416
|
+
:raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
|
|
417
|
+
`policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
|
|
418
|
+
:raises DocumentStoreError: If an error occurs while writing the documents to the document store.
|
|
419
|
+
:returns: Number of documents written to the document store.
|
|
327
420
|
"""
|
|
328
|
-
|
|
421
|
+
self._ensure_initialized()
|
|
329
422
|
|
|
330
|
-
if
|
|
331
|
-
|
|
332
|
-
|
|
423
|
+
if len(documents) > 0:
|
|
424
|
+
if not isinstance(documents[0], Document):
|
|
425
|
+
msg = "param 'documents' must contain a list of objects of type Document"
|
|
426
|
+
raise ValueError(msg)
|
|
333
427
|
|
|
334
|
-
if
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
)
|
|
343
|
-
|
|
428
|
+
if policy == DuplicatePolicy.NONE:
|
|
429
|
+
policy = DuplicatePolicy.FAIL
|
|
430
|
+
|
|
431
|
+
actions = []
|
|
432
|
+
for doc in documents:
|
|
433
|
+
doc_dict = doc.to_dict()
|
|
434
|
+
|
|
435
|
+
if "sparse_embedding" in doc_dict:
|
|
436
|
+
sparse_embedding = doc_dict.pop("sparse_embedding", None)
|
|
437
|
+
if sparse_embedding:
|
|
438
|
+
logger.warning(
|
|
439
|
+
"Document {doc_id} has the `sparse_embedding` field set,"
|
|
440
|
+
"but storing sparse embeddings in Elasticsearch is not currently supported."
|
|
441
|
+
"The `sparse_embedding` field will be ignored.",
|
|
442
|
+
doc_id=doc.id,
|
|
443
|
+
)
|
|
444
|
+
|
|
445
|
+
action = {
|
|
446
|
+
"_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
|
|
447
|
+
"_id": doc.id,
|
|
448
|
+
"_source": doc_dict,
|
|
449
|
+
}
|
|
450
|
+
actions.append(action)
|
|
451
|
+
|
|
452
|
+
try:
|
|
453
|
+
success, failed = await helpers.async_bulk(
|
|
454
|
+
client=self._async_client,
|
|
455
|
+
actions=actions,
|
|
456
|
+
index=self._index,
|
|
457
|
+
refresh=True,
|
|
458
|
+
raise_on_error=False,
|
|
459
|
+
)
|
|
460
|
+
if failed:
|
|
461
|
+
if policy == DuplicatePolicy.FAIL:
|
|
462
|
+
for error in failed:
|
|
463
|
+
if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
|
|
464
|
+
msg = f"ID '{error['create']['_id']}' already exists in the document store"
|
|
465
|
+
raise DuplicateDocumentError(msg)
|
|
466
|
+
msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
|
|
467
|
+
raise DocumentStoreError(msg)
|
|
468
|
+
return success
|
|
469
|
+
except Exception as e:
|
|
470
|
+
msg = f"Failed to write documents to Elasticsearch: {e!s}"
|
|
471
|
+
raise DocumentStoreError(msg) from e
|
|
344
472
|
|
|
345
473
|
def delete_documents(self, document_ids: List[str]) -> None:
|
|
346
474
|
"""
|
|
347
|
-
Deletes all
|
|
475
|
+
Deletes all documents with a matching document_ids from the document store.
|
|
348
476
|
|
|
349
|
-
:param document_ids: the
|
|
477
|
+
:param document_ids: the document ids to delete
|
|
350
478
|
"""
|
|
351
|
-
|
|
352
479
|
helpers.bulk(
|
|
353
480
|
client=self.client,
|
|
354
481
|
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
|
|
@@ -357,6 +484,25 @@ class ElasticsearchDocumentStore:
|
|
|
357
484
|
raise_on_error=False,
|
|
358
485
|
)
|
|
359
486
|
|
|
487
|
+
async def delete_documents_async(self, document_ids: List[str]) -> None:
|
|
488
|
+
"""
|
|
489
|
+
Asynchronously deletes all documents with a matching document_ids from the document store.
|
|
490
|
+
|
|
491
|
+
:param document_ids: the document ids to delete
|
|
492
|
+
"""
|
|
493
|
+
self._ensure_initialized()
|
|
494
|
+
|
|
495
|
+
try:
|
|
496
|
+
await helpers.async_bulk(
|
|
497
|
+
client=self._async_client,
|
|
498
|
+
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
|
|
499
|
+
index=self._index,
|
|
500
|
+
refresh=True,
|
|
501
|
+
)
|
|
502
|
+
except Exception as e:
|
|
503
|
+
msg = f"Failed to delete documents from Elasticsearch: {e!s}"
|
|
504
|
+
raise DocumentStoreError(msg) from e
|
|
505
|
+
|
|
360
506
|
def _bm25_retrieval(
|
|
361
507
|
self,
|
|
362
508
|
query: str,
|
|
@@ -367,27 +513,15 @@ class ElasticsearchDocumentStore:
|
|
|
367
513
|
scale_score: bool = False,
|
|
368
514
|
) -> List[Document]:
|
|
369
515
|
"""
|
|
370
|
-
Retrieves
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
:param query: String to search in saved `Document`s' text.
|
|
380
|
-
:param filters: Filters applied to the retrieved `Document`s, for more info
|
|
381
|
-
see `ElasticsearchDocumentStore.filter_documents`.
|
|
382
|
-
:param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
|
|
383
|
-
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
|
|
384
|
-
for valid values.
|
|
385
|
-
:param top_k: Maximum number of `Document`s to return.
|
|
386
|
-
:param scale_score: If `True` scales the `Document``s scores between 0 and 1.
|
|
387
|
-
:raises ValueError: If `query` is an empty string
|
|
388
|
-
:returns: List of `Document` that match `query`
|
|
516
|
+
Retrieves documents using BM25 retrieval.
|
|
517
|
+
|
|
518
|
+
:param query: The query string to search for
|
|
519
|
+
:param filters: Optional filters to narrow down the search space
|
|
520
|
+
:param fuzziness: Fuzziness parameter for the search query
|
|
521
|
+
:param top_k: Maximum number of documents to return
|
|
522
|
+
:param scale_score: Whether to scale the similarity score to the range [0,1]
|
|
523
|
+
:returns: List of Documents that match the query
|
|
389
524
|
"""
|
|
390
|
-
|
|
391
525
|
if not query:
|
|
392
526
|
msg = "query must be a non empty string"
|
|
393
527
|
raise ValueError(msg)
|
|
@@ -421,35 +555,79 @@ class ElasticsearchDocumentStore:
|
|
|
421
555
|
|
|
422
556
|
return documents
|
|
423
557
|
|
|
424
|
-
def _embedding_retrieval(
|
|
558
|
+
async def _bm25_retrieval_async(
|
|
425
559
|
self,
|
|
426
|
-
|
|
560
|
+
query: str,
|
|
427
561
|
*,
|
|
428
562
|
filters: Optional[Dict[str, Any]] = None,
|
|
563
|
+
fuzziness: str = "AUTO",
|
|
429
564
|
top_k: int = 10,
|
|
430
|
-
|
|
565
|
+
scale_score: bool = False,
|
|
431
566
|
) -> List[Document]:
|
|
432
567
|
"""
|
|
433
|
-
|
|
568
|
+
Asynchronously retrieves documents using BM25 retrieval.
|
|
569
|
+
|
|
570
|
+
:param query: The query string to search for
|
|
571
|
+
:param filters: Optional filters to narrow down the search space
|
|
572
|
+
:param fuzziness: Fuzziness parameter for the search query
|
|
573
|
+
:param top_k: Maximum number of documents to return
|
|
574
|
+
:param scale_score: Whether to scale the similarity score to the range [0,1]
|
|
575
|
+
:returns: List of Documents that match the query
|
|
576
|
+
"""
|
|
577
|
+
self._ensure_initialized()
|
|
578
|
+
|
|
579
|
+
if not query:
|
|
580
|
+
msg = "query must be a non empty string"
|
|
581
|
+
raise ValueError(msg)
|
|
582
|
+
|
|
583
|
+
# Prepare the search body
|
|
584
|
+
search_body = {
|
|
585
|
+
"size": top_k,
|
|
586
|
+
"query": {
|
|
587
|
+
"bool": {
|
|
588
|
+
"must": [
|
|
589
|
+
{
|
|
590
|
+
"multi_match": {
|
|
591
|
+
"query": query,
|
|
592
|
+
"type": "most_fields",
|
|
593
|
+
"operator": "OR",
|
|
594
|
+
"fuzziness": fuzziness,
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
]
|
|
598
|
+
}
|
|
599
|
+
},
|
|
600
|
+
}
|
|
434
601
|
|
|
435
|
-
|
|
602
|
+
if filters:
|
|
603
|
+
search_body["query"]["bool"]["filter"] = _normalize_filters(filters) # type:ignore
|
|
604
|
+
|
|
605
|
+
documents = await self._search_documents_async(**search_body)
|
|
606
|
+
|
|
607
|
+
if scale_score:
|
|
608
|
+
for doc in documents:
|
|
609
|
+
if doc.score is not None:
|
|
610
|
+
doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))
|
|
436
611
|
|
|
437
|
-
|
|
438
|
-
`ElasticsearchDocumentStore` nor called directly.
|
|
439
|
-
`ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
|
|
612
|
+
return documents
|
|
440
613
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
:
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
:raises ValueError: If `query_embedding` is an empty list.
|
|
450
|
-
:returns: List of `Document` that are most similar to `query_embedding`.
|
|
614
|
+
def _embedding_retrieval(
|
|
615
|
+
self,
|
|
616
|
+
query_embedding: List[float],
|
|
617
|
+
*,
|
|
618
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
619
|
+
top_k: int = 10,
|
|
620
|
+
num_candidates: Optional[int] = None,
|
|
621
|
+
) -> List[Document]:
|
|
451
622
|
"""
|
|
623
|
+
Retrieves documents using dense vector similarity search.
|
|
452
624
|
|
|
625
|
+
:param query_embedding: Embedding vector to search for
|
|
626
|
+
:param filters: Optional filters to narrow down the search space
|
|
627
|
+
:param top_k: Maximum number of documents to return
|
|
628
|
+
:param num_candidates: Number of candidates to consider in the search
|
|
629
|
+
:returns: List of Documents most similar to query_embedding
|
|
630
|
+
"""
|
|
453
631
|
if not query_embedding:
|
|
454
632
|
msg = "query_embedding must be a non-empty list of floats"
|
|
455
633
|
raise ValueError(msg)
|
|
@@ -471,3 +649,45 @@ class ElasticsearchDocumentStore:
|
|
|
471
649
|
|
|
472
650
|
docs = self._search_documents(**body)
|
|
473
651
|
return docs
|
|
652
|
+
|
|
653
|
+
async def _embedding_retrieval_async(
|
|
654
|
+
self,
|
|
655
|
+
query_embedding: List[float],
|
|
656
|
+
*,
|
|
657
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
658
|
+
top_k: int = 10,
|
|
659
|
+
num_candidates: Optional[int] = None,
|
|
660
|
+
) -> List[Document]:
|
|
661
|
+
"""
|
|
662
|
+
Asynchronously retrieves documents using dense vector similarity search.
|
|
663
|
+
|
|
664
|
+
:param query_embedding: Embedding vector to search for
|
|
665
|
+
:param filters: Optional filters to narrow down the search space
|
|
666
|
+
:param top_k: Maximum number of documents to return
|
|
667
|
+
:param num_candidates: Number of candidates to consider in the search
|
|
668
|
+
:returns: List of Documents most similar to query_embedding
|
|
669
|
+
"""
|
|
670
|
+
self._ensure_initialized()
|
|
671
|
+
|
|
672
|
+
if not query_embedding:
|
|
673
|
+
msg = "query_embedding must be a non-empty list of floats"
|
|
674
|
+
raise ValueError(msg)
|
|
675
|
+
|
|
676
|
+
# If num_candidates is not set, use top_k * 10 as default
|
|
677
|
+
if num_candidates is None:
|
|
678
|
+
num_candidates = top_k * 10
|
|
679
|
+
|
|
680
|
+
# Prepare the search body
|
|
681
|
+
search_body = {
|
|
682
|
+
"knn": {
|
|
683
|
+
"field": "embedding",
|
|
684
|
+
"query_vector": query_embedding,
|
|
685
|
+
"k": top_k,
|
|
686
|
+
"num_candidates": num_candidates,
|
|
687
|
+
},
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
if filters:
|
|
691
|
+
search_body["knn"]["filter"] = _normalize_filters(filters)
|
|
692
|
+
|
|
693
|
+
return await self._search_documents_async(**search_body)
|
|
File without changes
|
{elasticsearch_haystack-2.0.0.dist-info → elasticsearch_haystack-3.0.0.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|