elasticsearch-haystack 2.0.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of elasticsearch-haystack might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: elasticsearch-haystack
3
- Version: 2.0.0
3
+ Version: 3.0.0
4
4
  Summary: Haystack 2.x Document Store for ElasticSearch
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -11,15 +11,15 @@ License-File: LICENSE
11
11
  Classifier: Development Status :: 4 - Beta
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Programming Language :: Python
14
- Classifier: Programming Language :: Python :: 3.8
15
14
  Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: Implementation :: CPython
19
18
  Classifier: Programming Language :: Python :: Implementation :: PyPy
20
- Requires-Python: >=3.8
19
+ Requires-Python: >=3.9
20
+ Requires-Dist: aiohttp
21
21
  Requires-Dist: elasticsearch<9,>=8
22
- Requires-Dist: haystack-ai
22
+ Requires-Dist: haystack-ai>=2.11.0
23
23
  Description-Content-Type: text/markdown
24
24
 
25
25
  [![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml)
@@ -1,10 +1,10 @@
1
1
  haystack_integrations/components/retrievers/elasticsearch/__init__.py,sha256=cSJBsYjz_T4kK-M-auAHVUnYIcgUqqwwQe_hsF0_IG4,307
2
- haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha256=XA6UiNFb59CMM5LSoPmNDe3IzZ7ty7HViSaU2ZT4--w,5851
3
- haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=ZL9kHi6tCzks1_GXoOIRVLcN4BWnaMqN6t-JcwdTfao,5992
2
+ haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha256=ISHc6elYXoDXDvC62_3bMMCk_Dv67jvZIgQBCZ1ZHdw,7012
3
+ haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=jHDLMeecpf-DhvbRM1AAq2kIJn7xMNTR9vkm-FhHH7k,7332
4
4
  haystack_integrations/document_stores/elasticsearch/__init__.py,sha256=YTfu94dtVUBogbJFr1aJrKuaI6-Bw9VuHfPoyU7M8os,207
5
- haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=lNHnzVm30dHdZr6jJtRHY212r5fN7a2w_PEUCdoseA4,19817
5
+ haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=pZ0pPyOCPTCKNYD4q5YbLrslSGTIbVPj60U18-BImX8,27406
6
6
  haystack_integrations/document_stores/elasticsearch/filters.py,sha256=Umip-PP4uFjuWeB1JWkKhaKClQ0VpiykoDlDu99wIV0,9759
7
- elasticsearch_haystack-2.0.0.dist-info/METADATA,sha256=wTRyUYeJy0jvMOa9t0JtWHv105JE2awalXH7pmVSmyI,2168
8
- elasticsearch_haystack-2.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
- elasticsearch_haystack-2.0.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
10
- elasticsearch_haystack-2.0.0.dist-info/RECORD,,
7
+ elasticsearch_haystack-3.0.0.dist-info/METADATA,sha256=E0ClBwzkNrT0g2L39vhe1jpnFNOvYQzeR50jCcHfw5c,2149
8
+ elasticsearch_haystack-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ elasticsearch_haystack-3.0.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
10
+ elasticsearch_haystack-3.0.0.dist-info/RECORD,,
@@ -120,7 +120,7 @@ class ElasticsearchBM25Retriever:
120
120
  """
121
121
  Retrieve documents using the BM25 keyword-based algorithm.
122
122
 
123
- :param query: String to search in `Document`s' text.
123
+ :param query: String to search in the `Document`s text.
124
124
  :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
125
125
  the `filter_policy` chosen at retriever initialization. See init method docstring for more
126
126
  details.
@@ -137,3 +137,26 @@ class ElasticsearchBM25Retriever:
137
137
  scale_score=self._scale_score,
138
138
  )
139
139
  return {"documents": docs}
140
+
141
+ @component.output_types(documents=List[Document])
142
+ async def run_async(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
143
+ """
144
+ Asynchronously retrieve documents using the BM25 keyword-based algorithm.
145
+
146
+ :param query: String to search in the `Document` text.
147
+ :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
148
+ the `filter_policy` chosen at retriever initialization. See init method docstring for more
149
+ details.
150
+ :param top_k: Maximum number of `Document` to return.
151
+ :returns: A dictionary with the following keys:
152
+ - `documents`: List of `Document`s that match the query.
153
+ """
154
+ filters = apply_filter_policy(self._filter_policy, self._filters, filters)
155
+ docs = await self._document_store._bm25_retrieval_async(
156
+ query=query,
157
+ filters=filters,
158
+ fuzziness=self._fuzziness,
159
+ top_k=top_k or self._top_k,
160
+ scale_score=self._scale_score,
161
+ )
162
+ return {"documents": docs}
@@ -119,10 +119,11 @@ class ElasticsearchEmbeddingRetriever:
119
119
  Retrieve documents using a vector similarity metric.
120
120
 
121
121
  :param query_embedding: Embedding of the query.
122
- :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
123
- the `filter_policy` chosen at retriever initialization. See init method docstring for more
124
- details.
125
- :param top_k: Maximum number of `Document`s to return.
122
+ :param filters: Filters applied when fetching documents from the Document Store.
123
+ Filters are applied during the approximate kNN search to ensure the Retriever returns
124
+ `top_k` matching documents.
125
+ The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
126
+ :param top_k: Maximum number of documents to return.
126
127
  :returns: A dictionary with the following keys:
127
128
  - `documents`: List of `Document`s most similar to the given `query_embedding`
128
129
  """
@@ -134,3 +135,28 @@ class ElasticsearchEmbeddingRetriever:
134
135
  num_candidates=self._num_candidates,
135
136
  )
136
137
  return {"documents": docs}
138
+
139
+ @component.output_types(documents=List[Document])
140
+ async def run_async(
141
+ self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
142
+ ):
143
+ """
144
+ Asynchronously retrieve documents using a vector similarity metric.
145
+
146
+ :param query_embedding: Embedding of the query.
147
+ :param filters: Filters applied when fetching documents from the Document Store.
148
+ Filters are applied during the approximate kNN search to ensure the Retriever returns
149
+ `top_k` matching documents.
150
+ The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
151
+ :param top_k: Maximum number of documents to return.
152
+ :returns: A dictionary with the following keys:
153
+ - `documents`: List of `Document`s that match the query.
154
+ """
155
+ filters = apply_filter_policy(self._filter_policy, self._filters, filters)
156
+ docs = await self._document_store._embedding_retrieval_async(
157
+ query_embedding=query_embedding,
158
+ filters=filters,
159
+ top_k=top_k or self._top_k,
160
+ num_candidates=self._num_candidates,
161
+ )
162
+ return {"documents": docs}
@@ -1,20 +1,20 @@
1
1
  # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0
4
- import logging
5
- from typing import Any, Dict, List, Literal, Mapping, Optional, Union
4
+ from collections.abc import Mapping
5
+ from typing import Any, Dict, List, Literal, Optional, Union
6
6
 
7
7
  import numpy as np
8
8
 
9
9
  # There are no import stubs for elastic_transport and elasticsearch so mypy fails
10
10
  from elastic_transport import NodeConfig # type: ignore[import-not-found]
11
- from haystack import default_from_dict, default_to_dict
11
+ from haystack import default_from_dict, default_to_dict, logging
12
12
  from haystack.dataclasses import Document
13
13
  from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
14
14
  from haystack.document_stores.types import DuplicatePolicy
15
15
  from haystack.version import __version__ as haystack_version
16
16
 
17
- from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found]
17
+ from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers # type: ignore[import-not-found]
18
18
 
19
19
  from .filters import _normalize_filters
20
20
 
@@ -30,11 +30,12 @@ Hosts = Union[str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]]]
30
30
  # Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly
31
31
  # all be mapped to scores ~1.
32
32
  BM25_SCALING_FACTOR = 8
33
+ DOC_ALREADY_EXISTS = 409
33
34
 
34
35
 
35
36
  class ElasticsearchDocumentStore:
36
37
  """
37
- ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own
38
+ An ElasticsearchDocumentStore instance that works with Elastic Cloud or your own
38
39
  Elasticsearch cluster.
39
40
 
40
41
  Usage example (Elastic Cloud):
@@ -93,28 +94,39 @@ class ElasticsearchDocumentStore:
93
94
  """
94
95
  self._hosts = hosts
95
96
  self._client = None
97
+ self._async_client = None
96
98
  self._index = index
97
99
  self._embedding_similarity_function = embedding_similarity_function
98
100
  self._custom_mapping = custom_mapping
99
101
  self._kwargs = kwargs
102
+ self._initialized = False
100
103
 
101
104
  if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
102
105
  msg = "custom_mapping must be a dictionary"
103
106
  raise ValueError(msg)
104
107
 
105
- @property
106
- def client(self) -> Elasticsearch:
107
- if self._client is None:
108
+ def _ensure_initialized(self):
109
+ """
110
+ Ensures both sync and async clients are initialized and the index exists.
111
+ """
112
+ if not self._initialized:
108
113
  headers = self._kwargs.pop("headers", {})
109
114
  headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
110
115
 
111
- client = Elasticsearch(
116
+ # Initialize both sync and async clients
117
+ self._client = Elasticsearch(
112
118
  self._hosts,
113
119
  headers=headers,
114
120
  **self._kwargs,
115
121
  )
122
+ self._async_client = AsyncElasticsearch(
123
+ self._hosts,
124
+ headers=headers,
125
+ **self._kwargs,
126
+ )
127
+
116
128
  # Check client connection, this will raise if not connected
117
- client.info()
129
+ self._client.info()
118
130
 
119
131
  if self._custom_mapping:
120
132
  mappings = self._custom_mapping
@@ -143,13 +155,27 @@ class ElasticsearchDocumentStore:
143
155
  }
144
156
 
145
157
  # Create the index if it doesn't exist
146
- if not client.indices.exists(index=self._index):
147
- client.indices.create(index=self._index, mappings=mappings)
158
+ if not self._client.indices.exists(index=self._index):
159
+ self._client.indices.create(index=self._index, mappings=mappings)
148
160
 
149
- self._client = client
161
+ self._initialized = True
150
162
 
163
+ @property
164
+ def client(self) -> Elasticsearch:
165
+ """
166
+ Returns the synchronous Elasticsearch client, initializing it if necessary.
167
+ """
168
+ self._ensure_initialized()
151
169
  return self._client
152
170
 
171
+ @property
172
+ def async_client(self) -> AsyncElasticsearch:
173
+ """
174
+ Returns the asynchronous Elasticsearch client, initializing it if necessary.
175
+ """
176
+ self._ensure_initialized()
177
+ return self._async_client
178
+
153
179
  def to_dict(self) -> Dict[str, Any]:
154
180
  """
155
181
  Serializes the component to a dictionary.
@@ -184,15 +210,26 @@ class ElasticsearchDocumentStore:
184
210
  def count_documents(self) -> int:
185
211
  """
186
212
  Returns how many documents are present in the document store.
187
- :returns: Number of documents in the document store.
213
+
214
+ :returns:
215
+ Number of documents in the document store.
188
216
  """
217
+ self._ensure_initialized()
189
218
  return self.client.count(index=self._index)["count"]
190
219
 
220
+ async def count_documents_async(self) -> int:
221
+ """
222
+ Asynchronously returns how many documents are present in the document store.
223
+ :returns: Number of documents in the document store.
224
+ """
225
+ self._ensure_initialized()
226
+ result = await self._async_client.count(index=self._index) # type: ignore
227
+ return result["count"]
228
+
191
229
  def _search_documents(self, **kwargs) -> List[Document]:
192
230
  """
193
231
  Calls the Elasticsearch client's search method and handles pagination.
194
232
  """
195
-
196
233
  top_k = kwargs.get("size")
197
234
  if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
198
235
  top_k = kwargs["knn"]["k"]
@@ -207,7 +244,7 @@ class ElasticsearchDocumentStore:
207
244
  **kwargs,
208
245
  )
209
246
 
210
- documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
247
+ documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
211
248
  from_ = len(documents)
212
249
 
213
250
  if top_k is not None and from_ >= top_k:
@@ -216,6 +253,31 @@ class ElasticsearchDocumentStore:
216
253
  break
217
254
  return documents
218
255
 
256
+ async def _search_documents_async(self, **kwargs) -> List[Document]:
257
+ """
258
+ Asynchronously calls the Elasticsearch client's search method and handles pagination.
259
+ """
260
+ top_k = kwargs.get("size")
261
+ if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
262
+ top_k = kwargs["knn"]["k"]
263
+
264
+ documents: List[Document] = []
265
+ from_ = 0
266
+
267
+ # handle pagination
268
+ while True:
269
+ res = await self._async_client.search(index=self._index, from_=from_, **kwargs) # type: ignore
270
+ documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
271
+ from_ = len(documents)
272
+
273
+ if top_k is not None and from_ >= top_k:
274
+ break
275
+
276
+ if from_ >= res["hits"]["total"]["value"]:
277
+ break
278
+
279
+ return documents
280
+
219
281
  def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
220
282
  """
221
283
  The main query method for the document store. It retrieves all documents that match the filters.
@@ -229,10 +291,45 @@ class ElasticsearchDocumentStore:
229
291
  msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
230
292
  raise ValueError(msg)
231
293
 
294
+ self._ensure_initialized()
232
295
  query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
233
296
  documents = self._search_documents(query=query)
234
297
  return documents
235
298
 
299
+ async def filter_documents_async(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
300
+ """
301
+ Asynchronously retrieves all documents that match the filters.
302
+
303
+ :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
304
+ see the official Elasticsearch
305
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
306
+ :returns: List of `Document`s that match the filters.
307
+ """
308
+ if filters and "operator" not in filters and "conditions" not in filters:
309
+ msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
310
+ raise ValueError(msg)
311
+
312
+ self._ensure_initialized()
313
+ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
314
+ documents = await self._search_documents_async(query=query)
315
+ return documents
316
+
317
+ @staticmethod
318
+ def _deserialize_document(hit: Dict[str, Any]) -> Document:
319
+ """
320
+ Creates a `Document` from the search hit provided.
321
+ This is mostly useful in self.filter_documents().
322
+ :param hit: A search hit from Elasticsearch.
323
+ :returns: `Document` created from the search hit.
324
+ """
325
+ data = hit["_source"]
326
+
327
+ if "highlight" in hit:
328
+ data["metadata"]["highlighted"] = hit["highlight"]
329
+ data["score"] = hit["_score"]
330
+
331
+ return Document.from_dict(data)
332
+
236
333
  def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
237
334
  """
238
335
  Writes `Document`s to Elasticsearch.
@@ -258,23 +355,15 @@ class ElasticsearchDocumentStore:
258
355
  elasticsearch_actions = []
259
356
  for doc in documents:
260
357
  doc_dict = doc.to_dict()
261
- if "dataframe" in doc_dict:
262
- dataframe = doc_dict.pop("dataframe")
263
- if dataframe:
264
- logger.warning(
265
- "Document %s has the `dataframe` field set,"
266
- "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
267
- "The `dataframe` field will soon be removed from Haystack Document.",
268
- doc.id,
269
- )
358
+
270
359
  if "sparse_embedding" in doc_dict:
271
360
  sparse_embedding = doc_dict.pop("sparse_embedding", None)
272
361
  if sparse_embedding:
273
362
  logger.warning(
274
- "Document %s has the `sparse_embedding` field set,"
363
+ "Document {doc_id} has the `sparse_embedding` field set,"
275
364
  "but storing sparse embeddings in Elasticsearch is not currently supported."
276
365
  "The `sparse_embedding` field will be ignored.",
277
- doc.id,
366
+ doc_id=doc.id,
278
367
  )
279
368
  elasticsearch_actions.append(
280
369
  {
@@ -315,40 +404,78 @@ class ElasticsearchDocumentStore:
315
404
 
316
405
  return documents_written
317
406
 
318
- @staticmethod
319
- def _deserialize_document(hit: Dict[str, Any]) -> Document:
407
+ async def write_documents_async(
408
+ self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE
409
+ ) -> int:
320
410
  """
321
- Creates a `Document` from the search hit provided.
322
-
323
- This is mostly useful in self.filter_documents().
411
+ Asynchronously writes `Document`s to Elasticsearch.
324
412
 
325
- :param hit: A search hit from Elasticsearch.
326
- :returns: `Document` created from the search hit.
413
+ :param documents: List of Documents to write to the document store.
414
+ :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
415
+ :raises ValueError: If `documents` is not a list of `Document`s.
416
+ :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
417
+ `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
418
+ :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
419
+ :returns: Number of documents written to the document store.
327
420
  """
328
- data = hit["_source"]
421
+ self._ensure_initialized()
329
422
 
330
- if "highlight" in hit:
331
- data["metadata"]["highlighted"] = hit["highlight"]
332
- data["score"] = hit["_score"]
423
+ if len(documents) > 0:
424
+ if not isinstance(documents[0], Document):
425
+ msg = "param 'documents' must contain a list of objects of type Document"
426
+ raise ValueError(msg)
333
427
 
334
- if "dataframe" in data:
335
- dataframe = data.pop("dataframe")
336
- if dataframe:
337
- logger.warning(
338
- "Document %s has the `dataframe` field set,"
339
- "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
340
- "The `dataframe` field will soon be removed from Haystack Document.",
341
- data["id"],
342
- )
343
- return Document.from_dict(data)
428
+ if policy == DuplicatePolicy.NONE:
429
+ policy = DuplicatePolicy.FAIL
430
+
431
+ actions = []
432
+ for doc in documents:
433
+ doc_dict = doc.to_dict()
434
+
435
+ if "sparse_embedding" in doc_dict:
436
+ sparse_embedding = doc_dict.pop("sparse_embedding", None)
437
+ if sparse_embedding:
438
+ logger.warning(
439
+ "Document {doc_id} has the `sparse_embedding` field set,"
440
+ "but storing sparse embeddings in Elasticsearch is not currently supported."
441
+ "The `sparse_embedding` field will be ignored.",
442
+ doc_id=doc.id,
443
+ )
444
+
445
+ action = {
446
+ "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
447
+ "_id": doc.id,
448
+ "_source": doc_dict,
449
+ }
450
+ actions.append(action)
451
+
452
+ try:
453
+ success, failed = await helpers.async_bulk(
454
+ client=self._async_client,
455
+ actions=actions,
456
+ index=self._index,
457
+ refresh=True,
458
+ raise_on_error=False,
459
+ )
460
+ if failed:
461
+ if policy == DuplicatePolicy.FAIL:
462
+ for error in failed:
463
+ if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
464
+ msg = f"ID '{error['create']['_id']}' already exists in the document store"
465
+ raise DuplicateDocumentError(msg)
466
+ msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
467
+ raise DocumentStoreError(msg)
468
+ return success
469
+ except Exception as e:
470
+ msg = f"Failed to write documents to Elasticsearch: {e!s}"
471
+ raise DocumentStoreError(msg) from e
344
472
 
345
473
  def delete_documents(self, document_ids: List[str]) -> None:
346
474
  """
347
- Deletes all `Document`s with a matching `document_ids` from the document store.
475
+ Deletes all documents with a matching document_ids from the document store.
348
476
 
349
- :param document_ids: the object IDs to delete
477
+ :param document_ids: the document ids to delete
350
478
  """
351
-
352
479
  helpers.bulk(
353
480
  client=self.client,
354
481
  actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
@@ -357,6 +484,25 @@ class ElasticsearchDocumentStore:
357
484
  raise_on_error=False,
358
485
  )
359
486
 
487
+ async def delete_documents_async(self, document_ids: List[str]) -> None:
488
+ """
489
+ Asynchronously deletes all documents with a matching document_ids from the document store.
490
+
491
+ :param document_ids: the document ids to delete
492
+ """
493
+ self._ensure_initialized()
494
+
495
+ try:
496
+ await helpers.async_bulk(
497
+ client=self._async_client,
498
+ actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
499
+ index=self._index,
500
+ refresh=True,
501
+ )
502
+ except Exception as e:
503
+ msg = f"Failed to delete documents from Elasticsearch: {e!s}"
504
+ raise DocumentStoreError(msg) from e
505
+
360
506
  def _bm25_retrieval(
361
507
  self,
362
508
  query: str,
@@ -367,27 +513,15 @@ class ElasticsearchDocumentStore:
367
513
  scale_score: bool = False,
368
514
  ) -> List[Document]:
369
515
  """
370
- Retrieves `Document`s from Elasticsearch using the BM25 search algorithm.
371
-
372
- Even though this method is called `bm25_retrieval` it searches for `query`
373
- using the search algorithm `_client` was configured with.
374
-
375
- This method is not meant to be part of the public interface of
376
- `ElasticsearchDocumentStore` nor called directly.
377
- `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
378
-
379
- :param query: String to search in saved `Document`s' text.
380
- :param filters: Filters applied to the retrieved `Document`s, for more info
381
- see `ElasticsearchDocumentStore.filter_documents`.
382
- :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
383
- [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
384
- for valid values.
385
- :param top_k: Maximum number of `Document`s to return.
386
- :param scale_score: If `True` scales the `Document``s scores between 0 and 1.
387
- :raises ValueError: If `query` is an empty string
388
- :returns: List of `Document` that match `query`
516
+ Retrieves documents using BM25 retrieval.
517
+
518
+ :param query: The query string to search for
519
+ :param filters: Optional filters to narrow down the search space
520
+ :param fuzziness: Fuzziness parameter for the search query
521
+ :param top_k: Maximum number of documents to return
522
+ :param scale_score: Whether to scale the similarity score to the range [0,1]
523
+ :returns: List of Documents that match the query
389
524
  """
390
-
391
525
  if not query:
392
526
  msg = "query must be a non empty string"
393
527
  raise ValueError(msg)
@@ -421,35 +555,79 @@ class ElasticsearchDocumentStore:
421
555
 
422
556
  return documents
423
557
 
424
- def _embedding_retrieval(
558
+ async def _bm25_retrieval_async(
425
559
  self,
426
- query_embedding: List[float],
560
+ query: str,
427
561
  *,
428
562
  filters: Optional[Dict[str, Any]] = None,
563
+ fuzziness: str = "AUTO",
429
564
  top_k: int = 10,
430
- num_candidates: Optional[int] = None,
565
+ scale_score: bool = False,
431
566
  ) -> List[Document]:
432
567
  """
433
- Retrieves documents that are most similar to the query embedding using a vector similarity metric.
568
+ Asynchronously retrieves documents using BM25 retrieval.
569
+
570
+ :param query: The query string to search for
571
+ :param filters: Optional filters to narrow down the search space
572
+ :param fuzziness: Fuzziness parameter for the search query
573
+ :param top_k: Maximum number of documents to return
574
+ :param scale_score: Whether to scale the similarity score to the range [0,1]
575
+ :returns: List of Documents that match the query
576
+ """
577
+ self._ensure_initialized()
578
+
579
+ if not query:
580
+ msg = "query must be a non empty string"
581
+ raise ValueError(msg)
582
+
583
+ # Prepare the search body
584
+ search_body = {
585
+ "size": top_k,
586
+ "query": {
587
+ "bool": {
588
+ "must": [
589
+ {
590
+ "multi_match": {
591
+ "query": query,
592
+ "type": "most_fields",
593
+ "operator": "OR",
594
+ "fuzziness": fuzziness,
595
+ }
596
+ }
597
+ ]
598
+ }
599
+ },
600
+ }
434
601
 
435
- It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm.
602
+ if filters:
603
+ search_body["query"]["bool"]["filter"] = _normalize_filters(filters) # type:ignore
604
+
605
+ documents = await self._search_documents_async(**search_body)
606
+
607
+ if scale_score:
608
+ for doc in documents:
609
+ if doc.score is not None:
610
+ doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))
436
611
 
437
- This method is not meant to be part of the public interface of
438
- `ElasticsearchDocumentStore` nor called directly.
439
- `ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
612
+ return documents
440
613
 
441
- :param query_embedding: Embedding of the query.
442
- :param filters: Filters applied to the retrieved `Document`s.
443
- Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
444
- :param top_k: Maximum number of `Document`s to return.
445
- :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
446
- Increasing this value will improve search accuracy at the cost of slower search speeds.
447
- You can read more about it in the Elasticsearch
448
- [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
449
- :raises ValueError: If `query_embedding` is an empty list.
450
- :returns: List of `Document` that are most similar to `query_embedding`.
614
+ def _embedding_retrieval(
615
+ self,
616
+ query_embedding: List[float],
617
+ *,
618
+ filters: Optional[Dict[str, Any]] = None,
619
+ top_k: int = 10,
620
+ num_candidates: Optional[int] = None,
621
+ ) -> List[Document]:
451
622
  """
623
+ Retrieves documents using dense vector similarity search.
452
624
 
625
+ :param query_embedding: Embedding vector to search for
626
+ :param filters: Optional filters to narrow down the search space
627
+ :param top_k: Maximum number of documents to return
628
+ :param num_candidates: Number of candidates to consider in the search
629
+ :returns: List of Documents most similar to query_embedding
630
+ """
453
631
  if not query_embedding:
454
632
  msg = "query_embedding must be a non-empty list of floats"
455
633
  raise ValueError(msg)
@@ -471,3 +649,45 @@ class ElasticsearchDocumentStore:
471
649
 
472
650
  docs = self._search_documents(**body)
473
651
  return docs
652
+
653
+ async def _embedding_retrieval_async(
654
+ self,
655
+ query_embedding: List[float],
656
+ *,
657
+ filters: Optional[Dict[str, Any]] = None,
658
+ top_k: int = 10,
659
+ num_candidates: Optional[int] = None,
660
+ ) -> List[Document]:
661
+ """
662
+ Asynchronously retrieves documents using dense vector similarity search.
663
+
664
+ :param query_embedding: Embedding vector to search for
665
+ :param filters: Optional filters to narrow down the search space
666
+ :param top_k: Maximum number of documents to return
667
+ :param num_candidates: Number of candidates to consider in the search
668
+ :returns: List of Documents most similar to query_embedding
669
+ """
670
+ self._ensure_initialized()
671
+
672
+ if not query_embedding:
673
+ msg = "query_embedding must be a non-empty list of floats"
674
+ raise ValueError(msg)
675
+
676
+ # If num_candidates is not set, use top_k * 10 as default
677
+ if num_candidates is None:
678
+ num_candidates = top_k * 10
679
+
680
+ # Prepare the search body
681
+ search_body = {
682
+ "knn": {
683
+ "field": "embedding",
684
+ "query_vector": query_embedding,
685
+ "k": top_k,
686
+ "num_candidates": num_candidates,
687
+ },
688
+ }
689
+
690
+ if filters:
691
+ search_body["knn"]["filter"] = _normalize_filters(filters)
692
+
693
+ return await self._search_documents_async(**search_body)