elasticsearch-haystack 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of elasticsearch-haystack might be problematic. Click here for more details.

Files changed (19) hide show
  1. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/CHANGELOG.md +68 -14
  2. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/PKG-INFO +2 -3
  3. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/pyproject.toml +6 -3
  4. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +24 -1
  5. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +30 -4
  6. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +321 -75
  7. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_bm25_retriever.py +63 -0
  8. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_document_store.py +174 -10
  9. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_embedding_retriever.py +65 -0
  10. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/.gitignore +0 -0
  11. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/LICENSE +0 -0
  12. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/README.md +0 -0
  13. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/docker-compose.yml +0 -0
  14. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/pydoc/config.yml +0 -0
  15. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
  16. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
  17. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
  18. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/__init__.py +0 -0
  19. {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_filters.py +0 -0
@@ -1,12 +1,27 @@
1
1
  # Changelog
2
2
 
3
+ ## [integrations/elasticsearch-v2.0.0] - 2025-02-14
4
+
5
+ ### 🧹 Chores
6
+
7
+ - Inherit from `FilterDocumentsTestWithDataframe` in Document Stores (#1290)
8
+ - [**breaking**] Elasticsearch - remove dataframe support (#1377)
9
+
10
+
3
11
  ## [integrations/elasticsearch-v1.0.1] - 2024-10-28
4
12
 
5
- ### ⚙️ Miscellaneous Tasks
13
+ ### ⚙️ CI
14
+
15
+ - Adopt uv as installer (#1142)
16
+
17
+ ### 🧹 Chores
6
18
 
7
19
  - Update changelog after removing legacy filters (#1083)
8
20
  - Update ruff linting scripts and settings (#1105)
9
- - Adopt uv as installer (#1142)
21
+
22
+ ### 🌀 Miscellaneous
23
+
24
+ - Fix: Elasticsearch - allow passing headers (#1156)
10
25
 
11
26
  ## [integrations/elasticsearch-v1.0.0] - 2024-09-12
12
27
 
@@ -23,18 +38,32 @@
23
38
 
24
39
  - Do not retry tests in `hatch run test` command (#954)
25
40
 
26
- ### ⚙️ Miscellaneous Tasks
41
+ ### ⚙️ CI
27
42
 
28
43
  - Retry tests to reduce flakyness (#836)
44
+
45
+ ### 🧹 Chores
46
+
29
47
  - Update ruff invocation to include check parameter (#853)
30
48
  - ElasticSearch - remove legacy filters elasticsearch (#1078)
31
49
 
50
+ ### 🌀 Miscellaneous
51
+
52
+ - Ci: install `pytest-rerunfailures` where needed; add retry config to `test-cov` script (#845)
53
+ - Chore: Minor retriever pydoc fix (#884)
54
+ - Chore: elasticsearch - ruff update, don't ruff tests (#999)
55
+
32
56
  ## [integrations/elasticsearch-v0.5.0] - 2024-05-24
33
57
 
34
58
  ### 🐛 Bug Fixes
35
59
 
36
60
  - Add support for custom mapping in ElasticsearchDocumentStore (#721)
37
61
 
62
+ ### 🌀 Miscellaneous
63
+
64
+ - Chore: add license classifiers (#680)
65
+ - Chore: change the pydoc renderer class (#718)
66
+
38
67
  ## [integrations/elasticsearch-v0.4.0] - 2024-04-03
39
68
 
40
69
  ### 📚 Documentation
@@ -43,49 +72,64 @@
43
72
  - Review Elastic (#541)
44
73
  - Disable-class-def (#556)
45
74
 
75
+ ### 🌀 Miscellaneous
76
+
77
+ - Make tests show coverage (#566)
78
+ - Refactor tests (#574)
79
+ - Remove references to Python 3.7 (#601)
80
+ - Make Document Stores initially skip `SparseEmbedding` (#606)
81
+ - [Elasticsearch] fix: Filters not working with metadata that contain a space or capitalization (#639)
82
+
46
83
  ## [integrations/elasticsearch-v0.3.0] - 2024-02-23
47
84
 
48
85
  ### 🐛 Bug Fixes
49
86
 
50
87
  - Fix order of API docs (#447)
51
88
 
52
- This PR will also push the docs to Readme
53
-
54
89
  ### 📚 Documentation
55
90
 
56
91
  - Update category slug (#442)
57
92
 
58
- ### Elasticsearch
93
+ ### 🌀 Miscellaneous
59
94
 
95
+ - Generate api docs (#322)
96
+ - Add filters to run function in retrievers of elasticsearch (#440)
60
97
  - Add user-agent header (#457)
61
98
 
62
- ### Feat
99
+ ## [integrations/elasticsearch-v0.2.0] - 2024-01-19
63
100
 
64
- - Add filters to run function in retrievers of elasticsearch (#440)
101
+ ### 🌀 Miscellaneous
65
102
 
66
- ### Elasticsearch
103
+ - Mount import paths under haystack_integrations (#244)
67
104
 
68
- - Generate api docs (#322)
105
+ ## [integrations/elasticsearch-v0.1.3] - 2024-01-18
69
106
 
70
- ## [integrations/elasticsearch-v0.2.0] - 2024-01-19
107
+ ### 🌀 Miscellaneous
71
108
 
72
- ## [integrations/elasticsearch-v0.1.3] - 2024-01-18
109
+ - Added top_k argument in the run function of ElasticSearcBM25Retriever (#130)
110
+ - Add more docstrings for `ElasticsearchDocumentStore` and `ElasticsearchBM25Retriever` (#184)
111
+ - Elastic - update imports for beta5 (#238)
73
112
 
74
113
  ## [integrations/elasticsearch-v0.1.2] - 2023-12-20
75
114
 
76
115
  ### 🐛 Bug Fixes
77
116
 
78
- - Fix project urls (#96)
117
+ - Fix project URLs (#96)
79
118
 
80
119
  ### 🚜 Refactor
81
120
 
82
121
  - Use `hatch_vcs` to manage integrations versioning (#103)
83
122
 
123
+ ### 🌀 Miscellaneous
124
+
125
+ - Update elasticsearch test badge (#79)
126
+ - [Elasticsearch] - BM25 retrieval: not all terms must mandatorily match (#125)
127
+
84
128
  ## [integrations/elasticsearch-v0.1.1] - 2023-12-05
85
129
 
86
130
  ### 🐛 Bug Fixes
87
131
 
88
- - Fix import and increase version (#77)
132
+ - Document Stores: fix protocol import (#77)
89
133
 
90
134
  ## [integrations/elasticsearch-v0.1.0] - 2023-12-04
91
135
 
@@ -93,6 +137,16 @@ This PR will also push the docs to Readme
93
137
 
94
138
  - Fix license headers
95
139
 
140
+ ### 🌀 Miscellaneous
141
+
142
+ - Remove Document Store decorator (#76)
143
+
96
144
  ## [integrations/elasticsearch-v0.0.2] - 2023-11-29
97
145
 
146
+ ### 🌀 Miscellaneous
147
+
148
+ - Reorganize repository (#62)
149
+ - Update `ElasticSearchDocumentStore` to use latest `haystack-ai` version (#63)
150
+ - Bump elasticsearch_haystack to 0.0.2
151
+
98
152
  <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: elasticsearch-haystack
3
- Version: 2.0.0
3
+ Version: 2.1.0
4
4
  Summary: Haystack 2.x Document Store for ElasticSearch
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -11,13 +11,12 @@ License-File: LICENSE
11
11
  Classifier: Development Status :: 4 - Beta
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Programming Language :: Python
14
- Classifier: Programming Language :: Python :: 3.8
15
14
  Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: Implementation :: CPython
19
18
  Classifier: Programming Language :: Python :: Implementation :: PyPy
20
- Requires-Python: >=3.8
19
+ Requires-Python: >=3.9
21
20
  Requires-Dist: elasticsearch<9,>=8
22
21
  Requires-Dist: haystack-ai
23
22
  Description-Content-Type: text/markdown
@@ -7,7 +7,7 @@ name = "elasticsearch-haystack"
7
7
  dynamic = ["version"]
8
8
  description = 'Haystack 2.x Document Store for ElasticSearch'
9
9
  readme = "README.md"
10
- requires-python = ">=3.8"
10
+ requires-python = ">=3.9"
11
11
  license = "Apache-2.0"
12
12
  keywords = []
13
13
  authors = [{ name = "Silvano Cerza", email = "silvanocerza@gmail.com" }]
@@ -15,7 +15,6 @@ classifiers = [
15
15
  "License :: OSI Approved :: Apache Software License",
16
16
  "Development Status :: 4 - Beta",
17
17
  "Programming Language :: Python",
18
- "Programming Language :: Python :: 3.8",
19
18
  "Programming Language :: Python :: 3.9",
20
19
  "Programming Language :: Python :: 3.10",
21
20
  "Programming Language :: Python :: 3.11",
@@ -45,6 +44,7 @@ installer = "uv"
45
44
  dependencies = [
46
45
  "coverage[toml]>=6.5",
47
46
  "pytest",
47
+ "pytest-asyncio",
48
48
  "pytest-rerunfailures",
49
49
  "pytest-xdist",
50
50
  "haystack-pydoc-tools",
@@ -59,12 +59,13 @@ cov-retry = ["test-cov-retry", "cov-report"]
59
59
  docs = ["pydoc-markdown pydoc/config.yml"]
60
60
 
61
61
  [[tool.hatch.envs.all.matrix]]
62
- python = ["3.8", "3.9", "3.10", "3.11"]
62
+ python = [ "3.9", "3.10", "3.11"]
63
63
 
64
64
  [tool.hatch.envs.lint]
65
65
  installer = "uv"
66
66
  detached = true
67
67
  dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
68
+
68
69
  [tool.hatch.envs.lint.scripts]
69
70
  typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
70
71
  style = ["ruff check {args:}", "black --check --diff {args:.}"]
@@ -157,6 +158,8 @@ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
157
158
  [tool.pytest.ini_options]
158
159
  minversion = "6.0"
159
160
  markers = ["unit: unit tests", "integration: integration tests"]
161
+ asyncio_mode = "auto"
162
+ asyncio_default_fixture_loop_scope = "class"
160
163
 
161
164
  [[tool.mypy.overrides]]
162
165
  module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
@@ -120,7 +120,7 @@ class ElasticsearchBM25Retriever:
120
120
  """
121
121
  Retrieve documents using the BM25 keyword-based algorithm.
122
122
 
123
- :param query: String to search in `Document`s' text.
123
+ :param query: String to search in the `Document`s text.
124
124
  :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
125
125
  the `filter_policy` chosen at retriever initialization. See init method docstring for more
126
126
  details.
@@ -137,3 +137,26 @@ class ElasticsearchBM25Retriever:
137
137
  scale_score=self._scale_score,
138
138
  )
139
139
  return {"documents": docs}
140
+
141
+ @component.output_types(documents=List[Document])
142
+ async def run_async(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
143
+ """
144
+ Asynchronously retrieve documents using the BM25 keyword-based algorithm.
145
+
146
+ :param query: String to search in the `Document` text.
147
+ :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
148
+ the `filter_policy` chosen at retriever initialization. See init method docstring for more
149
+ details.
150
+ :param top_k: Maximum number of `Document` to return.
151
+ :returns: A dictionary with the following keys:
152
+ - `documents`: List of `Document`s that match the query.
153
+ """
154
+ filters = apply_filter_policy(self._filter_policy, self._filters, filters)
155
+ docs = await self._document_store._bm25_retrieval_async(
156
+ query=query,
157
+ filters=filters,
158
+ fuzziness=self._fuzziness,
159
+ top_k=top_k or self._top_k,
160
+ scale_score=self._scale_score,
161
+ )
162
+ return {"documents": docs}
@@ -119,10 +119,11 @@ class ElasticsearchEmbeddingRetriever:
119
119
  Retrieve documents using a vector similarity metric.
120
120
 
121
121
  :param query_embedding: Embedding of the query.
122
- :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
123
- the `filter_policy` chosen at retriever initialization. See init method docstring for more
124
- details.
125
- :param top_k: Maximum number of `Document`s to return.
122
+ :param filters: Filters applied when fetching documents from the Document Store.
123
+ Filters are applied during the approximate kNN search to ensure the Retriever returns
124
+ `top_k` matching documents.
125
+ The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
126
+ :param top_k: Maximum number of documents to return.
126
127
  :returns: A dictionary with the following keys:
127
128
  - `documents`: List of `Document`s most similar to the given `query_embedding`
128
129
  """
@@ -134,3 +135,28 @@ class ElasticsearchEmbeddingRetriever:
134
135
  num_candidates=self._num_candidates,
135
136
  )
136
137
  return {"documents": docs}
138
+
139
+ @component.output_types(documents=List[Document])
140
+ async def run_async(
141
+ self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
142
+ ):
143
+ """
144
+ Asynchronously retrieve documents using a vector similarity metric.
145
+
146
+ :param query_embedding: Embedding of the query.
147
+ :param filters: Filters applied when fetching documents from the Document Store.
148
+ Filters are applied during the approximate kNN search to ensure the Retriever returns
149
+ `top_k` matching documents.
150
+ The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
151
+ :param top_k: Maximum number of documents to return.
152
+ :returns: A dictionary with the following keys:
153
+ - `documents`: List of `Document`s that match the query.
154
+ """
155
+ filters = apply_filter_policy(self._filter_policy, self._filters, filters)
156
+ docs = await self._document_store._embedding_retrieval_async(
157
+ query_embedding=query_embedding,
158
+ filters=filters,
159
+ top_k=top_k or self._top_k,
160
+ num_candidates=self._num_candidates,
161
+ )
162
+ return {"documents": docs}
@@ -2,7 +2,8 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
  import logging
5
- from typing import Any, Dict, List, Literal, Mapping, Optional, Union
5
+ from collections.abc import Mapping
6
+ from typing import Any, Dict, List, Literal, Optional, Union
6
7
 
7
8
  import numpy as np
8
9
 
@@ -14,7 +15,7 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumen
14
15
  from haystack.document_stores.types import DuplicatePolicy
15
16
  from haystack.version import __version__ as haystack_version
16
17
 
17
- from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found]
18
+ from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers # type: ignore[import-not-found]
18
19
 
19
20
  from .filters import _normalize_filters
20
21
 
@@ -30,6 +31,7 @@ Hosts = Union[str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]]]
30
31
  # Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly
31
32
  # all be mapped to scores ~1.
32
33
  BM25_SCALING_FACTOR = 8
34
+ DOC_ALREADY_EXISTS = 409
33
35
 
34
36
 
35
37
  class ElasticsearchDocumentStore:
@@ -93,28 +95,39 @@ class ElasticsearchDocumentStore:
93
95
  """
94
96
  self._hosts = hosts
95
97
  self._client = None
98
+ self._async_client = None
96
99
  self._index = index
97
100
  self._embedding_similarity_function = embedding_similarity_function
98
101
  self._custom_mapping = custom_mapping
99
102
  self._kwargs = kwargs
103
+ self._initialized = False
100
104
 
101
105
  if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
102
106
  msg = "custom_mapping must be a dictionary"
103
107
  raise ValueError(msg)
104
108
 
105
- @property
106
- def client(self) -> Elasticsearch:
107
- if self._client is None:
109
+ def _ensure_initialized(self):
110
+ """
111
+ Ensures both sync and async clients are initialized and the index exists.
112
+ """
113
+ if not self._initialized:
108
114
  headers = self._kwargs.pop("headers", {})
109
115
  headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
110
116
 
111
- client = Elasticsearch(
117
+ # Initialize both sync and async clients
118
+ self._client = Elasticsearch(
119
+ self._hosts,
120
+ headers=headers,
121
+ **self._kwargs,
122
+ )
123
+ self._async_client = AsyncElasticsearch(
112
124
  self._hosts,
113
125
  headers=headers,
114
126
  **self._kwargs,
115
127
  )
128
+
116
129
  # Check client connection, this will raise if not connected
117
- client.info()
130
+ self._client.info()
118
131
 
119
132
  if self._custom_mapping:
120
133
  mappings = self._custom_mapping
@@ -143,13 +156,27 @@ class ElasticsearchDocumentStore:
143
156
  }
144
157
 
145
158
  # Create the index if it doesn't exist
146
- if not client.indices.exists(index=self._index):
147
- client.indices.create(index=self._index, mappings=mappings)
159
+ if not self._client.indices.exists(index=self._index):
160
+ self._client.indices.create(index=self._index, mappings=mappings)
148
161
 
149
- self._client = client
162
+ self._initialized = True
150
163
 
164
+ @property
165
+ def client(self) -> Elasticsearch:
166
+ """
167
+ Returns the synchronous Elasticsearch client, initializing it if necessary.
168
+ """
169
+ self._ensure_initialized()
151
170
  return self._client
152
171
 
172
+ @property
173
+ def async_client(self) -> AsyncElasticsearch:
174
+ """
175
+ Returns the asynchronous Elasticsearch client, initializing it if necessary.
176
+ """
177
+ self._ensure_initialized()
178
+ return self._async_client
179
+
153
180
  def to_dict(self) -> Dict[str, Any]:
154
181
  """
155
182
  Serializes the component to a dictionary.
@@ -184,15 +211,26 @@ class ElasticsearchDocumentStore:
184
211
  def count_documents(self) -> int:
185
212
  """
186
213
  Returns how many documents are present in the document store.
187
- :returns: Number of documents in the document store.
214
+
215
+ :returns:
216
+ Number of documents in the document store.
188
217
  """
218
+ self._ensure_initialized()
189
219
  return self.client.count(index=self._index)["count"]
190
220
 
221
+ async def count_documents_async(self) -> int:
222
+ """
223
+ Asynchronously returns how many documents are present in the document store.
224
+ :returns: Number of documents in the document store.
225
+ """
226
+ self._ensure_initialized()
227
+ result = await self._async_client.count(index=self._index) # type: ignore
228
+ return result["count"]
229
+
191
230
  def _search_documents(self, **kwargs) -> List[Document]:
192
231
  """
193
232
  Calls the Elasticsearch client's search method and handles pagination.
194
233
  """
195
-
196
234
  top_k = kwargs.get("size")
197
235
  if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
198
236
  top_k = kwargs["knn"]["k"]
@@ -207,7 +245,7 @@ class ElasticsearchDocumentStore:
207
245
  **kwargs,
208
246
  )
209
247
 
210
- documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
248
+ documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
211
249
  from_ = len(documents)
212
250
 
213
251
  if top_k is not None and from_ >= top_k:
@@ -216,6 +254,31 @@ class ElasticsearchDocumentStore:
216
254
  break
217
255
  return documents
218
256
 
257
+ async def _search_documents_async(self, **kwargs) -> List[Document]:
258
+ """
259
+ Asynchronously calls the Elasticsearch client's search method and handles pagination.
260
+ """
261
+ top_k = kwargs.get("size")
262
+ if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
263
+ top_k = kwargs["knn"]["k"]
264
+
265
+ documents: List[Document] = []
266
+ from_ = 0
267
+
268
+ # handle pagination
269
+ while True:
270
+ res = await self._async_client.search(index=self._index, from_=from_, **kwargs) # type: ignore
271
+ documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
272
+ from_ = len(documents)
273
+
274
+ if top_k is not None and from_ >= top_k:
275
+ break
276
+
277
+ if from_ >= res["hits"]["total"]["value"]:
278
+ break
279
+
280
+ return documents
281
+
219
282
  def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
220
283
  """
221
284
  The main query method for the document store. It retrieves all documents that match the filters.
@@ -229,10 +292,54 @@ class ElasticsearchDocumentStore:
229
292
  msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
230
293
  raise ValueError(msg)
231
294
 
295
+ self._ensure_initialized()
232
296
  query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
233
297
  documents = self._search_documents(query=query)
234
298
  return documents
235
299
 
300
+ async def filter_documents_async(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
301
+ """
302
+ Asynchronously retrieves all documents that match the filters.
303
+
304
+ :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
305
+ see the official Elasticsearch
306
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
307
+ :returns: List of `Document`s that match the filters.
308
+ """
309
+ if filters and "operator" not in filters and "conditions" not in filters:
310
+ msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
311
+ raise ValueError(msg)
312
+
313
+ self._ensure_initialized()
314
+ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
315
+ documents = await self._search_documents_async(query=query)
316
+ return documents
317
+
318
+ @staticmethod
319
+ def _deserialize_document(hit: Dict[str, Any]) -> Document:
320
+ """
321
+ Creates a `Document` from the search hit provided.
322
+ This is mostly useful in self.filter_documents().
323
+ :param hit: A search hit from Elasticsearch.
324
+ :returns: `Document` created from the search hit.
325
+ """
326
+ data = hit["_source"]
327
+
328
+ if "highlight" in hit:
329
+ data["metadata"]["highlighted"] = hit["highlight"]
330
+ data["score"] = hit["_score"]
331
+
332
+ if "dataframe" in data:
333
+ dataframe = data.pop("dataframe")
334
+ if dataframe:
335
+ logger.warning(
336
+ "Document %s has the `dataframe` field set,"
337
+ "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
338
+ "The `dataframe` field will soon be removed from Haystack Document.",
339
+ data["id"],
340
+ )
341
+ return Document.from_dict(data)
342
+
236
343
  def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
237
344
  """
238
345
  Writes `Document`s to Elasticsearch.
@@ -315,40 +422,86 @@ class ElasticsearchDocumentStore:
315
422
 
316
423
  return documents_written
317
424
 
318
- @staticmethod
319
- def _deserialize_document(hit: Dict[str, Any]) -> Document:
425
+ async def write_documents_async(
426
+ self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE
427
+ ) -> int:
320
428
  """
321
- Creates a `Document` from the search hit provided.
322
-
323
- This is mostly useful in self.filter_documents().
429
+ Asynchronously writes `Document`s to Elasticsearch.
324
430
 
325
- :param hit: A search hit from Elasticsearch.
326
- :returns: `Document` created from the search hit.
431
+ :param documents: List of Documents to write to the document store.
432
+ :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
433
+ :raises ValueError: If `documents` is not a list of `Document`s.
434
+ :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
435
+ `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
436
+ :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
437
+ :returns: Number of documents written to the document store.
327
438
  """
328
- data = hit["_source"]
439
+ self._ensure_initialized()
329
440
 
330
- if "highlight" in hit:
331
- data["metadata"]["highlighted"] = hit["highlight"]
332
- data["score"] = hit["_score"]
441
+ if len(documents) > 0:
442
+ if not isinstance(documents[0], Document):
443
+ msg = "param 'documents' must contain a list of objects of type Document"
444
+ raise ValueError(msg)
333
445
 
334
- if "dataframe" in data:
335
- dataframe = data.pop("dataframe")
336
- if dataframe:
337
- logger.warning(
338
- "Document %s has the `dataframe` field set,"
339
- "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
340
- "The `dataframe` field will soon be removed from Haystack Document.",
341
- data["id"],
342
- )
343
- return Document.from_dict(data)
446
+ if policy == DuplicatePolicy.NONE:
447
+ policy = DuplicatePolicy.FAIL
448
+
449
+ actions = []
450
+ for doc in documents:
451
+ doc_dict = doc.to_dict()
452
+ if "dataframe" in doc_dict:
453
+ dataframe = doc_dict.pop("dataframe")
454
+ if dataframe:
455
+ logger.warning(
456
+ "Document {id} has the `dataframe` field set,"
457
+ "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
458
+ "The `dataframe` field will soon be removed from Haystack Document.",
459
+ )
460
+
461
+ if "sparse_embedding" in doc_dict:
462
+ sparse_embedding = doc_dict.pop("sparse_embedding", None)
463
+ if sparse_embedding:
464
+ logger.warning(
465
+ "Document %s has the `sparse_embedding` field set,"
466
+ "but storing sparse embeddings in Elasticsearch is not currently supported."
467
+ "The `sparse_embedding` field will be ignored.",
468
+ doc.id,
469
+ )
470
+
471
+ action = {
472
+ "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
473
+ "_id": doc.id,
474
+ "_source": doc_dict,
475
+ }
476
+ actions.append(action)
477
+
478
+ try:
479
+ success, failed = await helpers.async_bulk(
480
+ client=self._async_client,
481
+ actions=actions,
482
+ index=self._index,
483
+ refresh=True,
484
+ raise_on_error=False,
485
+ )
486
+ if failed:
487
+ if policy == DuplicatePolicy.FAIL:
488
+ for error in failed:
489
+ if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
490
+ msg = f"ID '{error['create']['_id']}' already exists in the document store"
491
+ raise DuplicateDocumentError(msg)
492
+ msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
493
+ raise DocumentStoreError(msg)
494
+ return success
495
+ except Exception as e:
496
+ msg = f"Failed to write documents to Elasticsearch: {e!s}"
497
+ raise DocumentStoreError(msg) from e
344
498
 
345
499
  def delete_documents(self, document_ids: List[str]) -> None:
346
500
  """
347
- Deletes all `Document`s with a matching `document_ids` from the document store.
501
+ Deletes all documents with a matching document_ids from the document store.
348
502
 
349
- :param document_ids: the object IDs to delete
503
+ :param document_ids: the document ids to delete
350
504
  """
351
-
352
505
  helpers.bulk(
353
506
  client=self.client,
354
507
  actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
@@ -357,6 +510,25 @@ class ElasticsearchDocumentStore:
357
510
  raise_on_error=False,
358
511
  )
359
512
 
513
+ async def delete_documents_async(self, document_ids: List[str]) -> None:
514
+ """
515
+ Asynchronously deletes all documents with a matching document_ids from the document store.
516
+
517
+ :param document_ids: the document ids to delete
518
+ """
519
+ self._ensure_initialized()
520
+
521
+ try:
522
+ await helpers.async_bulk(
523
+ client=self._async_client,
524
+ actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
525
+ index=self._index,
526
+ refresh=True,
527
+ )
528
+ except Exception as e:
529
+ msg = f"Failed to delete documents from Elasticsearch: {e!s}"
530
+ raise DocumentStoreError(msg) from e
531
+
360
532
  def _bm25_retrieval(
361
533
  self,
362
534
  query: str,
@@ -367,27 +539,15 @@ class ElasticsearchDocumentStore:
367
539
  scale_score: bool = False,
368
540
  ) -> List[Document]:
369
541
  """
370
- Retrieves `Document`s from Elasticsearch using the BM25 search algorithm.
371
-
372
- Even though this method is called `bm25_retrieval` it searches for `query`
373
- using the search algorithm `_client` was configured with.
374
-
375
- This method is not meant to be part of the public interface of
376
- `ElasticsearchDocumentStore` nor called directly.
377
- `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
378
-
379
- :param query: String to search in saved `Document`s' text.
380
- :param filters: Filters applied to the retrieved `Document`s, for more info
381
- see `ElasticsearchDocumentStore.filter_documents`.
382
- :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
383
- [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
384
- for valid values.
385
- :param top_k: Maximum number of `Document`s to return.
386
- :param scale_score: If `True` scales the `Document``s scores between 0 and 1.
387
- :raises ValueError: If `query` is an empty string
388
- :returns: List of `Document` that match `query`
542
+ Retrieves documents using BM25 retrieval.
543
+
544
+ :param query: The query string to search for
545
+ :param filters: Optional filters to narrow down the search space
546
+ :param fuzziness: Fuzziness parameter for the search query
547
+ :param top_k: Maximum number of documents to return
548
+ :param scale_score: Whether to scale the similarity score to the range [0,1]
549
+ :returns: List of Documents that match the query
389
550
  """
390
-
391
551
  if not query:
392
552
  msg = "query must be a non empty string"
393
553
  raise ValueError(msg)
@@ -421,35 +581,79 @@ class ElasticsearchDocumentStore:
421
581
 
422
582
  return documents
423
583
 
424
- def _embedding_retrieval(
584
+ async def _bm25_retrieval_async(
425
585
  self,
426
- query_embedding: List[float],
586
+ query: str,
427
587
  *,
428
588
  filters: Optional[Dict[str, Any]] = None,
589
+ fuzziness: str = "AUTO",
429
590
  top_k: int = 10,
430
- num_candidates: Optional[int] = None,
591
+ scale_score: bool = False,
431
592
  ) -> List[Document]:
432
593
  """
433
- Retrieves documents that are most similar to the query embedding using a vector similarity metric.
594
+ Asynchronously retrieves documents using BM25 retrieval.
595
+
596
+ :param query: The query string to search for
597
+ :param filters: Optional filters to narrow down the search space
598
+ :param fuzziness: Fuzziness parameter for the search query
599
+ :param top_k: Maximum number of documents to return
600
+ :param scale_score: Whether to scale the similarity score to the range [0,1]
601
+ :returns: List of Documents that match the query
602
+ """
603
+ self._ensure_initialized()
604
+
605
+ if not query:
606
+ msg = "query must be a non empty string"
607
+ raise ValueError(msg)
608
+
609
+ # Prepare the search body
610
+ search_body = {
611
+ "size": top_k,
612
+ "query": {
613
+ "bool": {
614
+ "must": [
615
+ {
616
+ "multi_match": {
617
+ "query": query,
618
+ "type": "most_fields",
619
+ "operator": "OR",
620
+ "fuzziness": fuzziness,
621
+ }
622
+ }
623
+ ]
624
+ }
625
+ },
626
+ }
627
+
628
+ if filters:
629
+ search_body["query"]["bool"]["filter"] = _normalize_filters(filters) # type:ignore
434
630
 
435
- It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm.
631
+ documents = await self._search_documents_async(**search_body)
436
632
 
437
- This method is not meant to be part of the public interface of
438
- `ElasticsearchDocumentStore` nor called directly.
439
- `ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
633
+ if scale_score:
634
+ for doc in documents:
635
+ if doc.score is not None:
636
+ doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))
440
637
 
441
- :param query_embedding: Embedding of the query.
442
- :param filters: Filters applied to the retrieved `Document`s.
443
- Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
444
- :param top_k: Maximum number of `Document`s to return.
445
- :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
446
- Increasing this value will improve search accuracy at the cost of slower search speeds.
447
- You can read more about it in the Elasticsearch
448
- [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
449
- :raises ValueError: If `query_embedding` is an empty list.
450
- :returns: List of `Document` that are most similar to `query_embedding`.
638
+ return documents
639
+
640
+ def _embedding_retrieval(
641
+ self,
642
+ query_embedding: List[float],
643
+ *,
644
+ filters: Optional[Dict[str, Any]] = None,
645
+ top_k: int = 10,
646
+ num_candidates: Optional[int] = None,
647
+ ) -> List[Document]:
451
648
  """
649
+ Retrieves documents using dense vector similarity search.
452
650
 
651
+ :param query_embedding: Embedding vector to search for
652
+ :param filters: Optional filters to narrow down the search space
653
+ :param top_k: Maximum number of documents to return
654
+ :param num_candidates: Number of candidates to consider in the search
655
+ :returns: List of Documents most similar to query_embedding
656
+ """
453
657
  if not query_embedding:
454
658
  msg = "query_embedding must be a non-empty list of floats"
455
659
  raise ValueError(msg)
@@ -471,3 +675,45 @@ class ElasticsearchDocumentStore:
471
675
 
472
676
  docs = self._search_documents(**body)
473
677
  return docs
678
+
679
+ async def _embedding_retrieval_async(
680
+ self,
681
+ query_embedding: List[float],
682
+ *,
683
+ filters: Optional[Dict[str, Any]] = None,
684
+ top_k: int = 10,
685
+ num_candidates: Optional[int] = None,
686
+ ) -> List[Document]:
687
+ """
688
+ Asynchronously retrieves documents using dense vector similarity search.
689
+
690
+ :param query_embedding: Embedding vector to search for
691
+ :param filters: Optional filters to narrow down the search space
692
+ :param top_k: Maximum number of documents to return
693
+ :param num_candidates: Number of candidates to consider in the search
694
+ :returns: List of Documents most similar to query_embedding
695
+ """
696
+ self._ensure_initialized()
697
+
698
+ if not query_embedding:
699
+ msg = "query_embedding must be a non-empty list of floats"
700
+ raise ValueError(msg)
701
+
702
+ # If num_candidates is not set, use top_k * 10 as default
703
+ if num_candidates is None:
704
+ num_candidates = top_k * 10
705
+
706
+ # Prepare the search body
707
+ search_body = {
708
+ "knn": {
709
+ "field": "embedding",
710
+ "query_vector": query_embedding,
711
+ "k": top_k,
712
+ "num_candidates": num_candidates,
713
+ },
714
+ }
715
+
716
+ if filters:
717
+ search_body["knn"]["filter"] = _normalize_filters(filters)
718
+
719
+ return await self._search_documents_async(**search_body)
@@ -117,3 +117,66 @@ def test_run():
117
117
  assert len(res) == 1
118
118
  assert len(res["documents"]) == 1
119
119
  assert res["documents"][0].content == "Test doc"
120
+
121
+
122
@pytest.mark.asyncio
async def test_run_async():
    """run_async forwards the default retrieval parameters to the store's async BM25 search."""
    store_mock = Mock(spec=ElasticsearchDocumentStore)
    store_mock._bm25_retrieval_async.return_value = [Document(content="test document")]
    bm25_retriever = ElasticsearchBM25Retriever(document_store=store_mock)

    result = await bm25_retriever.run_async(query="some test query")
    store_mock._bm25_retrieval_async.assert_called_once_with(
        query="some test query", filters={}, fuzziness="AUTO", top_k=10, scale_score=False
    )
    assert len(result) == 1
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "test document"
135
+
136
+
137
@pytest.mark.asyncio
async def test_run_init_params_async():
    """Parameters given at construction time are passed through to the async BM25 search."""
    store_mock = Mock(spec=ElasticsearchDocumentStore)
    store_mock._bm25_retrieval_async.return_value = [Document(content="test document")]
    bm25_retriever = ElasticsearchBM25Retriever(
        document_store=store_mock,
        filters={"some": "filter"},
        fuzziness="3",
        top_k=3,
        scale_score=True,
        filter_policy=FilterPolicy.MERGE,
    )
    result = await bm25_retriever.run_async(query="some query")
    store_mock._bm25_retrieval_async.assert_called_once_with(
        query="some query",
        filters={"some": "filter"},
        fuzziness="3",
        top_k=3,
        scale_score=True,
    )
    assert len(result) == 1
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "test document"
160
+
161
+
162
@pytest.mark.asyncio
async def test_run_time_params_async():
    """Run-time arguments override init-time filters and top_k in the async BM25 search."""
    store_mock = Mock(spec=ElasticsearchDocumentStore)
    store_mock._bm25_retrieval_async.return_value = [Document(content="test document")]
    bm25_retriever = ElasticsearchBM25Retriever(
        document_store=store_mock,
        filters={"some": "filter"},
        fuzziness="3",
        top_k=3,
        scale_score=True,
        filter_policy=FilterPolicy.MERGE,
    )

    result = await bm25_retriever.run_async(query="some query", filters={"another": "filter"}, top_k=1)
    store_mock._bm25_retrieval_async.assert_called_once_with(
        query="some query", filters={"another": "filter"}, top_k=1, fuzziness="3", scale_score=True
    )

    assert len(result) == 1
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "test document"
@@ -9,6 +9,7 @@ from unittest.mock import Mock, patch
9
9
  import pytest
10
10
  from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found]
11
11
  from haystack.dataclasses.document import Document
12
+ from haystack.dataclasses.sparse_embedding import SparseEmbedding
12
13
  from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
13
14
  from haystack.document_stores.types import DuplicatePolicy
14
15
  from haystack.testing.document_store import DocumentStoreBaseTests
@@ -25,7 +26,9 @@ def test_init_is_lazy(_mock_es_client):
25
26
 
26
27
  @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
27
28
  def test_headers_are_supported(_mock_es_client):
28
- _ = ElasticsearchDocumentStore(hosts="testhost", headers={"header1": "value1", "header2": "value2"}).client
29
+ _ = ElasticsearchDocumentStore(
30
+ hosts="http://testhost:9200", headers={"header1": "value1", "header2": "value2"}
31
+ ).client
29
32
 
30
33
  assert _mock_es_client.call_count == 1
31
34
  _, kwargs = _mock_es_client.call_args
@@ -96,6 +99,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
96
99
  )
97
100
  yield store
98
101
  store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
102
+ store.client.close()
99
103
 
100
104
  def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
101
105
  """
@@ -134,15 +138,11 @@ class TestDocumentStore(DocumentStoreBaseTests):
134
138
  def test_write_documents_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
135
139
  doc = Document(id="1", content="test")
136
140
  doc.dataframe = DataFrame({"a": [1, 2, 3]})
137
-
138
141
  document_store.write_documents([doc])
139
-
140
142
  res = document_store.filter_documents()
141
143
  assert len(res) == 1
142
-
143
144
  assert res[0].id == "1"
144
145
  assert res[0].content == "test"
145
-
146
146
  assert not hasattr(res[0], "dataframe") or res[0].dataframe is None
147
147
 
148
148
  def test_deserialize_document_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
@@ -242,16 +242,16 @@ class TestDocumentStore(DocumentStoreBaseTests):
242
242
  Test that not all terms must mandatorily match for BM25 retrieval to return a result.
243
243
  """
244
244
  documents = [
245
- Document(id=1, content="There are over 7,000 languages spoken around the world today."),
245
+ Document(id="1", content="There are over 7,000 languages spoken around the world today."),
246
246
  Document(
247
- id=2,
247
+ id="2",
248
248
  content=(
249
249
  "Elephants have been observed to behave in a way that indicates a high level of self-awareness"
250
250
  " such as recognizing themselves in mirrors."
251
251
  ),
252
252
  ),
253
253
  Document(
254
- id=3,
254
+ id="3",
255
255
  content=(
256
256
  "In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness"
257
257
  " the phenomenon of bioluminescent waves."
@@ -262,7 +262,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
262
262
 
263
263
  res = document_store._bm25_retrieval("How much self awareness do elephants have?", top_k=3)
264
264
  assert len(res) == 1
265
- assert res[0].id == 2
265
+ assert res[0].id == "2"
266
266
 
267
267
  def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore):
268
268
  docs = [
@@ -355,8 +355,172 @@ class TestDocumentStore(DocumentStoreBaseTests):
355
355
  )
356
356
  mock_elasticsearch.return_value = mock_client
357
357
 
358
- _ = ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping).client
358
+ _ = ElasticsearchDocumentStore(hosts="http://testhost:9200", custom_mapping=custom_mapping).client
359
359
  mock_client.indices.create.assert_called_once_with(
360
360
  index="default",
361
361
  mappings=custom_mapping,
362
362
  )
363
+
364
+
365
@pytest.mark.integration
class TestElasticsearchDocumentStoreAsync:
    """Integration tests for the async API of `ElasticsearchDocumentStore`.

    Requires a live Elasticsearch instance on http://localhost:9200.
    """

    @pytest.fixture
    async def document_store(self, request):
        """
        Basic fixture providing a document store instance for async tests
        """
        # NOTE(review): async generator fixture — with pytest-asyncio in strict mode this
        # may need to be declared via `pytest_asyncio.fixture`; confirm project config.
        hosts = ["http://localhost:9200"]
        # Use a different index for each test so we can run them in parallel
        index = f"{request.node.name}"

        store = ElasticsearchDocumentStore(hosts=hosts, index=index)
        yield store
        # Teardown: drop the per-test index via the sync client first, then close
        # the async client so no pending connections leak between tests.
        store.client.options(ignore_status=[400, 404]).indices.delete(index=index)

        await store.async_client.close()

    @pytest.mark.asyncio
    async def test_write_documents_async(self, document_store):
        """Writing returns the written count and FAIL policy raises on duplicates."""
        docs = [Document(id="1", content="test")]
        assert await document_store.write_documents_async(docs) == 1
        assert await document_store.count_documents_async() == 1
        with pytest.raises(DocumentStoreError):
            await document_store.write_documents_async(docs, policy=DuplicatePolicy.FAIL)

    @pytest.mark.asyncio
    async def test_count_documents_async(self, document_store):
        """count_documents_async reflects the number of documents written."""
        docs = [
            Document(content="test doc 1"),
            Document(content="test doc 2"),
            Document(content="test doc 3"),
        ]
        await document_store.write_documents_async(docs)
        assert await document_store.count_documents_async() == 3

    @pytest.mark.asyncio
    async def test_delete_documents_async(self, document_store):
        """Deleting by id removes the document from the index."""
        doc = Document(content="test doc")
        await document_store.write_documents_async([doc])
        assert await document_store.count_documents_async() == 1
        await document_store.delete_documents_async([doc.id])
        assert await document_store.count_documents_async() == 0

    @pytest.mark.asyncio
    async def test_filter_documents_async(self, document_store):
        """Metadata equality filters select only matching documents."""
        filterable_docs = [
            Document(content="1", meta={"number": -10}),
            Document(content="2", meta={"number": 100}),
        ]
        await document_store.write_documents_async(filterable_docs)
        result = await document_store.filter_documents_async(
            filters={"field": "number", "operator": "==", "value": 100}
        )
        assert len(result) == 1
        assert result[0].meta["number"] == 100

    @pytest.mark.asyncio
    async def test_bm25_retrieval_async(self, document_store):
        """Keyword search returns the lexically matching document."""
        docs = [
            Document(content="Haskell is a functional programming language"),
            Document(content="Python is an object oriented programming language"),
        ]
        await document_store.write_documents_async(docs)
        results = await document_store._bm25_retrieval_async("functional", top_k=1)
        assert len(results) == 1
        assert "functional" in results[0].content

    @pytest.mark.asyncio
    async def test_embedding_retrieval_async(self, document_store):
        """kNN search ranks by vector similarity; empty embeddings are rejected."""

        # init document store
        docs = [
            Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]),
            Document(content="Less similar document", embedding=[0.5, 0.5, 0.5, 0.5]),
        ]
        await document_store.write_documents_async(docs)

        # without num_candidates set to None
        results = await document_store._embedding_retrieval_async(query_embedding=[1.0, 1.0, 1.0, 1.0], top_k=1)
        assert len(results) == 1
        assert results[0].content == "Most similar document"

        # with num_candidates not None
        results = await document_store._embedding_retrieval_async(
            query_embedding=[1.0, 1.0, 1.0, 1.0], top_k=2, num_candidates=2
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document"

        # with an embedding containing None
        with pytest.raises(ValueError, match="query_embedding must be a non-empty list of floats"):
            _ = await document_store._embedding_retrieval_async(query_embedding=None, top_k=2)

    @pytest.mark.asyncio
    async def test_bm25_retrieval_async_with_filters(self, document_store):
        """BM25 search honors metadata filters; scale_score maps scores into [0, 1]."""
        docs = [
            Document(content="Haskell is a functional programming language", meta={"type": "functional"}),
            Document(content="Python is an object oriented programming language", meta={"type": "oop"}),
        ]
        await document_store.write_documents_async(docs)
        results = await document_store._bm25_retrieval_async(
            "programming", filters={"field": "type", "operator": "==", "value": "functional"}, top_k=1
        )
        assert len(results) == 1
        assert "functional" in results[0].content

        # test with scale_score=True
        results = await document_store._bm25_retrieval_async(
            "programming", filters={"field": "type", "operator": "==", "value": "functional"}, top_k=1, scale_score=True
        )
        assert len(results) == 1
        assert "functional" in results[0].content
        assert 0 <= results[0].score <= 1  # score should be between 0 and 1

    @pytest.mark.asyncio
    async def test_embedding_retrieval_async_with_filters(self, document_store):
        """kNN search honors metadata filters during the approximate search."""
        docs = [
            Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0], meta={"type": "similar"}),
            Document(content="Less similar document", embedding=[0.5, 0.5, 0.5, 0.5], meta={"type": "different"}),
        ]
        await document_store.write_documents_async(docs)
        results = await document_store._embedding_retrieval_async(
            query_embedding=[1.0, 1.0, 1.0, 1.0],
            filters={"field": "type", "operator": "==", "value": "similar"},
            top_k=1,
        )
        assert len(results) == 1
        assert results[0].content == "Most similar document"

    @pytest.mark.asyncio
    async def test_write_documents_async_invalid_document_type(self, document_store):
        """Test write_documents with invalid document type"""
        invalid_docs = [{"id": "1", "content": "test"}]  # Dictionary instead of Document object
        with pytest.raises(ValueError, match="param 'documents' must contain a list of objects of type Document"):
            await document_store.write_documents_async(invalid_docs)

    @pytest.mark.asyncio
    async def test_write_documents_async_with_dataframe_warning(self, document_store, caplog):
        """Test write_documents with document containing dataframe field"""
        doc = Document(id="1", content="test", dataframe=DataFrame({"col": [1, 2, 3]}))

        await document_store.write_documents_async([doc])
        # The store logs a deprecation-style warning and drops the dataframe payload.
        assert "ElasticsearchDocumentStore no longer supports dataframes" in caplog.text

        results = await document_store.filter_documents_async()
        assert len(results) == 1
        assert results[0].id == "1"
        assert not hasattr(results[0], "dataframe") or results[0].dataframe is None

    @pytest.mark.asyncio
    async def test_write_documents_async_with_sparse_embedding_warning(self, document_store, caplog):
        """Test write_documents with document containing sparse_embedding field"""
        doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5]))

        await document_store.write_documents_async([doc])
        # Sparse embeddings are warned about and not persisted.
        assert "but storing sparse embeddings in Elasticsearch is not currently supported." in caplog.text

        results = await document_store.filter_documents_async()
        assert len(results) == 1
        assert results[0].id == "1"
        assert not hasattr(results[0], "sparse_embedding") or results[0].sparse_embedding is None
@@ -113,3 +113,68 @@ def test_run():
113
113
  assert len(res["documents"]) == 1
114
114
  assert res["documents"][0].content == "Test doc"
115
115
  assert res["documents"][0].embedding == [0.1, 0.2]
116
+
117
+
118
@pytest.mark.asyncio
async def test_run_async():
    """run_async forwards the default parameters to the store's async embedding retrieval."""
    store_mock = Mock(spec=ElasticsearchDocumentStore)
    store_mock._embedding_retrieval_async.return_value = [Document(content="test document", embedding=[0.1, 0.2])]
    embedding_retriever = ElasticsearchEmbeddingRetriever(document_store=store_mock)
    result = await embedding_retriever.run_async(query_embedding=[0.5, 0.7])
    store_mock._embedding_retrieval_async.assert_called_once_with(
        query_embedding=[0.5, 0.7],
        filters={},
        top_k=10,
        num_candidates=None,
    )
    assert len(result) == 1
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "test document"
    assert result["documents"][0].embedding == [0.1, 0.2]
134
+
135
+
136
@pytest.mark.asyncio
async def test_run_init_params_async():
    """Construction-time parameters are passed through to the async embedding retrieval."""
    store_mock = Mock(spec=ElasticsearchDocumentStore)
    store_mock._embedding_retrieval_async.return_value = [Document(content="test document", embedding=[0.1, 0.2])]
    embedding_retriever = ElasticsearchEmbeddingRetriever(
        document_store=store_mock,
        filters={"some": "filter"},
        top_k=3,
        num_candidates=30,
        filter_policy=FilterPolicy.MERGE,
    )
    result = await embedding_retriever.run_async(query_embedding=[0.5, 0.7])
    store_mock._embedding_retrieval_async.assert_called_once_with(
        query_embedding=[0.5, 0.7],
        filters={"some": "filter"},
        top_k=3,
        num_candidates=30,
    )
    assert len(result) == 1
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "test document"
    assert result["documents"][0].embedding == [0.1, 0.2]
158
+
159
+
160
@pytest.mark.asyncio
async def test_run_time_params_async():
    """Run-time arguments override init-time filters and top_k in the async embedding retrieval."""
    store_mock = Mock(spec=ElasticsearchDocumentStore)
    store_mock._embedding_retrieval_async.return_value = [Document(content="test document", embedding=[0.1, 0.2])]
    embedding_retriever = ElasticsearchEmbeddingRetriever(
        document_store=store_mock,
        filters={"some": "filter"},
        top_k=3,
        num_candidates=30,
        filter_policy=FilterPolicy.MERGE,
    )

    result = await embedding_retriever.run_async(query_embedding=[0.5, 0.7], filters={"another": "filter"}, top_k=1)
    store_mock._embedding_retrieval_async.assert_called_once_with(
        query_embedding=[0.5, 0.7], filters={"another": "filter"}, top_k=1, num_candidates=30
    )

    assert len(result) == 1
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "test document"
    assert result["documents"][0].embedding == [0.1, 0.2]