elasticsearch-haystack 2.0.0__tar.gz → 2.1.0__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/CHANGELOG.md +68 -14
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/PKG-INFO +2 -3
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/pyproject.toml +6 -3
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +24 -1
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +30 -4
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +321 -75
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_bm25_retriever.py +63 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_document_store.py +174 -10
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_embedding_retriever.py +65 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/.gitignore +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/LICENSE +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/README.md +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/pydoc/config.yml +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_filters.py +0 -0
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/CHANGELOG.md RENAMED

```diff
@@ -1,12 +1,27 @@
 # Changelog
 
+## [integrations/elasticsearch-v2.0.0] - 2025-02-14
+
+### 🧹 Chores
+
+- Inherit from `FilterDocumentsTestWithDataframe` in Document Stores (#1290)
+- [**breaking**] Elasticsearch - remove dataframe support (#1377)
+
+
 ## [integrations/elasticsearch-v1.0.1] - 2024-10-28
 
-### ⚙️
+### ⚙️ CI
+
+- Adopt uv as installer (#1142)
+
+### 🧹 Chores
 
 - Update changelog after removing legacy filters (#1083)
 - Update ruff linting scripts and settings (#1105)
-
+
+### 🌀 Miscellaneous
+
+- Fix: Elasticsearch - allow passing headers (#1156)
 
 ## [integrations/elasticsearch-v1.0.0] - 2024-09-12
 
@@ -23,18 +38,32 @@
 
 - Do not retry tests in `hatch run test` command (#954)
 
-### ⚙️
+### ⚙️ CI
 
 - Retry tests to reduce flakyness (#836)
+
+### 🧹 Chores
+
 - Update ruff invocation to include check parameter (#853)
 - ElasticSearch - remove legacy filters elasticsearch (#1078)
 
+### 🌀 Miscellaneous
+
+- Ci: install `pytest-rerunfailures` where needed; add retry config to `test-cov` script (#845)
+- Chore: Minor retriever pydoc fix (#884)
+- Chore: elasticsearch - ruff update, don't ruff tests (#999)
+
 ## [integrations/elasticsearch-v0.5.0] - 2024-05-24
 
 ### 🐛 Bug Fixes
 
 - Add support for custom mapping in ElasticsearchDocumentStore (#721)
 
+### 🌀 Miscellaneous
+
+- Chore: add license classifiers (#680)
+- Chore: change the pydoc renderer class (#718)
+
 ## [integrations/elasticsearch-v0.4.0] - 2024-04-03
 
 ### 📚 Documentation
@@ -43,49 +72,64 @@
 - Review Elastic (#541)
 - Disable-class-def (#556)
 
+### 🌀 Miscellaneous
+
+- Make tests show coverage (#566)
+- Refactor tests (#574)
+- Remove references to Python 3.7 (#601)
+- Make Document Stores initially skip `SparseEmbedding` (#606)
+- [Elasticsearch] fix: Filters not working with metadata that contain a space or capitalization (#639)
+
 ## [integrations/elasticsearch-v0.3.0] - 2024-02-23
 
 ### 🐛 Bug Fixes
 
 - Fix order of API docs (#447)
 
-This PR will also push the docs to Readme
-
 ### 📚 Documentation
 
 - Update category slug (#442)
 
-###
+### 🌀 Miscellaneous
 
+- Generate api docs (#322)
+- Add filters to run function in retrievers of elasticsearch (#440)
 - Add user-agent header (#457)
 
-
+## [integrations/elasticsearch-v0.2.0] - 2024-01-19
 
-
+### 🌀 Miscellaneous
 
-
+- Mount import paths under haystack_integrations (#244)
 
--
+## [integrations/elasticsearch-v0.1.3] - 2024-01-18
 
-
+### 🌀 Miscellaneous
 
-
+- Added top_k argument in the run function of ElasticSearcBM25Retriever (#130)
+- Add more docstrings for `ElasticsearchDocumentStore` and `ElasticsearchBM25Retriever` (#184)
+- Elastic - update imports for beta5 (#238)
 
 ## [integrations/elasticsearch-v0.1.2] - 2023-12-20
 
 ### 🐛 Bug Fixes
 
-- Fix project
+- Fix project URLs (#96)
 
 ### 🚜 Refactor
 
 - Use `hatch_vcs` to manage integrations versioning (#103)
 
+### 🌀 Miscellaneous
+
+- Update elasticsearch test badge (#79)
+- [Elasticsearch] - BM25 retrieval: not all terms must mandatorily match (#125)
+
 ## [integrations/elasticsearch-v0.1.1] - 2023-12-05
 
 ### 🐛 Bug Fixes
 
--
+- Document Stores: fix protocol import (#77)
 
 ## [integrations/elasticsearch-v0.1.0] - 2023-12-04
 
@@ -93,6 +137,16 @@ This PR will also push the docs to Readme
 
 - Fix license headers
 
+### 🌀 Miscellaneous
+
+- Remove Document Store decorator (#76)
+
 ## [integrations/elasticsearch-v0.0.2] - 2023-11-29
 
+### 🌀 Miscellaneous
+
+- Reorganize repository (#62)
+- Update `ElasticSearchDocumentStore` to use latest `haystack-ai` version (#63)
+- Bump elasticsearch_haystack to 0.0.2
+
 <!-- generated by git-cliff -->
```
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/PKG-INFO RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: elasticsearch-haystack
-Version: 2.0.0
+Version: 2.1.0
 Summary: Haystack 2.x Document Store for ElasticSearch
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -11,13 +11,12 @@ License-File: LICENSE
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Requires-Dist: elasticsearch<9,>=8
 Requires-Dist: haystack-ai
 Description-Content-Type: text/markdown
```
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/pyproject.toml RENAMED

```diff
@@ -7,7 +7,7 @@ name = "elasticsearch-haystack"
 dynamic = ["version"]
 description = 'Haystack 2.x Document Store for ElasticSearch'
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = "Apache-2.0"
 keywords = []
 authors = [{ name = "Silvano Cerza", email = "silvanocerza@gmail.com" }]
@@ -15,7 +15,6 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
   "Development Status :: 4 - Beta",
   "Programming Language :: Python",
-  "Programming Language :: Python :: 3.8",
   "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
@@ -45,6 +44,7 @@ installer = "uv"
 dependencies = [
   "coverage[toml]>=6.5",
   "pytest",
+  "pytest-asyncio",
   "pytest-rerunfailures",
   "pytest-xdist",
   "haystack-pydoc-tools",
@@ -59,12 +59,13 @@ cov-retry = ["test-cov-retry", "cov-report"]
 docs = ["pydoc-markdown pydoc/config.yml"]
 
 [[tool.hatch.envs.all.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11"]
+python = [ "3.9", "3.10", "3.11"]
 
 [tool.hatch.envs.lint]
 installer = "uv"
 detached = true
 dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
+
 [tool.hatch.envs.lint.scripts]
 typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
 style = ["ruff check {args:}", "black --check --diff {args:.}"]
@@ -157,6 +158,8 @@ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
 [tool.pytest.ini_options]
 minversion = "6.0"
 markers = ["unit: unit tests", "integration: integration tests"]
+asyncio_mode = "auto"
+asyncio_default_fixture_loop_scope = "class"
 
 [[tool.mypy.overrides]]
 module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
```
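The new `asyncio_mode = "auto"` setting configures pytest-asyncio for the async test suites added below: in auto mode, any bare `async def` test is collected and run on an event loop, so the explicit `@pytest.mark.asyncio` decorators in the diffed tests are redundant but harmless. A minimal sketch of what this enables — the index name and a local Elasticsearch at `http://localhost:9200` are assumptions for illustration, not part of the package:

```python
# Hypothetical test module; assumes a reachable Elasticsearch at http://localhost:9200.
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


# With asyncio_mode = "auto", this bare `async def` test runs on an event loop
# without an explicit @pytest.mark.asyncio marker.
async def test_fresh_index_is_empty():
    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="scratch-index")
    # The first store operation lazily creates the (empty) index.
    assert await store.count_documents_async() == 0
    await store.async_client.close()
```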
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py RENAMED

```diff
@@ -120,7 +120,7 @@ class ElasticsearchBM25Retriever:
         """
         Retrieve documents using the BM25 keyword-based algorithm.
 
-        :param query: String to search in `Document`s
+        :param query: String to search in the `Document`s text.
         :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
                         the `filter_policy` chosen at retriever initialization. See init method docstring for more
                         details.
@@ -137,3 +137,26 @@ class ElasticsearchBM25Retriever:
             scale_score=self._scale_score,
         )
         return {"documents": docs}
+
+    @component.output_types(documents=List[Document])
+    async def run_async(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
+        """
+        Asynchronously retrieve documents using the BM25 keyword-based algorithm.
+
+        :param query: String to search in the `Document` text.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
+                        details.
+        :param top_k: Maximum number of `Document` to return.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of `Document`s that match the query.
+        """
+        filters = apply_filter_policy(self._filter_policy, self._filters, filters)
+        docs = await self._document_store._bm25_retrieval_async(
+            query=query,
+            filters=filters,
+            fuzziness=self._fuzziness,
+            top_k=top_k or self._top_k,
+            scale_score=self._scale_score,
+        )
+        return {"documents": docs}
```
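The new `run_async` mirrors the synchronous `run` exactly — same filter-policy handling and same defaults — but awaits the document store's async BM25 search instead of blocking. A minimal usage sketch; the host, index name, and stored content are assumptions for illustration:

```python
import asyncio

from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


async def main():
    # Assumed local Elasticsearch and a pre-populated index.
    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my-index")
    retriever = ElasticsearchBM25Retriever(document_store=store, fuzziness="AUTO", top_k=5)

    # Awaits the store's _bm25_retrieval_async under the hood.
    result = await retriever.run_async(query="functional programming")
    for doc in result["documents"]:
        print(doc.score, doc.content)

    await store.async_client.close()


asyncio.run(main())
```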
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py RENAMED

```diff
@@ -119,10 +119,11 @@ class ElasticsearchEmbeddingRetriever:
         Retrieve documents using a vector similarity metric.
 
         :param query_embedding: Embedding of the query.
-        :param filters: Filters applied
-
-
-
+        :param filters: Filters applied when fetching documents from the Document Store.
+            Filters are applied during the approximate kNN search to ensure the Retriever returns
+            `top_k` matching documents.
+            The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
+        :param top_k: Maximum number of documents to return.
         :returns: A dictionary with the following keys:
             - `documents`: List of `Document`s most similar to the given `query_embedding`
         """
@@ -134,3 +135,28 @@ class ElasticsearchEmbeddingRetriever:
             num_candidates=self._num_candidates,
         )
         return {"documents": docs}
+
+    @component.output_types(documents=List[Document])
+    async def run_async(
+        self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
+    ):
+        """
+        Asynchronously retrieve documents using a vector similarity metric.
+
+        :param query_embedding: Embedding of the query.
+        :param filters: Filters applied when fetching documents from the Document Store.
+            Filters are applied during the approximate kNN search to ensure the Retriever returns
+            `top_k` matching documents.
+            The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
+        :param top_k: Maximum number of documents to return.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of `Document`s that match the query.
+        """
+        filters = apply_filter_policy(self._filter_policy, self._filters, filters)
+        docs = await self._document_store._embedding_retrieval_async(
+            query_embedding=query_embedding,
+            filters=filters,
+            top_k=top_k or self._top_k,
+            num_candidates=self._num_candidates,
+        )
+        return {"documents": docs}
```
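The embedding retriever gets the same treatment: `run_async` applies the configured filter policy and then awaits the store's async kNN search. A sketch with an assumed 4-dimensional embedding space and an illustrative metadata filter (neither is from the source):

```python
import asyncio

from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


async def main():
    # Assumed local Elasticsearch; documents are expected to carry embeddings.
    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my-index")
    retriever = ElasticsearchEmbeddingRetriever(document_store=store, top_k=2)

    # Runtime filters are combined with init-time filters according to filter_policy.
    result = await retriever.run_async(
        query_embedding=[0.1, 0.4, 0.2, 0.3],
        filters={"field": "type", "operator": "==", "value": "article"},
    )
    print([doc.id for doc in result["documents"]])

    await store.async_client.close()


asyncio.run(main())
```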
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py RENAMED

```diff
@@ -2,7 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+from collections.abc import Mapping
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import numpy as np
 
@@ -14,7 +15,7 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.version import __version__ as haystack_version
 
-from elasticsearch import Elasticsearch, helpers  # type: ignore[import-not-found]
+from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers  # type: ignore[import-not-found]
 
 from .filters import _normalize_filters
 
@@ -30,6 +31,7 @@ Hosts = Union[str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]]]
 # Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly
 # all be mapped to scores ~1.
 BM25_SCALING_FACTOR = 8
+DOC_ALREADY_EXISTS = 409
 
 
 class ElasticsearchDocumentStore:
@@ -93,28 +95,39 @@ class ElasticsearchDocumentStore:
         """
         self._hosts = hosts
         self._client = None
+        self._async_client = None
         self._index = index
         self._embedding_similarity_function = embedding_similarity_function
         self._custom_mapping = custom_mapping
         self._kwargs = kwargs
+        self._initialized = False
 
         if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
             msg = "custom_mapping must be a dictionary"
             raise ValueError(msg)
 
-
-
-
+    def _ensure_initialized(self):
+        """
+        Ensures both sync and async clients are initialized and the index exists.
+        """
+        if not self._initialized:
             headers = self._kwargs.pop("headers", {})
             headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
 
-
+            # Initialize both sync and async clients
+            self._client = Elasticsearch(
+                self._hosts,
+                headers=headers,
+                **self._kwargs,
+            )
+            self._async_client = AsyncElasticsearch(
                 self._hosts,
                 headers=headers,
                 **self._kwargs,
             )
+
             # Check client connection, this will raise if not connected
-
+            self._client.info()
 
             if self._custom_mapping:
                 mappings = self._custom_mapping
@@ -143,13 +156,27 @@ class ElasticsearchDocumentStore:
             }
 
             # Create the index if it doesn't exist
-            if not
-
+            if not self._client.indices.exists(index=self._index):
+                self._client.indices.create(index=self._index, mappings=mappings)
 
-            self.
+        self._initialized = True
 
+    @property
+    def client(self) -> Elasticsearch:
+        """
+        Returns the synchronous Elasticsearch client, initializing it if necessary.
+        """
+        self._ensure_initialized()
         return self._client
 
+    @property
+    def async_client(self) -> AsyncElasticsearch:
+        """
+        Returns the asynchronous Elasticsearch client, initializing it if necessary.
+        """
+        self._ensure_initialized()
+        return self._async_client
+
     def to_dict(self) -> Dict[str, Any]:
         """
         Serializes the component to a dictionary.
```
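This refactor makes initialization lazy: the constructor only stores configuration, and `_ensure_initialized()` creates both clients, verifies the connection, and creates the index on first use. A short sketch of the observable behavior (host and index names are assumptions):

```python
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

# No I/O happens here: both clients stay None until first use.
store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my-index")

# First access of the property runs _ensure_initialized(): the sync and async
# clients are created, the connection is verified via info(), and the index is
# created if it does not exist yet.
es = store.client
print(es.info()["version"]["number"])
```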
The diff of `document_store.py` continues:

```diff
@@ -184,15 +211,26 @@ class ElasticsearchDocumentStore:
     def count_documents(self) -> int:
         """
         Returns how many documents are present in the document store.
-
+
+        :returns:
+            Number of documents in the document store.
         """
+        self._ensure_initialized()
         return self.client.count(index=self._index)["count"]
 
+    async def count_documents_async(self) -> int:
+        """
+        Asynchronously returns how many documents are present in the document store.
+        :returns: Number of documents in the document store.
+        """
+        self._ensure_initialized()
+        result = await self._async_client.count(index=self._index)  # type: ignore
+        return result["count"]
+
     def _search_documents(self, **kwargs) -> List[Document]:
         """
         Calls the Elasticsearch client's search method and handles pagination.
         """
-
         top_k = kwargs.get("size")
         if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
             top_k = kwargs["knn"]["k"]
@@ -207,7 +245,7 @@
                 **kwargs,
             )
 
-            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
+            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])  # type: ignore
             from_ = len(documents)
 
             if top_k is not None and from_ >= top_k:
@@ -216,6 +254,31 @@
                 break
         return documents
 
+    async def _search_documents_async(self, **kwargs) -> List[Document]:
+        """
+        Asynchronously calls the Elasticsearch client's search method and handles pagination.
+        """
+        top_k = kwargs.get("size")
+        if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
+            top_k = kwargs["knn"]["k"]
+
+        documents: List[Document] = []
+        from_ = 0
+
+        # handle pagination
+        while True:
+            res = await self._async_client.search(index=self._index, from_=from_, **kwargs)  # type: ignore
+            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])  # type: ignore
+            from_ = len(documents)
+
+            if top_k is not None and from_ >= top_k:
+                break
+
+            if from_ >= res["hits"]["total"]["value"]:
+                break
+
+        return documents
+
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
         """
         The main query method for the document store. It retrieves all documents that match the filters.
```
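`count_documents_async` and `_search_documents_async` are direct async ports of their sync counterparts, including the offset-based pagination loop. A quick sketch of counting from a coroutine (host and index are assumptions):

```python
import asyncio

from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


async def main():
    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my-index")
    print("documents:", await store.count_documents_async())
    # _ensure_initialized created both clients, so close both when done.
    store.client.close()
    await store.async_client.close()


asyncio.run(main())
```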
```diff
@@ -229,10 +292,54 @@
             msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
             raise ValueError(msg)
 
+        self._ensure_initialized()
         query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
         documents = self._search_documents(query=query)
         return documents
 
+    async def filter_documents_async(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """
+        Asynchronously retrieves all documents that match the filters.
+
+        :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
+            see the official Elasticsearch
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
+        :returns: List of `Document`s that match the filters.
+        """
+        if filters and "operator" not in filters and "conditions" not in filters:
+            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
+            raise ValueError(msg)
+
+        self._ensure_initialized()
+        query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
+        documents = await self._search_documents_async(query=query)
+        return documents
+
+    @staticmethod
+    def _deserialize_document(hit: Dict[str, Any]) -> Document:
+        """
+        Creates a `Document` from the search hit provided.
+        This is mostly useful in self.filter_documents().
+        :param hit: A search hit from Elasticsearch.
+        :returns: `Document` created from the search hit.
+        """
+        data = hit["_source"]
+
+        if "highlight" in hit:
+            data["metadata"]["highlighted"] = hit["highlight"]
+        data["score"] = hit["_score"]
+
+        if "dataframe" in data:
+            dataframe = data.pop("dataframe")
+            if dataframe:
+                logger.warning(
+                    "Document %s has the `dataframe` field set,"
+                    "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                    "The `dataframe` field will soon be removed from Haystack Document.",
+                    data["id"],
+                )
+        return Document.from_dict(data)
+
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
         """
         Writes `Document`s to Elasticsearch.
```
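`filter_documents_async` validates the same Haystack filter shape as the sync method: either a single comparison or a top-level `operator` with `conditions`; anything else raises `ValueError`. A hedged sketch — field names and values are illustrative:

```python
import asyncio

from haystack.dataclasses.document import Document
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


async def main():
    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my-index")
    await store.write_documents_async(
        [
            Document(content="doc one", meta={"lang": "en", "year": 2023}),
            Document(content="doc two", meta={"lang": "de", "year": 2024}),
        ]
    )
    # A filter dict without "operator"/"conditions" keys raises ValueError.
    docs = await store.filter_documents_async(
        filters={
            "operator": "AND",
            "conditions": [
                {"field": "lang", "operator": "==", "value": "en"},
                {"field": "year", "operator": ">=", "value": 2023},
            ],
        }
    )
    print([d.content for d in docs])
    await store.async_client.close()


asyncio.run(main())
```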
```diff
@@ -315,40 +422,86 @@
 
         return documents_written
 
-
-
+    async def write_documents_async(
+        self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE
+    ) -> int:
         """
-
-
-        This is mostly useful in self.filter_documents().
+        Asynchronously writes `Document`s to Elasticsearch.
 
-        :param
-        :
+        :param documents: List of Documents to write to the document store.
+        :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
+        :raises ValueError: If `documents` is not a list of `Document`s.
+        :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
+            `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
+        :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
+        :returns: Number of documents written to the document store.
         """
-
+        self._ensure_initialized()
 
-        if
-
-
+        if len(documents) > 0:
+            if not isinstance(documents[0], Document):
+                msg = "param 'documents' must contain a list of objects of type Document"
+                raise ValueError(msg)
 
-        if
-
-
-
-
-
-
-
-
-
+        if policy == DuplicatePolicy.NONE:
+            policy = DuplicatePolicy.FAIL
+
+        actions = []
+        for doc in documents:
+            doc_dict = doc.to_dict()
+            if "dataframe" in doc_dict:
+                dataframe = doc_dict.pop("dataframe")
+                if dataframe:
+                    logger.warning(
+                        "Document {id} has the `dataframe` field set,"
+                        "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                        "The `dataframe` field will soon be removed from Haystack Document.",
+                    )
+
+            if "sparse_embedding" in doc_dict:
+                sparse_embedding = doc_dict.pop("sparse_embedding", None)
+                if sparse_embedding:
+                    logger.warning(
+                        "Document %s has the `sparse_embedding` field set,"
+                        "but storing sparse embeddings in Elasticsearch is not currently supported."
+                        "The `sparse_embedding` field will be ignored.",
+                        doc.id,
+                    )
+
+            action = {
+                "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
+                "_id": doc.id,
+                "_source": doc_dict,
+            }
+            actions.append(action)
+
+        try:
+            success, failed = await helpers.async_bulk(
+                client=self._async_client,
+                actions=actions,
+                index=self._index,
+                refresh=True,
+                raise_on_error=False,
+            )
+            if failed:
+                if policy == DuplicatePolicy.FAIL:
+                    for error in failed:
+                        if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
+                            msg = f"ID '{error['create']['_id']}' already exists in the document store"
+                            raise DuplicateDocumentError(msg)
+                msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
+                raise DocumentStoreError(msg)
+            return success
+        except Exception as e:
+            msg = f"Failed to write documents to Elasticsearch: {e!s}"
+            raise DocumentStoreError(msg) from e
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
-        Deletes all
+        Deletes all documents with a matching document_ids from the document store.
 
-        :param document_ids: the
+        :param document_ids: the document ids to delete
         """
-
         helpers.bulk(
             client=self.client,
             actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
@@ -357,6 +510,25 @@
             raise_on_error=False,
         )
 
+    async def delete_documents_async(self, document_ids: List[str]) -> None:
+        """
+        Asynchronously deletes all documents with a matching document_ids from the document store.
+
+        :param document_ids: the document ids to delete
+        """
+        self._ensure_initialized()
+
+        try:
+            await helpers.async_bulk(
+                client=self._async_client,
+                actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
+                index=self._index,
+                refresh=True,
+            )
+        except Exception as e:
+            msg = f"Failed to delete documents from Elasticsearch: {e!s}"
+            raise DocumentStoreError(msg) from e
+
     def _bm25_retrieval(
         self,
         query: str,
```
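`write_documents_async` upgrades `DuplicatePolicy.NONE` to `FAIL` and maps the policy onto the bulk `_op_type` (`create` vs `index`), raising `DuplicateDocumentError` when Elasticsearch reports a 409 for a `create` action. A usage sketch — host, index, and document are assumptions:

```python
import asyncio

from haystack.dataclasses.document import Document
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore


async def main():
    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my-index")
    doc = Document(id="1", content="hello")

    await store.write_documents_async([doc])
    # Rewriting the same ID raises DuplicateDocumentError unless OVERWRITE is
    # requested, since NONE is treated as FAIL.
    await store.write_documents_async([doc], policy=DuplicatePolicy.OVERWRITE)

    await store.delete_documents_async(["1"])
    await store.async_client.close()


asyncio.run(main())
```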
```diff
@@ -367,27 +539,15 @@
         scale_score: bool = False,
     ) -> List[Document]:
         """
-        Retrieves
-
-
-
-
-
-
-
-
-        :param query: String to search in saved `Document`s' text.
-        :param filters: Filters applied to the retrieved `Document`s, for more info
-            see `ElasticsearchDocumentStore.filter_documents`.
-        :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
-            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
-            for valid values.
-        :param top_k: Maximum number of `Document`s to return.
-        :param scale_score: If `True` scales the `Document``s scores between 0 and 1.
-        :raises ValueError: If `query` is an empty string
-        :returns: List of `Document` that match `query`
+        Retrieves documents using BM25 retrieval.
+
+        :param query: The query string to search for
+        :param filters: Optional filters to narrow down the search space
+        :param fuzziness: Fuzziness parameter for the search query
+        :param top_k: Maximum number of documents to return
+        :param scale_score: Whether to scale the similarity score to the range [0,1]
+        :returns: List of Documents that match the query
         """
-
         if not query:
             msg = "query must be a non empty string"
             raise ValueError(msg)
@@ -421,35 +581,79 @@
 
         return documents
 
-    def
+    async def _bm25_retrieval_async(
         self,
-
+        query: str,
         *,
         filters: Optional[Dict[str, Any]] = None,
+        fuzziness: str = "AUTO",
         top_k: int = 10,
-
+        scale_score: bool = False,
     ) -> List[Document]:
         """
-
+        Asynchronously retrieves documents using BM25 retrieval.
+
+        :param query: The query string to search for
+        :param filters: Optional filters to narrow down the search space
+        :param fuzziness: Fuzziness parameter for the search query
+        :param top_k: Maximum number of documents to return
+        :param scale_score: Whether to scale the similarity score to the range [0,1]
+        :returns: List of Documents that match the query
+        """
+        self._ensure_initialized()
+
+        if not query:
+            msg = "query must be a non empty string"
+            raise ValueError(msg)
+
+        # Prepare the search body
+        search_body = {
+            "size": top_k,
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "multi_match": {
+                                "query": query,
+                                "type": "most_fields",
+                                "operator": "OR",
+                                "fuzziness": fuzziness,
+                            }
+                        }
+                    ]
+                }
+            },
+        }
+
+        if filters:
+            search_body["query"]["bool"]["filter"] = _normalize_filters(filters)  # type:ignore
 
-
+        documents = await self._search_documents_async(**search_body)
 
-
-
-
+        if scale_score:
+            for doc in documents:
+                if doc.score is not None:
+                    doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))
 
-
-
-
-
-        :
-
-
-
-        :
-
+        return documents
+
+    def _embedding_retrieval(
+        self,
+        query_embedding: List[float],
+        *,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10,
+        num_candidates: Optional[int] = None,
+    ) -> List[Document]:
         """
+        Retrieves documents using dense vector similarity search.
 
+        :param query_embedding: Embedding vector to search for
+        :param filters: Optional filters to narrow down the search space
+        :param top_k: Maximum number of documents to return
+        :param num_candidates: Number of candidates to consider in the search
+        :returns: List of Documents most similar to query_embedding
+        """
         if not query_embedding:
             msg = "query_embedding must be a non-empty list of floats"
             raise ValueError(msg)
@@ -471,3 +675,45 @@
 
         docs = self._search_documents(**body)
         return docs
+
+    async def _embedding_retrieval_async(
+        self,
+        query_embedding: List[float],
+        *,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10,
+        num_candidates: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously retrieves documents using dense vector similarity search.
+
+        :param query_embedding: Embedding vector to search for
+        :param filters: Optional filters to narrow down the search space
+        :param top_k: Maximum number of documents to return
+        :param num_candidates: Number of candidates to consider in the search
+        :returns: List of Documents most similar to query_embedding
+        """
+        self._ensure_initialized()
+
+        if not query_embedding:
+            msg = "query_embedding must be a non-empty list of floats"
+            raise ValueError(msg)
+
+        # If num_candidates is not set, use top_k * 10 as default
+        if num_candidates is None:
+            num_candidates = top_k * 10
+
+        # Prepare the search body
+        search_body = {
+            "knn": {
+                "field": "embedding",
+                "query_vector": query_embedding,
+                "k": top_k,
+                "num_candidates": num_candidates,
+            },
+        }
+
+        if filters:
+            search_body["knn"]["filter"] = _normalize_filters(filters)
+
+        return await self._search_documents_async(**search_body)
```
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_bm25_retriever.py RENAMED

```diff
@@ -117,3 +117,66 @@ def test_run():
     assert len(res) == 1
     assert len(res["documents"]) == 1
     assert res["documents"][0].content == "Test doc"
+
+
+@pytest.mark.asyncio
+async def test_run_async():
+    mock_store = Mock(spec=ElasticsearchDocumentStore)
+    mock_store._bm25_retrieval_async.return_value = [Document(content="test document")]
+    retriever = ElasticsearchBM25Retriever(document_store=mock_store)
+
+    res = await retriever.run_async(query="some test query")
+    mock_store._bm25_retrieval_async.assert_called_once_with(
+        query="some test query", filters={}, fuzziness="AUTO", top_k=10, scale_score=False
+    )
+    assert len(res) == 1
+    assert len(res["documents"]) == 1
+    assert res["documents"][0].content == "test document"
+
+
+@pytest.mark.asyncio
+async def test_run_init_params_async():
+    mock_store = Mock(spec=ElasticsearchDocumentStore)
+    mock_store._bm25_retrieval_async.return_value = [Document(content="test document")]
+    retriever = ElasticsearchBM25Retriever(
+        document_store=mock_store,
+        filters={"some": "filter"},
+        fuzziness="3",
+        top_k=3,
+        scale_score=True,
+        filter_policy=FilterPolicy.MERGE,
+    )
+    res = await retriever.run_async(query="some query")
+    mock_store._bm25_retrieval_async.assert_called_once_with(
+        query="some query",
+        filters={"some": "filter"},
+        fuzziness="3",
+        top_k=3,
+        scale_score=True,
+    )
+    assert len(res) == 1
+    assert len(res["documents"]) == 1
+    assert res["documents"][0].content == "test document"
+
+
+@pytest.mark.asyncio
+async def test_run_time_params_async():
+    mock_store = Mock(spec=ElasticsearchDocumentStore)
+    mock_store._bm25_retrieval_async.return_value = [Document(content="test document")]
+    retriever = ElasticsearchBM25Retriever(
+        document_store=mock_store,
+        filters={"some": "filter"},
+        fuzziness="3",
+        top_k=3,
+        scale_score=True,
+        filter_policy=FilterPolicy.MERGE,
+    )
+
+    res = await retriever.run_async(query="some query", filters={"another": "filter"}, top_k=1)
+    mock_store._bm25_retrieval_async.assert_called_once_with(
+        query="some query", filters={"another": "filter"}, top_k=1, fuzziness="3", scale_score=True
+    )
+
+    assert len(res) == 1
+    assert len(res["documents"]) == 1
+    assert res["documents"][0].content == "test document"
```
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_document_store.py RENAMED

```diff
@@ -9,6 +9,7 @@ from unittest.mock import Mock, patch
 import pytest
 from elasticsearch.exceptions import BadRequestError  # type: ignore[import-not-found]
 from haystack.dataclasses.document import Document
+from haystack.dataclasses.sparse_embedding import SparseEmbedding
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.testing.document_store import DocumentStoreBaseTests
@@ -25,7 +26,9 @@ def test_init_is_lazy(_mock_es_client):
 
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
 def test_headers_are_supported(_mock_es_client):
-    _ = ElasticsearchDocumentStore(hosts="http://testhost:9200", headers={"header1": "value1", "header2": "value2"})
+    _ = ElasticsearchDocumentStore(
+        hosts="http://testhost:9200", headers={"header1": "value1", "header2": "value2"}
+    ).client
 
     assert _mock_es_client.call_count == 1
     _, kwargs = _mock_es_client.call_args
@@ -96,6 +99,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
         )
         yield store
         store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
+        store.client.close()
 
     def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
         """
@@ -134,15 +138,11 @@ class TestDocumentStore(DocumentStoreBaseTests):
     def test_write_documents_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
         doc = Document(id="1", content="test")
         doc.dataframe = DataFrame({"a": [1, 2, 3]})
-
         document_store.write_documents([doc])
-
         res = document_store.filter_documents()
         assert len(res) == 1
-
         assert res[0].id == "1"
         assert res[0].content == "test"
-
         assert not hasattr(res[0], "dataframe") or res[0].dataframe is None
 
     def test_deserialize_document_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
@@ -242,16 +242,16 @@ class TestDocumentStore(DocumentStoreBaseTests):
         Test that not all terms must mandatorily match for BM25 retrieval to return a result.
         """
         documents = [
-            Document(id=1, content="There are over 7,000 languages spoken around the world today."),
+            Document(id="1", content="There are over 7,000 languages spoken around the world today."),
             Document(
-                id=2,
+                id="2",
                 content=(
                     "Elephants have been observed to behave in a way that indicates a high level of self-awareness"
                     " such as recognizing themselves in mirrors."
                 ),
             ),
             Document(
-                id=3,
+                id="3",
                 content=(
                     "In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness"
                     " the phenomenon of bioluminescent waves."
@@ -262,7 +262,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
 
         res = document_store._bm25_retrieval("How much self awareness do elephants have?", top_k=3)
         assert len(res) == 1
-        assert res[0].id == 2
+        assert res[0].id == "2"
 
     def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore):
         docs = [
@@ -355,8 +355,172 @@ class TestDocumentStore(DocumentStoreBaseTests):
         )
         mock_elasticsearch.return_value = mock_client
 
-        _ = ElasticsearchDocumentStore(hosts="http://testhost:9200", custom_mapping=custom_mapping)
+        _ = ElasticsearchDocumentStore(hosts="http://testhost:9200", custom_mapping=custom_mapping).client
         mock_client.indices.create.assert_called_once_with(
             index="default",
             mappings=custom_mapping,
         )
+
+
+@pytest.mark.integration
+class TestElasticsearchDocumentStoreAsync:
+    @pytest.fixture
+    async def document_store(self, request):
+        """
+        Basic fixture providing a document store instance for async tests
+        """
+        hosts = ["http://localhost:9200"]
+        # Use a different index for each test so we can run them in parallel
+        index = f"{request.node.name}"
+
+        store = ElasticsearchDocumentStore(hosts=hosts, index=index)
+        yield store
+        store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
+
+        await store.async_client.close()
+
+    @pytest.mark.asyncio
+    async def test_write_documents_async(self, document_store):
+        docs = [Document(id="1", content="test")]
+        assert await document_store.write_documents_async(docs) == 1
+        assert await document_store.count_documents_async() == 1
+        with pytest.raises(DocumentStoreError):
+            await document_store.write_documents_async(docs, policy=DuplicatePolicy.FAIL)
+
+    @pytest.mark.asyncio
+    async def test_count_documents_async(self, document_store):
+        docs = [
+            Document(content="test doc 1"),
+            Document(content="test doc 2"),
+            Document(content="test doc 3"),
+        ]
+        await document_store.write_documents_async(docs)
+        assert await document_store.count_documents_async() == 3
+
+    @pytest.mark.asyncio
+    async def test_delete_documents_async(self, document_store):
+        doc = Document(content="test doc")
+        await document_store.write_documents_async([doc])
+        assert await document_store.count_documents_async() == 1
+        await document_store.delete_documents_async([doc.id])
+        assert await document_store.count_documents_async() == 0
+
+    @pytest.mark.asyncio
+    async def test_filter_documents_async(self, document_store):
+        filterable_docs = [
+            Document(content="1", meta={"number": -10}),
+            Document(content="2", meta={"number": 100}),
+        ]
+        await document_store.write_documents_async(filterable_docs)
+        result = await document_store.filter_documents_async(
+            filters={"field": "number", "operator": "==", "value": 100}
+        )
+        assert len(result) == 1
+        assert result[0].meta["number"] == 100
+
+    @pytest.mark.asyncio
+    async def test_bm25_retrieval_async(self, document_store):
+        docs = [
+            Document(content="Haskell is a functional programming language"),
+            Document(content="Python is an object oriented programming language"),
+        ]
+        await document_store.write_documents_async(docs)
+        results = await document_store._bm25_retrieval_async("functional", top_k=1)
+        assert len(results) == 1
+        assert "functional" in results[0].content
+
+    @pytest.mark.asyncio
+    async def test_embedding_retrieval_async(self, document_store):
+
+        # init document store
+        docs = [
+            Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]),
+            Document(content="Less similar document", embedding=[0.5, 0.5, 0.5, 0.5]),
+        ]
+        await document_store.write_documents_async(docs)
+
+        # without num_candidates set to None
+        results = await document_store._embedding_retrieval_async(query_embedding=[1.0, 1.0, 1.0, 1.0], top_k=1)
+        assert len(results) == 1
+        assert results[0].content == "Most similar document"
+
+        # with num_candidates not None
+        results = await document_store._embedding_retrieval_async(
+            query_embedding=[1.0, 1.0, 1.0, 1.0], top_k=2, num_candidates=2
+        )
+        assert len(results) == 2
+        assert results[0].content == "Most similar document"
+
+        # with an embedding containing None
+        with pytest.raises(ValueError, match="query_embedding must be a non-empty list of floats"):
+            _ = await document_store._embedding_retrieval_async(query_embedding=None, top_k=2)
+
+    @pytest.mark.asyncio
+    async def test_bm25_retrieval_async_with_filters(self, document_store):
+        docs = [
+            Document(content="Haskell is a functional programming language", meta={"type": "functional"}),
+            Document(content="Python is an object oriented programming language", meta={"type": "oop"}),
+        ]
+        await document_store.write_documents_async(docs)
+        results = await document_store._bm25_retrieval_async(
+            "programming", filters={"field": "type", "operator": "==", "value": "functional"}, top_k=1
+        )
+        assert len(results) == 1
+        assert "functional" in results[0].content
+
+        # test with scale_score=True
+        results = await document_store._bm25_retrieval_async(
+            "programming", filters={"field": "type", "operator": "==", "value": "functional"}, top_k=1, scale_score=True
+        )
+        assert len(results) == 1
+        assert "functional" in results[0].content
+        assert 0 <= results[0].score <= 1  # score should be between 0 and 1
+
+    @pytest.mark.asyncio
+    async def test_embedding_retrieval_async_with_filters(self, document_store):
+        docs = [
+            Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0], meta={"type": "similar"}),
+            Document(content="Less similar document", embedding=[0.5, 0.5, 0.5, 0.5], meta={"type": "different"}),
+        ]
+        await document_store.write_documents_async(docs)
+        results = await document_store._embedding_retrieval_async(
+            query_embedding=[1.0, 1.0, 1.0, 1.0],
+            filters={"field": "type", "operator": "==", "value": "similar"},
+            top_k=1,
+        )
+        assert len(results) == 1
+        assert results[0].content == "Most similar document"
+
+    @pytest.mark.asyncio
+    async def test_write_documents_async_invalid_document_type(self, document_store):
+        """Test write_documents with invalid document type"""
+        invalid_docs = [{"id": "1", "content": "test"}]  # Dictionary instead of Document object
+        with pytest.raises(ValueError, match="param 'documents' must contain a list of objects of type Document"):
+            await document_store.write_documents_async(invalid_docs)
+
+    @pytest.mark.asyncio
+    async def test_write_documents_async_with_dataframe_warning(self, document_store, caplog):
+        """Test write_documents with document containing dataframe field"""
+        doc = Document(id="1", content="test", dataframe=DataFrame({"col": [1, 2, 3]}))
+
+        await document_store.write_documents_async([doc])
+        assert "ElasticsearchDocumentStore no longer supports dataframes" in caplog.text
+
+        results = await document_store.filter_documents_async()
+        assert len(results) == 1
+        assert results[0].id == "1"
+        assert not hasattr(results[0], "dataframe") or results[0].dataframe is None
+
+    @pytest.mark.asyncio
+    async def test_write_documents_async_with_sparse_embedding_warning(self, document_store, caplog):
+        """Test write_documents with document containing sparse_embedding field"""
+        doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5]))
+
+        await document_store.write_documents_async([doc])
+        assert "but storing sparse embeddings in Elasticsearch is not currently supported." in caplog.text
+
+        results = await document_store.filter_documents_async()
+        assert len(results) == 1
+        assert results[0].id == "1"
+        assert not hasattr(results[0], "sparse_embedding") or results[0].sparse_embedding is None
```
{elasticsearch_haystack-2.0.0 → elasticsearch_haystack-2.1.0}/tests/test_embedding_retriever.py RENAMED

```diff
@@ -113,3 +113,68 @@ def test_run():
     assert len(res["documents"]) == 1
     assert res["documents"][0].content == "Test doc"
     assert res["documents"][0].embedding == [0.1, 0.2]
+
+
+@pytest.mark.asyncio
+async def test_run_async():
+    mock_store = Mock(spec=ElasticsearchDocumentStore)
+    mock_store._embedding_retrieval_async.return_value = [Document(content="test document", embedding=[0.1, 0.2])]
+    retriever = ElasticsearchEmbeddingRetriever(document_store=mock_store)
+    res = await retriever.run_async(query_embedding=[0.5, 0.7])
+    mock_store._embedding_retrieval_async.assert_called_once_with(
+        query_embedding=[0.5, 0.7],
+        filters={},
+        top_k=10,
+        num_candidates=None,
+    )
+    assert len(res) == 1
+    assert len(res["documents"]) == 1
+    assert res["documents"][0].content == "test document"
+    assert res["documents"][0].embedding == [0.1, 0.2]
+
+
+@pytest.mark.asyncio
+async def test_run_init_params_async():
+    mock_store = Mock(spec=ElasticsearchDocumentStore)
+    mock_store._embedding_retrieval_async.return_value = [Document(content="test document", embedding=[0.1, 0.2])]
+    retriever = ElasticsearchEmbeddingRetriever(
+        document_store=mock_store,
+        filters={"some": "filter"},
+        top_k=3,
+        num_candidates=30,
+        filter_policy=FilterPolicy.MERGE,
+    )
+    res = await retriever.run_async(query_embedding=[0.5, 0.7])
+    mock_store._embedding_retrieval_async.assert_called_once_with(
+        query_embedding=[0.5, 0.7],
+        filters={"some": "filter"},
+        top_k=3,
+        num_candidates=30,
+    )
+    assert len(res) == 1
+    assert len(res["documents"]) == 1
+    assert res["documents"][0].content == "test document"
+    assert res["documents"][0].embedding == [0.1, 0.2]
+
+
+@pytest.mark.asyncio
+async def test_run_time_params_async():
+    mock_store = Mock(spec=ElasticsearchDocumentStore)
+    mock_store._embedding_retrieval_async.return_value = [Document(content="test document", embedding=[0.1, 0.2])]
+    retriever = ElasticsearchEmbeddingRetriever(
+        document_store=mock_store,
+        filters={"some": "filter"},
+        top_k=3,
+        num_candidates=30,
+        filter_policy=FilterPolicy.MERGE,
+    )
+
+    res = await retriever.run_async(query_embedding=[0.5, 0.7], filters={"another": "filter"}, top_k=1)
+    mock_store._embedding_retrieval_async.assert_called_once_with(
+        query_embedding=[0.5, 0.7], filters={"another": "filter"}, top_k=1, num_candidates=30
+    )
+
+    assert len(res) == 1
+    assert len(res["documents"]) == 1
+    assert res["documents"][0].content == "test document"
+    assert res["documents"][0].embedding == [0.1, 0.2]
```
The remaining files listed above (+0 -0) are unchanged between the two versions.