elasticsearch-haystack 3.1.0__tar.gz → 4.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/CHANGELOG.md +25 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/PKG-INFO +9 -25
- elasticsearch_haystack-4.1.0/README.md +16 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/pyproject.toml +0 -5
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +197 -29
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/tests/test_bm25_retriever.py +15 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/tests/test_document_store.py +274 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/tests/test_embedding_retriever.py +14 -0
- elasticsearch_haystack-3.1.0/README.md +0 -32
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/.gitignore +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/LICENSE +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/pydoc/config.yml +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/components/retrievers/py.typed +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/src/haystack_integrations/document_stores/py.typed +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/tests/test_filters.py +0 -0
|
@@ -1,5 +1,30 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/elasticsearch-v4.0.0] - 2025-09-24
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- [**breaking**] Adding `api_key` and `api_key_id` authentication support to `ElasticsearchDocumentStore` (#2292)
|
|
8
|
+
|
|
9
|
+
### 🧹 Chores
|
|
10
|
+
|
|
11
|
+
- Remove black (#1985)
|
|
12
|
+
- Standardize readmes - part 1 (#2202)
|
|
13
|
+
- Standardize readmes - part 2 (#2205)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## [integrations/elasticsearch-v3.1.0] - 2025-06-12
|
|
17
|
+
|
|
18
|
+
### 🐛 Bug Fixes
|
|
19
|
+
|
|
20
|
+
- Fix Elasticsearch types + add py.typed (#1923)
|
|
21
|
+
|
|
22
|
+
### 🧹 Chores
|
|
23
|
+
|
|
24
|
+
- Align core-integrations Hatch scripts (#1898)
|
|
25
|
+
- Update md files for new hatch scripts (#1911)
|
|
26
|
+
|
|
27
|
+
|
|
3
28
|
## [integrations/elasticsearch-v3.0.1] - 2025-05-27
|
|
4
29
|
|
|
5
30
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version:
|
|
3
|
+
Version: 4.1.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -24,35 +24,19 @@ Requires-Dist: elasticsearch<9,>=8
|
|
|
24
24
|
Requires-Dist: haystack-ai>=2.4.0
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
|
|
27
|
-
|
|
27
|
+
# elasticsearch-haystack
|
|
28
28
|
|
|
29
29
|
[](https://pypi.org/project/elasticsearch-haystack)
|
|
30
30
|
[](https://pypi.org/project/elasticsearch-haystack)
|
|
31
31
|
|
|
32
|
-
|
|
32
|
+
- [Integration page](https://haystack.deepset.ai/integrations/elasticsearch-document-store)
|
|
33
|
+
- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/elasticsearch/CHANGELOG.md)
|
|
33
34
|
|
|
34
|
-
|
|
35
|
+
---
|
|
35
36
|
|
|
36
|
-
##
|
|
37
|
+
## Contributing
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
pip install elasticsearch-haystack
|
|
40
|
-
```
|
|
39
|
+
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
To run tests first start a Docker container running ElasticSearch. We provide a utility `docker-compose.yml` for that:
|
|
45
|
-
|
|
46
|
-
```console
|
|
47
|
-
docker-compose up
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
Then run tests:
|
|
51
|
-
|
|
52
|
-
```console
|
|
53
|
-
hatch run test:all
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
## License
|
|
57
|
-
|
|
58
|
-
`elasticsearch-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
|
|
41
|
+
To run integration tests locally, you need a Docker container running ElasticSearch.
|
|
42
|
+
Use the provided `docker-compose.yml` file to start the container: `docker compose up -d`.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# elasticsearch-haystack
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/elasticsearch-haystack)
|
|
4
|
+
[](https://pypi.org/project/elasticsearch-haystack)
|
|
5
|
+
|
|
6
|
+
- [Integration page](https://haystack.deepset.ai/integrations/elasticsearch-document-store)
|
|
7
|
+
- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/elasticsearch/CHANGELOG.md)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Contributing
|
|
12
|
+
|
|
13
|
+
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
14
|
+
|
|
15
|
+
To run integration tests locally, you need a Docker container running ElasticSearch.
|
|
16
|
+
Use the provided `docker-compose.yml` file to start the container: `docker compose up -d`.
|
|
@@ -82,11 +82,6 @@ disallow_incomplete_defs = true
|
|
|
82
82
|
[tool.hatch.metadata]
|
|
83
83
|
allow-direct-references = true
|
|
84
84
|
|
|
85
|
-
[tool.black]
|
|
86
|
-
target-version = ["py38"]
|
|
87
|
-
line-length = 120
|
|
88
|
-
skip-string-normalization = true
|
|
89
|
-
|
|
90
85
|
[tool.ruff]
|
|
91
86
|
target-version = "py38"
|
|
92
87
|
line-length = 120
|
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# ruff: noqa: FBT002, FBT001 boolean-type-hint-positional-argument and boolean-default-value-positional-argument
|
|
6
|
+
# ruff: noqa: B008 function-call-in-default-argument
|
|
7
|
+
# ruff: noqa: S101 disable checks for uses of the assert keyword
|
|
8
|
+
|
|
9
|
+
|
|
4
10
|
from collections.abc import Mapping
|
|
5
|
-
from typing import Any, Dict, List, Literal, Optional, Union
|
|
11
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
6
12
|
|
|
7
13
|
import numpy as np
|
|
8
14
|
from elastic_transport import NodeConfig
|
|
@@ -10,6 +16,7 @@ from haystack import default_from_dict, default_to_dict, logging
|
|
|
10
16
|
from haystack.dataclasses import Document
|
|
11
17
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
12
18
|
from haystack.document_stores.types import DuplicatePolicy
|
|
19
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
13
20
|
from haystack.version import __version__ as haystack_version
|
|
14
21
|
|
|
15
22
|
from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers
|
|
@@ -38,13 +45,16 @@ class ElasticsearchDocumentStore:
|
|
|
38
45
|
|
|
39
46
|
Usage example (Elastic Cloud):
|
|
40
47
|
```python
|
|
41
|
-
from
|
|
42
|
-
document_store = ElasticsearchDocumentStore(
|
|
48
|
+
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
49
|
+
document_store = ElasticsearchDocumentStore(
|
|
50
|
+
api_key_id=Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
|
|
51
|
+
api_key=Secret.from_env_var("ELASTIC_API_KEY", strict=False),
|
|
52
|
+
)
|
|
43
53
|
```
|
|
44
54
|
|
|
45
55
|
Usage example (self-hosted Elasticsearch instance):
|
|
46
56
|
```python
|
|
47
|
-
from
|
|
57
|
+
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
48
58
|
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
|
|
49
59
|
```
|
|
50
60
|
In the above example we connect with security disabled just to show the basic usage.
|
|
@@ -63,6 +73,8 @@ class ElasticsearchDocumentStore:
|
|
|
63
73
|
hosts: Optional[Hosts] = None,
|
|
64
74
|
custom_mapping: Optional[Dict[str, Any]] = None,
|
|
65
75
|
index: str = "default",
|
|
76
|
+
api_key: Secret = Secret.from_env_var("ELASTIC_API_KEY", strict=False),
|
|
77
|
+
api_key_id: Secret = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
|
|
66
78
|
embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
|
|
67
79
|
**kwargs: Any,
|
|
68
80
|
):
|
|
@@ -80,9 +92,16 @@ class ElasticsearchDocumentStore:
|
|
|
80
92
|
For the full list of supported kwargs, see the official Elasticsearch
|
|
81
93
|
[reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
|
|
82
94
|
|
|
95
|
+
Authentication is provided via Secret objects, which by default are loaded from environment variables.
|
|
96
|
+
You can either provide both `api_key_id` and `api_key`, or just `api_key` containing a base64-encoded string
|
|
97
|
+
of `id:secret`. Secret instances can also be loaded from a token using the `Secret.from_token()` method.
|
|
98
|
+
|
|
83
99
|
:param hosts: List of hosts running the Elasticsearch client.
|
|
84
100
|
:param custom_mapping: Custom mapping for the index. If not provided, a default mapping will be used.
|
|
85
101
|
:param index: Name of index in Elasticsearch.
|
|
102
|
+
:param api_key: A Secret object containing either the API key secret (used together with `api_key_id`) or
|
|
103
|
+
a base64-encoded `id:secret` string that authenticates with Elasticsearch on its own.
|
|
104
|
+
:param api_key_id: A Secret object containing the API key ID for authenticating with Elasticsearch.
|
|
86
105
|
:param embedding_similarity_function: The similarity function used to compare Documents embeddings.
|
|
87
106
|
This parameter only takes effect if the index does not yet exist and is created.
|
|
88
107
|
To choose the most appropriate function, look for information about your embedding model.
|
|
@@ -94,6 +113,8 @@ class ElasticsearchDocumentStore:
|
|
|
94
113
|
self._client: Optional[Elasticsearch] = None
|
|
95
114
|
self._async_client: Optional[AsyncElasticsearch] = None
|
|
96
115
|
self._index = index
|
|
116
|
+
self._api_key = api_key
|
|
117
|
+
self._api_key_id = api_key_id
|
|
97
118
|
self._embedding_similarity_function = embedding_similarity_function
|
|
98
119
|
self._custom_mapping = custom_mapping
|
|
99
120
|
self._kwargs = kwargs
|
|
@@ -103,6 +124,29 @@ class ElasticsearchDocumentStore:
|
|
|
103
124
|
msg = "custom_mapping must be a dictionary"
|
|
104
125
|
raise ValueError(msg)
|
|
105
126
|
|
|
127
|
+
if not self._custom_mapping:
|
|
128
|
+
self._default_mappings = {
|
|
129
|
+
"properties": {
|
|
130
|
+
"embedding": {
|
|
131
|
+
"type": "dense_vector",
|
|
132
|
+
"index": True,
|
|
133
|
+
"similarity": self._embedding_similarity_function,
|
|
134
|
+
},
|
|
135
|
+
"content": {"type": "text"},
|
|
136
|
+
},
|
|
137
|
+
"dynamic_templates": [
|
|
138
|
+
{
|
|
139
|
+
"strings": {
|
|
140
|
+
"path_match": "*",
|
|
141
|
+
"match_mapping_type": "string",
|
|
142
|
+
"mapping": {
|
|
143
|
+
"type": "keyword",
|
|
144
|
+
},
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
],
|
|
148
|
+
}
|
|
149
|
+
|
|
106
150
|
def _ensure_initialized(self):
|
|
107
151
|
"""
|
|
108
152
|
Ensures both sync and async clients are initialized and the index exists.
|
|
@@ -111,14 +155,18 @@ class ElasticsearchDocumentStore:
|
|
|
111
155
|
headers = self._kwargs.pop("headers", {})
|
|
112
156
|
headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
|
|
113
157
|
|
|
158
|
+
api_key = self._handle_auth()
|
|
159
|
+
|
|
114
160
|
# Initialize both sync and async clients
|
|
115
161
|
self._client = Elasticsearch(
|
|
116
162
|
self._hosts,
|
|
163
|
+
api_key=api_key,
|
|
117
164
|
headers=headers,
|
|
118
165
|
**self._kwargs,
|
|
119
166
|
)
|
|
120
167
|
self._async_client = AsyncElasticsearch(
|
|
121
168
|
self._hosts,
|
|
169
|
+
api_key=api_key,
|
|
122
170
|
headers=headers,
|
|
123
171
|
**self._kwargs,
|
|
124
172
|
)
|
|
@@ -130,27 +178,7 @@ class ElasticsearchDocumentStore:
|
|
|
130
178
|
mappings = self._custom_mapping
|
|
131
179
|
else:
|
|
132
180
|
# Configure mapping for the embedding field if none is provided
|
|
133
|
-
mappings =
|
|
134
|
-
"properties": {
|
|
135
|
-
"embedding": {
|
|
136
|
-
"type": "dense_vector",
|
|
137
|
-
"index": True,
|
|
138
|
-
"similarity": self._embedding_similarity_function,
|
|
139
|
-
},
|
|
140
|
-
"content": {"type": "text"},
|
|
141
|
-
},
|
|
142
|
-
"dynamic_templates": [
|
|
143
|
-
{
|
|
144
|
-
"strings": {
|
|
145
|
-
"path_match": "*",
|
|
146
|
-
"match_mapping_type": "string",
|
|
147
|
-
"mapping": {
|
|
148
|
-
"type": "keyword",
|
|
149
|
-
},
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
],
|
|
153
|
-
}
|
|
181
|
+
mappings = self._default_mappings
|
|
154
182
|
|
|
155
183
|
# Create the index if it doesn't exist
|
|
156
184
|
if not self._client.indices.exists(index=self._index):
|
|
@@ -158,13 +186,56 @@ class ElasticsearchDocumentStore:
|
|
|
158
186
|
|
|
159
187
|
self._initialized = True
|
|
160
188
|
|
|
189
|
+
def _handle_auth(self) -> Optional[Union[str, Tuple[str, str]]]:
    """
    Resolves the credentials that are passed to the Elasticsearch clients as `api_key`.

    `self._api_key` and `self._api_key_id` are both resolved with `resolve_value()`; a value that resolves to
    something falsy is treated as not provided. One of the following then applies:

    1) Both values are provided: they are returned as an `(api_key_id, api_key)` tuple.

    2) Only `api_key` is provided: it is returned unchanged. In this case it must be a base64-encoded string
       that encodes both id and secret (`id:secret`).

    3) Neither value is provided: `None` is returned and the client connects without authentication.

    :returns:
        The `(api_key_id, api_key)` tuple, the bare `api_key` string, or `None`.
    :raises ValueError:
        If `api_key_id` is provided but `api_key` is not.
    """
    api_key_resolved = self._api_key.resolve_value()
    api_key_id_resolved = self._api_key_id.resolve_value()

    if api_key_resolved:
        if api_key_id_resolved:
            # Scenario 1: both are found, use them together
            return (api_key_id_resolved, api_key_resolved)
        # Scenario 2: only api_key is set, it already encodes id and secret
        return api_key_resolved

    if api_key_id_resolved:
        # An id without its secret cannot authenticate anything
        msg = "api_key_id is provided but api_key is missing."
        raise ValueError(msg)

    # Scenario 3: neither found, no authentication
    return None
|
|
231
|
+
|
|
161
232
|
@property
|
|
162
233
|
def client(self) -> Elasticsearch:
|
|
163
234
|
"""
|
|
164
235
|
Returns the synchronous Elasticsearch client, initializing it if necessary.
|
|
165
236
|
"""
|
|
166
237
|
self._ensure_initialized()
|
|
167
|
-
assert self._client is not None
|
|
238
|
+
assert self._client is not None
|
|
168
239
|
return self._client
|
|
169
240
|
|
|
170
241
|
@property
|
|
@@ -173,7 +244,7 @@ class ElasticsearchDocumentStore:
|
|
|
173
244
|
Returns the asynchronous Elasticsearch client, initializing it if necessary.
|
|
174
245
|
"""
|
|
175
246
|
self._ensure_initialized()
|
|
176
|
-
assert self._async_client is not None
|
|
247
|
+
assert self._async_client is not None
|
|
177
248
|
return self._async_client
|
|
178
249
|
|
|
179
250
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -191,6 +262,8 @@ class ElasticsearchDocumentStore:
|
|
|
191
262
|
hosts=self._hosts,
|
|
192
263
|
custom_mapping=self._custom_mapping,
|
|
193
264
|
index=self._index,
|
|
265
|
+
api_key=self._api_key.to_dict(),
|
|
266
|
+
api_key_id=self._api_key_id.to_dict(),
|
|
194
267
|
embedding_similarity_function=self._embedding_similarity_function,
|
|
195
268
|
**self._kwargs,
|
|
196
269
|
)
|
|
@@ -205,6 +278,7 @@ class ElasticsearchDocumentStore:
|
|
|
205
278
|
:returns:
|
|
206
279
|
Deserialized component.
|
|
207
280
|
"""
|
|
281
|
+
deserialize_secrets_inplace(data, keys=["api_key", "api_key_id"])
|
|
208
282
|
return default_from_dict(cls, data)
|
|
209
283
|
|
|
210
284
|
def count_documents(self) -> int:
|
|
@@ -384,7 +458,7 @@ class ElasticsearchDocumentStore:
|
|
|
384
458
|
|
|
385
459
|
if errors:
|
|
386
460
|
# with stats_only=False, errors is guaranteed to be a list of dicts
|
|
387
|
-
assert isinstance(errors, list)
|
|
461
|
+
assert isinstance(errors, list)
|
|
388
462
|
duplicate_errors_ids = []
|
|
389
463
|
other_errors = []
|
|
390
464
|
for e in errors:
|
|
@@ -463,7 +537,7 @@ class ElasticsearchDocumentStore:
|
|
|
463
537
|
)
|
|
464
538
|
if failed:
|
|
465
539
|
# with stats_only=False, failed is guaranteed to be a list of dicts
|
|
466
|
-
assert isinstance(failed, list)
|
|
540
|
+
assert isinstance(failed, list)
|
|
467
541
|
if policy == DuplicatePolicy.FAIL:
|
|
468
542
|
for error in failed:
|
|
469
543
|
if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
|
|
@@ -490,6 +564,14 @@ class ElasticsearchDocumentStore:
|
|
|
490
564
|
raise_on_error=False,
|
|
491
565
|
)
|
|
492
566
|
|
|
567
|
+
def _prepare_delete_all_request(self, *, is_async: bool) -> Dict[str, Any]:
    """
    Builds the keyword arguments for a `delete_by_query` call that matches every document in the index.

    :param is_async: Whether the request is built for asynchronous use. When True, `wait_for_completion` is
        set to False; when False, it is set to True so the call blocks until the deletion is done.
    :returns:
        A dictionary with the `index`, `body`, `wait_for_completion` and `refresh` arguments.
    """
    return {
        "index": self._index,
        "body": {"query": {"match_all": {}}},  # Delete all documents
        "wait_for_completion": not is_async,  # block until done unless built for async use
        "refresh": True,  # Ensure changes are visible immediately
    }
|
|
574
|
+
|
|
493
575
|
async def delete_documents_async(self, document_ids: List[str]) -> None:
|
|
494
576
|
"""
|
|
495
577
|
Asynchronously deletes all documents with a matching document_ids from the document store.
|
|
@@ -509,6 +591,92 @@ class ElasticsearchDocumentStore:
|
|
|
509
591
|
msg = f"Failed to delete documents from Elasticsearch: {e!s}"
|
|
510
592
|
raise DocumentStoreError(msg) from e
|
|
511
593
|
|
|
594
|
+
def delete_all_documents(self, recreate_index: bool = False) -> None:
    """
    Deletes all documents in the document store.

    A fast way to clear all documents from the document store while preserving any index settings and mappings.

    :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
        settings. If False, all documents will be deleted using the `delete_by_query` API.
    """
    self._ensure_initialized()  # _ensure_initialized ensures _client is not None and an index exists

    if recreate_index:
        # Read the current mappings and settings with a single request, before the index is dropped.
        index_info = self._client.indices.get(index=self._index)[self._index]  # type: ignore
        mappings = index_info["mappings"]
        settings = index_info["settings"]

        # remove settings that cannot be set during index creation
        for key in ("uuid", "creation_date", "provided_name", "version"):
            settings["index"].pop(key, None)

        # Delete and recreate exactly once, passing both settings and mappings. Recreating a second time
        # with mappings only would discard the settings this method promises to preserve.
        self._client.indices.delete(index=self._index)  # type: ignore
        self._client.indices.create(index=self._index, settings=settings, mappings=mappings)  # type: ignore

    else:
        result = self._client.delete_by_query(**self._prepare_delete_all_request(is_async=False))  # type: ignore
        logger.info(
            "Deleted all the {n_docs} documents from the index '{index}'.",
            index=self._index,
            n_docs=result["deleted"],
        )
|
|
633
|
+
|
|
634
|
+
async def delete_all_documents_async(self, recreate_index: bool = False) -> None:
    """
    Asynchronously removes every document from the document store.

    :param recreate_index: When True, the index is dropped and created again with the mappings and settings
        it had before. When False, the documents are removed through the `delete_by_query` API.
    :raises DocumentStoreError: If any step of the deletion raises an exception.
    """
    self._ensure_initialized()  # ensures _async_client is not None

    try:
        if not recreate_index:
            # The shared request builder turns waiting off for async use; switch it back on here because
            # the deleted count is only reported once the operation has completed.
            request_kwargs = self._prepare_delete_all_request(is_async=True)
            request_kwargs["wait_for_completion"] = True
            outcome = await self._async_client.delete_by_query(**request_kwargs)  # type: ignore
            logger.info(
                "Deleted all the {n_docs} documents from the index '{index}'.",
                index=self._index,
                n_docs=outcome["deleted"],
            )
            return

        # Capture the current mappings and settings before the index is dropped.
        target = self._index
        described = await self._async_client.indices.get(index=target)  # type: ignore
        index_mappings = described[target]["mappings"]
        index_settings = described[target]["settings"]

        # remove settings that cannot be set during index creation
        for read_only_key in ("uuid", "creation_date", "provided_name", "version"):
            index_settings["index"].pop(read_only_key, None)

        # Drop the index, then create it again with the captured settings and mappings.
        await self._async_client.indices.delete(index=target)  # type: ignore
        await self._async_client.indices.create(  # type: ignore
            index=target, settings=index_settings, mappings=index_mappings
        )

    except Exception as e:
        msg = f"Failed to delete all documents from Elasticsearch: {e!s}"
        raise DocumentStoreError(msg) from e
|
|
679
|
+
|
|
512
680
|
def _bm25_retrieval(
|
|
513
681
|
self,
|
|
514
682
|
query: str,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
4
5
|
from unittest.mock import Mock, patch
|
|
5
6
|
|
|
6
7
|
import pytest
|
|
@@ -38,6 +39,20 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
38
39
|
"document_store": {
|
|
39
40
|
"init_parameters": {
|
|
40
41
|
"hosts": "some fake host",
|
|
42
|
+
"api_key": {
|
|
43
|
+
"env_vars": [
|
|
44
|
+
"ELASTIC_API_KEY",
|
|
45
|
+
],
|
|
46
|
+
"strict": False,
|
|
47
|
+
"type": "env_var",
|
|
48
|
+
},
|
|
49
|
+
"api_key_id": {
|
|
50
|
+
"env_vars": [
|
|
51
|
+
"ELASTIC_API_KEY_ID",
|
|
52
|
+
],
|
|
53
|
+
"strict": False,
|
|
54
|
+
"type": "env_var",
|
|
55
|
+
},
|
|
41
56
|
"custom_mapping": None,
|
|
42
57
|
"index": "default",
|
|
43
58
|
"embedding_similarity_function": "cosine",
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import random
|
|
6
|
+
import time
|
|
6
7
|
from typing import List
|
|
7
8
|
from unittest.mock import Mock, patch
|
|
8
9
|
|
|
@@ -13,6 +14,8 @@ from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
|
13
14
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
14
15
|
from haystack.document_stores.types import DuplicatePolicy
|
|
15
16
|
from haystack.testing.document_store import DocumentStoreBaseTests
|
|
17
|
+
from haystack.utils import Secret
|
|
18
|
+
from haystack.utils.auth import TokenSecret
|
|
16
19
|
|
|
17
20
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
18
21
|
|
|
@@ -46,6 +49,20 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
46
49
|
assert res == {
|
|
47
50
|
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
48
51
|
"init_parameters": {
|
|
52
|
+
"api_key": {
|
|
53
|
+
"env_vars": [
|
|
54
|
+
"ELASTIC_API_KEY",
|
|
55
|
+
],
|
|
56
|
+
"strict": False,
|
|
57
|
+
"type": "env_var",
|
|
58
|
+
},
|
|
59
|
+
"api_key_id": {
|
|
60
|
+
"env_vars": [
|
|
61
|
+
"ELASTIC_API_KEY_ID",
|
|
62
|
+
],
|
|
63
|
+
"strict": False,
|
|
64
|
+
"type": "env_var",
|
|
65
|
+
},
|
|
49
66
|
"hosts": "some hosts",
|
|
50
67
|
"custom_mapping": None,
|
|
51
68
|
"index": "default",
|
|
@@ -62,6 +79,8 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
62
79
|
"hosts": "some hosts",
|
|
63
80
|
"custom_mapping": None,
|
|
64
81
|
"index": "default",
|
|
82
|
+
"api_key": None,
|
|
83
|
+
"api_key_id": None,
|
|
65
84
|
"embedding_similarity_function": "cosine",
|
|
66
85
|
},
|
|
67
86
|
}
|
|
@@ -69,9 +88,135 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
69
88
|
assert document_store._hosts == "some hosts"
|
|
70
89
|
assert document_store._index == "default"
|
|
71
90
|
assert document_store._custom_mapping is None
|
|
91
|
+
assert document_store._api_key is None
|
|
92
|
+
assert document_store._api_key_id is None
|
|
72
93
|
assert document_store._embedding_similarity_function == "cosine"
|
|
73
94
|
|
|
74
95
|
|
|
96
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
def test_to_dict_with_api_keys_env_vars(_mock_elasticsearch_client, monkeypatch):
    """Credentials taken from environment variables are serialized as env-var secret definitions."""
    monkeypatch.setenv("ELASTIC_API_KEY", "test-api-key")
    monkeypatch.setenv("ELASTIC_API_KEY_ID", "test-api-key-id")
    document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200")
    # `client` is a property: accessing it is what triggers the lazy initialization, there is nothing to call
    _ = document_store.client
    res = document_store.to_dict()
    assert res["init_parameters"]["api_key"] == {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False}
    assert res["init_parameters"]["api_key_id"] == {
        "type": "env_var",
        "env_vars": ["ELASTIC_API_KEY_ID"],
        "strict": False,
    }
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
def test_to_dict_with_api_keys_as_secret(_mock_elasticsearch_client, monkeypatch):
    """Serializing a store whose credentials are token-based secrets raises a ValueError."""
    monkeypatch.setenv("ELASTIC_API_KEY", "test-api-key")
    monkeypatch.setenv("ELASTIC_API_KEY_ID", "test-api-key-id")
    document_store = ElasticsearchDocumentStore(
        hosts="https://localhost:9200",
        api_key=TokenSecret(_token="test-api-key"),
        api_key_id=TokenSecret(_token="test-api-key-id"),
    )
    # `client` is a property: accessing it is what triggers the lazy initialization
    _ = document_store.client
    # Only the serialization is expected to fail; keeping setup outside the block stops an unrelated
    # ValueError raised during construction or client setup from making this test pass.
    with pytest.raises(ValueError):
        _ = document_store.to_dict()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
def test_from_dict_with_api_keys_env_vars(_mock_elasticsearch_client):
    """Builds a store from a serialized dict that carries env-var secret definitions for both credentials."""
    expected_api_key = {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False}
    expected_api_key_id = {"type": "env_var", "env_vars": ["ELASTIC_API_KEY_ID"], "strict": False}

    init_parameters = {
        "hosts": "some hosts",
        "custom_mapping": None,
        "index": "default",
        "api_key": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False},
        "api_key_id": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY_ID"], "strict": False},
        "embedding_similarity_function": "cosine",
    }
    data = {
        "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
        "init_parameters": init_parameters,
    }

    document_store = ElasticsearchDocumentStore.from_dict(data)

    # NOTE(review): these assertions compare against the raw serialized dicts, i.e. they expect the
    # credentials NOT to have been turned into Secret objects by `from_dict` - confirm this is intended.
    assert document_store._api_key == expected_api_key
    assert document_store._api_key_id == expected_api_key_id
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
def test_api_key_validation_only_api_key(_mock_elasticsearch_client):
    """A store given only `api_key` initializes its client and keeps the default env-var `api_key_id`."""
    api_key = Secret.from_token("test_api_key")

    document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key=api_key)
    # `client` is a property: accessing it is what triggers the lazy initialization, there is nothing to call
    _ = document_store.client
    assert document_store._api_key == api_key
    # not passing the api_key_id makes it default to reading from env var
    assert document_store._api_key_id == Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
def test_api_key_validation_only_api_key_id_raises_error(_mock_elasticsearch_client):
    """Initializing the client with an `api_key_id` but no `api_key` raises a ValueError."""
    api_key_id = Secret.from_token("test_api_key_id")
    es = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key_id=api_key_id)
    # The error comes from the lazy client initialization, which runs when the `client` property is
    # accessed; only that access belongs inside the `raises` block.
    with pytest.raises(ValueError, match="api_key_id is provided but api_key is missing"):
        _ = es.client
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
def test_client_initialization_with_api_key_tuple(_mock_async_es, _mock_es):
    """With both secrets given, both client classes receive the credentials as an `(id, key)` tuple."""
    # Stub the sync client's info() call so that no real connection is needed
    sync_client = Mock()
    sync_client.info.return_value = {"version": {"number": "8.0.0"}}
    _mock_es.return_value = sync_client

    document_store = ElasticsearchDocumentStore(
        hosts="https://localhost:9200",
        api_key=Secret.from_token("test_api_key"),
        api_key_id=Secret.from_token("test_api_key_id"),
    )

    # Reading the property triggers the lazy initialization of both clients
    _ = document_store.client

    expected_hosts = "https://localhost:9200"
    expected_credentials = ("test_api_key_id", "test_api_key")

    # The sync class is checked first, then the async one; both must have been built exactly once
    # with the hosts as first positional argument and the credentials tuple as `api_key`.
    for client_class in (_mock_es, _mock_async_es):
        client_class.assert_called_once()
        positional, keyword = client_class.call_args
        assert positional[0] == expected_hosts
        assert keyword["api_key"] == expected_credentials
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
193
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
|
|
194
|
+
def test_client_initialization_with_api_key_string(_mock_async_es, _mock_es):
|
|
195
|
+
api_key = Secret.from_token("test_api_key")
|
|
196
|
+
|
|
197
|
+
# Mock the client.info() call to avoid actual connection
|
|
198
|
+
mock_client = Mock()
|
|
199
|
+
mock_client.info.return_value = {"version": {"number": "8.0.0"}}
|
|
200
|
+
_mock_es.return_value = mock_client
|
|
201
|
+
|
|
202
|
+
document_store = ElasticsearchDocumentStore(hosts="testhost", api_key=api_key)
|
|
203
|
+
|
|
204
|
+
# Access client to trigger initialization
|
|
205
|
+
_ = document_store.client
|
|
206
|
+
|
|
207
|
+
# Check that Elasticsearch was called with the correct api_key string
|
|
208
|
+
_mock_es.assert_called_once()
|
|
209
|
+
call_args = _mock_es.call_args
|
|
210
|
+
assert call_args[0][0] == "testhost" # hosts
|
|
211
|
+
assert call_args[1]["api_key"] == "test_api_key"
|
|
212
|
+
|
|
213
|
+
# Check that AsyncElasticsearch was called with the same api_key string
|
|
214
|
+
_mock_async_es.assert_called_once()
|
|
215
|
+
async_call_args = _mock_async_es.call_args
|
|
216
|
+
assert async_call_args[0][0] == "testhost" # hosts
|
|
217
|
+
assert async_call_args[1]["api_key"] == "test_api_key"
|
|
218
|
+
|
|
219
|
+
|
|
75
220
|
@pytest.mark.integration
|
|
76
221
|
class TestDocumentStore(DocumentStoreBaseTests):
|
|
77
222
|
"""
|
|
@@ -339,6 +484,59 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
339
484
|
mappings=custom_mapping,
|
|
340
485
|
)
|
|
341
486
|
|
|
487
|
+
def test_delete_all_documents_index_recreation(self, document_store: ElasticsearchDocumentStore):
|
|
488
|
+
# populate the index with some documents
|
|
489
|
+
docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
|
|
490
|
+
document_store.write_documents(docs)
|
|
491
|
+
|
|
492
|
+
# capture index structure before deletion
|
|
493
|
+
assert document_store._client is not None
|
|
494
|
+
index_info_before = document_store._client.indices.get(index=document_store._index)
|
|
495
|
+
mappings_before = index_info_before[document_store._index]["mappings"]
|
|
496
|
+
settings_before = index_info_before[document_store._index]["settings"]
|
|
497
|
+
|
|
498
|
+
# delete all documents
|
|
499
|
+
document_store.delete_all_documents(recreate_index=True)
|
|
500
|
+
assert document_store.count_documents() == 0
|
|
501
|
+
|
|
502
|
+
# verify index structure is preserved
|
|
503
|
+
index_info_after = document_store._client.indices.get(index=document_store._index)
|
|
504
|
+
mappings_after = index_info_after[document_store._index]["mappings"]
|
|
505
|
+
assert mappings_after == mappings_before, "delete_all_documents should preserve index mappings"
|
|
506
|
+
|
|
507
|
+
settings_after = index_info_after[document_store._index]["settings"]
|
|
508
|
+
settings_after["index"].pop("uuid", None)
|
|
509
|
+
settings_after["index"].pop("creation_date", None)
|
|
510
|
+
settings_before["index"].pop("uuid", None)
|
|
511
|
+
settings_before["index"].pop("creation_date", None)
|
|
512
|
+
assert settings_after == settings_before, "delete_all_documents should preserve index settings"
|
|
513
|
+
|
|
514
|
+
# verify index can accept new documents and retrieve
|
|
515
|
+
new_doc = Document(id="4", content="New document after delete all")
|
|
516
|
+
document_store.write_documents([new_doc])
|
|
517
|
+
assert document_store.count_documents() == 1
|
|
518
|
+
|
|
519
|
+
results = document_store.filter_documents()
|
|
520
|
+
assert len(results) == 1
|
|
521
|
+
assert results[0].content == "New document after delete all"
|
|
522
|
+
|
|
523
|
+
def test_delete_all_documents_no_index_recreation(self, document_store: ElasticsearchDocumentStore):
|
|
524
|
+
docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
|
|
525
|
+
document_store.write_documents(docs)
|
|
526
|
+
assert document_store.count_documents() == 2
|
|
527
|
+
|
|
528
|
+
document_store.delete_all_documents(recreate_index=False)
|
|
529
|
+
time.sleep(2) # need to wait for the deletion to be reflected in count_documents
|
|
530
|
+
assert document_store.count_documents() == 0
|
|
531
|
+
|
|
532
|
+
new_doc = Document(id="3", content="New document after delete all")
|
|
533
|
+
document_store.write_documents([new_doc])
|
|
534
|
+
assert document_store.count_documents() == 1
|
|
535
|
+
|
|
536
|
+
results = document_store.filter_documents()
|
|
537
|
+
assert len(results) == 1
|
|
538
|
+
assert results[0].content == "New document after delete all"
|
|
539
|
+
|
|
342
540
|
|
|
343
541
|
@pytest.mark.integration
|
|
344
542
|
class TestElasticsearchDocumentStoreAsync:
|
|
@@ -487,3 +685,79 @@ class TestElasticsearchDocumentStoreAsync:
|
|
|
487
685
|
assert len(results) == 1
|
|
488
686
|
assert results[0].id == "1"
|
|
489
687
|
assert not hasattr(results[0], "sparse_embedding") or results[0].sparse_embedding is None
|
|
688
|
+
|
|
689
|
+
@pytest.mark.asyncio
|
|
690
|
+
async def test_delete_all_documents_async(self, document_store):
|
|
691
|
+
docs = [
|
|
692
|
+
Document(id="1", content="First document", meta={"category": "test"}),
|
|
693
|
+
Document(id="2", content="Second document", meta={"category": "test"}),
|
|
694
|
+
Document(id="3", content="Third document", meta={"category": "other"}),
|
|
695
|
+
]
|
|
696
|
+
await document_store.write_documents_async(docs)
|
|
697
|
+
assert await document_store.count_documents_async() == 3
|
|
698
|
+
|
|
699
|
+
# delete all documents
|
|
700
|
+
await document_store.delete_all_documents_async(recreate_index=False)
|
|
701
|
+
assert await document_store.count_documents_async() == 0
|
|
702
|
+
|
|
703
|
+
# verify index still exists and can accept new documents and retrieve
|
|
704
|
+
new_doc = Document(id="4", content="New document after delete all")
|
|
705
|
+
await document_store.write_documents_async([new_doc])
|
|
706
|
+
assert await document_store.count_documents_async() == 1
|
|
707
|
+
|
|
708
|
+
results = await document_store.filter_documents_async()
|
|
709
|
+
assert len(results) == 1
|
|
710
|
+
assert results[0].id == "4"
|
|
711
|
+
assert results[0].content == "New document after delete all"
|
|
712
|
+
|
|
713
|
+
@pytest.mark.asyncio
|
|
714
|
+
async def test_delete_all_documents_async_index_recreation(self, document_store):
|
|
715
|
+
# populate the index with some documents
|
|
716
|
+
docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
|
|
717
|
+
await document_store.write_documents_async(docs)
|
|
718
|
+
|
|
719
|
+
# capture index structure before deletion
|
|
720
|
+
assert document_store._async_client is not None
|
|
721
|
+
index_info_before = await document_store._async_client.indices.get(index=document_store._index)
|
|
722
|
+
mappings_before = index_info_before[document_store._index]["mappings"]
|
|
723
|
+
settings_before = index_info_before[document_store._index]["settings"]
|
|
724
|
+
|
|
725
|
+
# delete all documents with index recreation
|
|
726
|
+
await document_store.delete_all_documents_async(recreate_index=True)
|
|
727
|
+
assert await document_store.count_documents_async() == 0
|
|
728
|
+
|
|
729
|
+
# verify index structure is preserved
|
|
730
|
+
index_info_after = await document_store._async_client.indices.get(index=document_store._index)
|
|
731
|
+
mappings_after = index_info_after[document_store._index]["mappings"]
|
|
732
|
+
assert mappings_after == mappings_before, "delete_all_documents_async should preserve index mappings"
|
|
733
|
+
|
|
734
|
+
settings_after = index_info_after[document_store._index]["settings"]
|
|
735
|
+
settings_after["index"].pop("uuid", None)
|
|
736
|
+
settings_after["index"].pop("creation_date", None)
|
|
737
|
+
settings_before["index"].pop("uuid", None)
|
|
738
|
+
settings_before["index"].pop("creation_date", None)
|
|
739
|
+
assert settings_after == settings_before, "delete_all_documents_async should preserve index settings"
|
|
740
|
+
|
|
741
|
+
# verify index can accept new documents and retrieve
|
|
742
|
+
new_doc = Document(id="4", content="New document after delete all")
|
|
743
|
+
await document_store.write_documents_async([new_doc])
|
|
744
|
+
assert await document_store.count_documents_async() == 1
|
|
745
|
+
|
|
746
|
+
results = await document_store.filter_documents_async()
|
|
747
|
+
assert len(results) == 1
|
|
748
|
+
assert results[0].content == "New document after delete all"
|
|
749
|
+
|
|
750
|
+
@pytest.mark.asyncio
|
|
751
|
+
async def test_delete_all_documents_async_no_index_recreation(self, document_store):
|
|
752
|
+
docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
|
|
753
|
+
await document_store.write_documents_async(docs)
|
|
754
|
+
assert await document_store.count_documents_async() == 2
|
|
755
|
+
|
|
756
|
+
await document_store.delete_all_documents_async(recreate_index=False)
|
|
757
|
+
# Need to wait for the deletion to be reflected in count_documents
|
|
758
|
+
time.sleep(2)
|
|
759
|
+
assert await document_store.count_documents_async() == 0
|
|
760
|
+
|
|
761
|
+
new_doc = Document(id="3", content="New document after delete all")
|
|
762
|
+
await document_store.write_documents_async([new_doc])
|
|
763
|
+
assert await document_store.count_documents_async() == 1
|
{elasticsearch_haystack-3.1.0 → elasticsearch_haystack-4.1.0}/tests/test_embedding_retriever.py
RENAMED
|
@@ -37,6 +37,20 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
37
37
|
"init_parameters": {
|
|
38
38
|
"document_store": {
|
|
39
39
|
"init_parameters": {
|
|
40
|
+
"api_key": {
|
|
41
|
+
"env_vars": [
|
|
42
|
+
"ELASTIC_API_KEY",
|
|
43
|
+
],
|
|
44
|
+
"strict": False,
|
|
45
|
+
"type": "env_var",
|
|
46
|
+
},
|
|
47
|
+
"api_key_id": {
|
|
48
|
+
"env_vars": [
|
|
49
|
+
"ELASTIC_API_KEY_ID",
|
|
50
|
+
],
|
|
51
|
+
"strict": False,
|
|
52
|
+
"type": "env_var",
|
|
53
|
+
},
|
|
40
54
|
"hosts": "some fake host",
|
|
41
55
|
"custom_mapping": None,
|
|
42
56
|
"index": "default",
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
[](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml)
|
|
2
|
-
|
|
3
|
-
[](https://pypi.org/project/elasticsearch-haystack)
|
|
4
|
-
[](https://pypi.org/project/elasticsearch-haystack)
|
|
5
|
-
|
|
6
|
-
# Elasticsearch Document Store
|
|
7
|
-
|
|
8
|
-
Document Store for Haystack 2.x, supports ElasticSearch 8.
|
|
9
|
-
|
|
10
|
-
## Installation
|
|
11
|
-
|
|
12
|
-
```console
|
|
13
|
-
pip install elasticsearch-haystack
|
|
14
|
-
```
|
|
15
|
-
|
|
16
|
-
## Testing
|
|
17
|
-
|
|
18
|
-
To run tests first start a Docker container running ElasticSearch. We provide a utility `docker-compose.yml` for that:
|
|
19
|
-
|
|
20
|
-
```console
|
|
21
|
-
docker-compose up
|
|
22
|
-
```
|
|
23
|
-
|
|
24
|
-
Then run tests:
|
|
25
|
-
|
|
26
|
-
```console
|
|
27
|
-
hatch run test:all
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
## License
|
|
31
|
-
|
|
32
|
-
`elasticsearch-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|