elasticsearch-haystack 3.0.1__tar.gz → 4.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/CHANGELOG.md +24 -3
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/PKG-INFO +9 -25
- elasticsearch_haystack-4.0.0/README.md +16 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/pyproject.toml +27 -31
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +6 -2
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +4 -2
- elasticsearch_haystack-4.0.0/src/haystack_integrations/components/retrievers/py.typed +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +89 -15
- elasticsearch_haystack-4.0.0/src/haystack_integrations/document_stores/py.typed +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/tests/test_bm25_retriever.py +15 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/tests/test_document_store.py +144 -2
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/tests/test_embedding_retriever.py +14 -0
- elasticsearch_haystack-3.0.1/README.md +0 -32
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/.gitignore +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/LICENSE +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/pydoc/config.yml +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/tests/test_filters.py +0 -0
|
@@ -1,7 +1,31 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/elasticsearch-v3.1.0] - 2025-06-12
|
|
4
|
+
|
|
5
|
+
### 🐛 Bug Fixes
|
|
6
|
+
|
|
7
|
+
- Fix Elasticsearch types + add py.typed (#1923)
|
|
8
|
+
|
|
9
|
+
### 🧹 Chores
|
|
10
|
+
|
|
11
|
+
- Align core-integrations Hatch scripts (#1898)
|
|
12
|
+
- Update md files for new hatch scripts (#1911)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
## [integrations/elasticsearch-v3.0.1] - 2025-05-27
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
### ⚙️ CI
|
|
19
|
+
|
|
20
|
+
- Review testing workflows (#1541)
|
|
21
|
+
|
|
22
|
+
### 🌀 Miscellaneous
|
|
23
|
+
|
|
24
|
+
- Pinning lower versions of haystack and `aiohttp` for `ElasticSearch` (#1827)
|
|
25
|
+
|
|
3
26
|
## [integrations/elasticsearch-v3.0.0] - 2025-03-11
|
|
4
27
|
|
|
28
|
+
|
|
5
29
|
### 🧹 Chores
|
|
6
30
|
|
|
7
31
|
- Use Haystack logging across integrations (#1484)
|
|
@@ -21,9 +45,6 @@
|
|
|
21
45
|
|
|
22
46
|
- Remove Python 3.8 support (#1421)
|
|
23
47
|
|
|
24
|
-
### 🌀 Miscellaneous
|
|
25
|
-
|
|
26
|
-
- Docs: update changelog for integrations/elasticsearch (#1400)
|
|
27
48
|
|
|
28
49
|
## [integrations/elasticsearch-v2.0.0] - 2025-02-14
|
|
29
50
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version:
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -24,35 +24,19 @@ Requires-Dist: elasticsearch<9,>=8
|
|
|
24
24
|
Requires-Dist: haystack-ai>=2.4.0
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
|
|
27
|
-
|
|
27
|
+
# elasticsearch-haystack
|
|
28
28
|
|
|
29
29
|
[![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
|
|
30
30
|
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
|
|
31
31
|
|
|
32
|
-
|
|
32
|
+
- [Integration page](https://haystack.deepset.ai/integrations/elasticsearch-document-store)
|
|
33
|
+
- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/elasticsearch/CHANGELOG.md)
|
|
33
34
|
|
|
34
|
-
|
|
35
|
+
---
|
|
35
36
|
|
|
36
|
-
##
|
|
37
|
+
## Contributing
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
pip install elasticsearch-haystack
|
|
40
|
-
```
|
|
39
|
+
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
To run tests first start a Docker container running ElasticSearch. We provide a utility `docker-compose.yml` for that:
|
|
45
|
-
|
|
46
|
-
```console
|
|
47
|
-
docker-compose up
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
Then run tests:
|
|
51
|
-
|
|
52
|
-
```console
|
|
53
|
-
hatch run test
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
## License
|
|
57
|
-
|
|
58
|
-
`elasticsearch-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
|
|
41
|
+
To run integration tests locally, you need a Docker container running ElasticSearch.
|
|
42
|
+
Use the provided `docker-compose.yml` file to start the container: `docker compose up -d`.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# elasticsearch-haystack
|
|
2
|
+
|
|
3
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
|
|
4
|
+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
|
|
5
|
+
|
|
6
|
+
- [Integration page](https://haystack.deepset.ai/integrations/elasticsearch-document-store)
|
|
7
|
+
- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/elasticsearch/CHANGELOG.md)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Contributing
|
|
12
|
+
|
|
13
|
+
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
|
|
14
|
+
|
|
15
|
+
To run integration tests locally, you need a Docker container running ElasticSearch.
|
|
16
|
+
Use the provided `docker-compose.yml` file to start the container: `docker compose up -d`.
|
|
@@ -47,42 +47,41 @@ git_describe_command = 'git describe --tags --match="integrations/elasticsearch-
|
|
|
47
47
|
|
|
48
48
|
[tool.hatch.envs.default]
|
|
49
49
|
installer = "uv"
|
|
50
|
-
dependencies = [
|
|
51
|
-
|
|
52
|
-
"pytest",
|
|
53
|
-
"pytest-asyncio",
|
|
54
|
-
"pytest-rerunfailures",
|
|
55
|
-
"pytest-xdist",
|
|
56
|
-
"haystack-pydoc-tools",
|
|
57
|
-
]
|
|
50
|
+
dependencies = ["haystack-pydoc-tools", "ruff"]
|
|
51
|
+
|
|
58
52
|
[tool.hatch.envs.default.scripts]
|
|
59
|
-
test = "pytest {args:tests}"
|
|
60
|
-
test-cov = "coverage run -m pytest {args:tests}"
|
|
61
|
-
test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x"
|
|
62
|
-
cov-report = ["- coverage combine", "coverage report"]
|
|
63
|
-
cov = ["test-cov", "cov-report"]
|
|
64
|
-
cov-retry = ["test-cov-retry", "cov-report"]
|
|
65
53
|
docs = ["pydoc-markdown pydoc/config.yml"]
|
|
54
|
+
fmt = "ruff check --fix {args} && ruff format {args}"
|
|
55
|
+
fmt-check = "ruff check {args} && ruff format --check {args}"
|
|
66
56
|
|
|
67
|
-
[tool.hatch.envs.
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
57
|
+
[tool.hatch.envs.test]
|
|
58
|
+
dependencies = [
|
|
59
|
+
"pytest",
|
|
60
|
+
"pytest-asyncio",
|
|
61
|
+
"pytest-cov",
|
|
62
|
+
"pytest-rerunfailures",
|
|
63
|
+
"mypy",
|
|
64
|
+
"pip"
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
[tool.hatch.envs.test.scripts]
|
|
68
|
+
unit = 'pytest -m "not integration" {args:tests}'
|
|
69
|
+
integration = 'pytest -m "integration" {args:tests}'
|
|
70
|
+
all = 'pytest {args:tests}'
|
|
71
|
+
cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
|
|
72
|
+
|
|
73
|
+
types = """mypy -p haystack_integrations.document_stores.elasticsearch \
|
|
74
|
+
-p haystack_integrations.components.retrievers.elasticsearch {args}"""
|
|
71
75
|
|
|
72
|
-
[tool.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
76
|
+
[tool.mypy]
|
|
77
|
+
install_types = true
|
|
78
|
+
non_interactive = true
|
|
79
|
+
check_untyped_defs = true
|
|
80
|
+
disallow_incomplete_defs = true
|
|
77
81
|
|
|
78
82
|
[tool.hatch.metadata]
|
|
79
83
|
allow-direct-references = true
|
|
80
84
|
|
|
81
|
-
[tool.black]
|
|
82
|
-
target-version = ["py38"]
|
|
83
|
-
line-length = 120
|
|
84
|
-
skip-string-normalization = true
|
|
85
|
-
|
|
86
85
|
[tool.ruff]
|
|
87
86
|
target-version = "py38"
|
|
88
87
|
line-length = 120
|
|
@@ -164,6 +163,3 @@ markers = ["unit: unit tests", "integration: integration tests"]
|
|
|
164
163
|
asyncio_mode = "auto"
|
|
165
164
|
asyncio_default_fixture_loop_scope = "class"
|
|
166
165
|
|
|
167
|
-
[[tool.mypy.overrides]]
|
|
168
|
-
module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
|
|
169
|
-
ignore_missing_imports = true
|
|
@@ -116,7 +116,9 @@ class ElasticsearchBM25Retriever:
|
|
|
116
116
|
return default_from_dict(cls, data)
|
|
117
117
|
|
|
118
118
|
@component.output_types(documents=List[Document])
|
|
119
|
-
def run(
|
|
119
|
+
def run(
|
|
120
|
+
self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
|
|
121
|
+
) -> Dict[str, List[Document]]:
|
|
120
122
|
"""
|
|
121
123
|
Retrieve documents using the BM25 keyword-based algorithm.
|
|
122
124
|
|
|
@@ -139,7 +141,9 @@ class ElasticsearchBM25Retriever:
|
|
|
139
141
|
return {"documents": docs}
|
|
140
142
|
|
|
141
143
|
@component.output_types(documents=List[Document])
|
|
142
|
-
async def run_async(
|
|
144
|
+
async def run_async(
|
|
145
|
+
self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
|
|
146
|
+
) -> Dict[str, List[Document]]:
|
|
143
147
|
"""
|
|
144
148
|
Asynchronously retrieve documents using the BM25 keyword-based algorithm.
|
|
145
149
|
|
|
@@ -114,7 +114,9 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
114
114
|
return default_from_dict(cls, data)
|
|
115
115
|
|
|
116
116
|
@component.output_types(documents=List[Document])
|
|
117
|
-
def run(
|
|
117
|
+
def run(
|
|
118
|
+
self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
|
|
119
|
+
) -> Dict[str, List[Document]]:
|
|
118
120
|
"""
|
|
119
121
|
Retrieve documents using a vector similarity metric.
|
|
120
122
|
|
|
@@ -139,7 +141,7 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
139
141
|
@component.output_types(documents=List[Document])
|
|
140
142
|
async def run_async(
|
|
141
143
|
self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
|
|
142
|
-
):
|
|
144
|
+
) -> Dict[str, List[Document]]:
|
|
143
145
|
"""
|
|
144
146
|
Asynchronously retrieve documents using a vector similarity metric.
|
|
145
147
|
|
|
File without changes
|
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
4
5
|
from collections.abc import Mapping
|
|
5
|
-
from typing import Any, Dict, List, Literal, Optional, Union
|
|
6
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
8
|
-
|
|
9
|
-
# There are no import stubs for elastic_transport and elasticsearch so mypy fails
|
|
10
|
-
from elastic_transport import NodeConfig # type: ignore[import-not-found]
|
|
9
|
+
from elastic_transport import NodeConfig
|
|
11
10
|
from haystack import default_from_dict, default_to_dict, logging
|
|
12
11
|
from haystack.dataclasses import Document
|
|
13
12
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
14
13
|
from haystack.document_stores.types import DuplicatePolicy
|
|
14
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
15
15
|
from haystack.version import __version__ as haystack_version
|
|
16
16
|
|
|
17
|
-
from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers
|
|
17
|
+
from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers
|
|
18
18
|
|
|
19
19
|
from .filters import _normalize_filters
|
|
20
20
|
|
|
@@ -40,13 +40,16 @@ class ElasticsearchDocumentStore:
|
|
|
40
40
|
|
|
41
41
|
Usage example (Elastic Cloud):
|
|
42
42
|
```python
|
|
43
|
-
from
|
|
44
|
-
document_store = ElasticsearchDocumentStore(
|
|
43
|
+
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
44
|
+
document_store = ElasticsearchDocumentStore(
|
|
45
|
+
api_key_id=Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
|
|
46
|
+
api_key=Secret.from_env_var("ELASTIC_API_KEY", strict=False),
|
|
47
|
+
)
|
|
45
48
|
```
|
|
46
49
|
|
|
47
50
|
Usage example (self-hosted Elasticsearch instance):
|
|
48
51
|
```python
|
|
49
|
-
from
|
|
52
|
+
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
50
53
|
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
|
|
51
54
|
```
|
|
52
55
|
In the above example we connect with security disabled just to show the basic usage.
|
|
@@ -65,8 +68,10 @@ class ElasticsearchDocumentStore:
|
|
|
65
68
|
hosts: Optional[Hosts] = None,
|
|
66
69
|
custom_mapping: Optional[Dict[str, Any]] = None,
|
|
67
70
|
index: str = "default",
|
|
71
|
+
api_key: Secret = Secret.from_env_var("ELASTIC_API_KEY", strict=False), # noqa: B008
|
|
72
|
+
api_key_id: Secret = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False), # noqa: B008
|
|
68
73
|
embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
|
|
69
|
-
**kwargs,
|
|
74
|
+
**kwargs: Any,
|
|
70
75
|
):
|
|
71
76
|
"""
|
|
72
77
|
Creates a new ElasticsearchDocumentStore instance.
|
|
@@ -82,9 +87,16 @@ class ElasticsearchDocumentStore:
|
|
|
82
87
|
For the full list of supported kwargs, see the official Elasticsearch
|
|
83
88
|
[reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
|
|
84
89
|
|
|
90
|
+
Authentication is provided via Secret objects, which by default are loaded from environment variables.
|
|
91
|
+
You can either provide both `api_key_id` and `api_key`, or just `api_key` containing a base64-encoded string
|
|
92
|
+
of `id:secret`. Secret instances can also be loaded from a token using the `Secret.from_token()` method.
|
|
93
|
+
|
|
85
94
|
:param hosts: List of hosts running the Elasticsearch client.
|
|
86
95
|
:param custom_mapping: Custom mapping for the index. If not provided, a default mapping will be used.
|
|
87
96
|
:param index: Name of index in Elasticsearch.
|
|
97
|
+
:param api_key: A Secret object containing either the API key itself or a base64-encoded string with the
|
|
98
|
+
concatenated id and secret for authenticating with Elasticsearch (separated by “:”).
|
|
99
|
+
:param api_key_id: A Secret object containing the API key ID for authenticating with Elasticsearch.
|
|
88
100
|
:param embedding_similarity_function: The similarity function used to compare Documents embeddings.
|
|
89
101
|
This parameter only takes effect if the index does not yet exist and is created.
|
|
90
102
|
To choose the most appropriate function, look for information about your embedding model.
|
|
@@ -93,9 +105,11 @@ class ElasticsearchDocumentStore:
|
|
|
93
105
|
:param **kwargs: Optional arguments that `Elasticsearch` takes.
|
|
94
106
|
"""
|
|
95
107
|
self._hosts = hosts
|
|
96
|
-
self._client = None
|
|
97
|
-
self._async_client = None
|
|
108
|
+
self._client: Optional[Elasticsearch] = None
|
|
109
|
+
self._async_client: Optional[AsyncElasticsearch] = None
|
|
98
110
|
self._index = index
|
|
111
|
+
self._api_key = api_key
|
|
112
|
+
self._api_key_id = api_key_id
|
|
99
113
|
self._embedding_similarity_function = embedding_similarity_function
|
|
100
114
|
self._custom_mapping = custom_mapping
|
|
101
115
|
self._kwargs = kwargs
|
|
@@ -113,14 +127,18 @@ class ElasticsearchDocumentStore:
|
|
|
113
127
|
headers = self._kwargs.pop("headers", {})
|
|
114
128
|
headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
|
|
115
129
|
|
|
130
|
+
api_key = self._handle_auth()
|
|
131
|
+
|
|
116
132
|
# Initialize both sync and async clients
|
|
117
133
|
self._client = Elasticsearch(
|
|
118
134
|
self._hosts,
|
|
135
|
+
api_key=api_key,
|
|
119
136
|
headers=headers,
|
|
120
137
|
**self._kwargs,
|
|
121
138
|
)
|
|
122
139
|
self._async_client = AsyncElasticsearch(
|
|
123
140
|
self._hosts,
|
|
141
|
+
api_key=api_key,
|
|
124
142
|
headers=headers,
|
|
125
143
|
**self._kwargs,
|
|
126
144
|
)
|
|
@@ -160,12 +178,56 @@ class ElasticsearchDocumentStore:
|
|
|
160
178
|
|
|
161
179
|
self._initialized = True
|
|
162
180
|
|
|
181
|
+
def _handle_auth(self) -> Optional[Union[str, Tuple[str, str]]]:
|
|
182
|
+
"""
|
|
183
|
+
Handles authentication for the Elasticsearch client.
|
|
184
|
+
|
|
185
|
+
There are three possible scenarios.
|
|
186
|
+
|
|
187
|
+
1) Authentication with both api_key and api_key_id, either as Secrets or as environment variables. In this case,
|
|
188
|
+
use both for authentication.
|
|
189
|
+
|
|
190
|
+
2) Authentication with only api_key, either as a Secret or as an environment variable. In this case, the api_key
|
|
191
|
+
must be a base64-encoded string that encodes both id and secret <id:secret>.
|
|
192
|
+
|
|
193
|
+
3) There's no authentication: neither api_key nor api_key_id is provided as a Secret or defined as
|
|
194
|
+
environment variables. In this case, the client will connect without authentication.
|
|
195
|
+
|
|
196
|
+
:returns:
|
|
197
|
+
api_key: Optional[Union[str, Tuple[str, str]]]
|
|
198
|
+
|
|
199
|
+
"""
|
|
200
|
+
|
|
201
|
+
api_key: Optional[Union[str, Tuple[str, str]]] # make the type checker happy
|
|
202
|
+
|
|
203
|
+
api_key_resolved = self._api_key.resolve_value()
|
|
204
|
+
api_key_id_resolved = self._api_key_id.resolve_value()
|
|
205
|
+
|
|
206
|
+
# Scenario 1: both are found, use them
|
|
207
|
+
if api_key_id_resolved and api_key_resolved:
|
|
208
|
+
api_key = (api_key_id_resolved, api_key_resolved)
|
|
209
|
+
return api_key
|
|
210
|
+
|
|
211
|
+
# Scenario 2: only api_key is set, must be a base64-encoded string that encodes id and secret (separated by “:”)
|
|
212
|
+
elif api_key_resolved and not api_key_id_resolved:
|
|
213
|
+
return api_key_resolved
|
|
214
|
+
|
|
215
|
+
# Error: only api_key_id is found, raise an error
|
|
216
|
+
elif api_key_id_resolved and not api_key_resolved:
|
|
217
|
+
msg = "api_key_id is provided but api_key is missing."
|
|
218
|
+
raise ValueError(msg)
|
|
219
|
+
|
|
220
|
+
else:
|
|
221
|
+
# Scenario 3: neither found, no authentication
|
|
222
|
+
return None
|
|
223
|
+
|
|
163
224
|
@property
|
|
164
225
|
def client(self) -> Elasticsearch:
|
|
165
226
|
"""
|
|
166
227
|
Returns the synchronous Elasticsearch client, initializing it if necessary.
|
|
167
228
|
"""
|
|
168
229
|
self._ensure_initialized()
|
|
230
|
+
assert self._client is not None # noqa: S101
|
|
169
231
|
return self._client
|
|
170
232
|
|
|
171
233
|
@property
|
|
@@ -174,6 +236,7 @@ class ElasticsearchDocumentStore:
|
|
|
174
236
|
Returns the asynchronous Elasticsearch client, initializing it if necessary.
|
|
175
237
|
"""
|
|
176
238
|
self._ensure_initialized()
|
|
239
|
+
assert self._async_client is not None # noqa: S101
|
|
177
240
|
return self._async_client
|
|
178
241
|
|
|
179
242
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -191,6 +254,8 @@ class ElasticsearchDocumentStore:
|
|
|
191
254
|
hosts=self._hosts,
|
|
192
255
|
custom_mapping=self._custom_mapping,
|
|
193
256
|
index=self._index,
|
|
257
|
+
api_key=self._api_key.to_dict(),
|
|
258
|
+
api_key_id=self._api_key_id.to_dict(),
|
|
194
259
|
embedding_similarity_function=self._embedding_similarity_function,
|
|
195
260
|
**self._kwargs,
|
|
196
261
|
)
|
|
@@ -205,6 +270,7 @@ class ElasticsearchDocumentStore:
|
|
|
205
270
|
:returns:
|
|
206
271
|
Deserialized component.
|
|
207
272
|
"""
|
|
273
|
+
deserialize_secrets_inplace(data, keys=["api_key", "api_key_id"])
|
|
208
274
|
return default_from_dict(cls, data)
|
|
209
275
|
|
|
210
276
|
def count_documents(self) -> int:
|
|
@@ -226,7 +292,7 @@ class ElasticsearchDocumentStore:
|
|
|
226
292
|
result = await self._async_client.count(index=self._index) # type: ignore
|
|
227
293
|
return result["count"]
|
|
228
294
|
|
|
229
|
-
def _search_documents(self, **kwargs) -> List[Document]:
|
|
295
|
+
def _search_documents(self, **kwargs: Any) -> List[Document]:
|
|
230
296
|
"""
|
|
231
297
|
Calls the Elasticsearch client's search method and handles pagination.
|
|
232
298
|
"""
|
|
@@ -253,7 +319,7 @@ class ElasticsearchDocumentStore:
|
|
|
253
319
|
break
|
|
254
320
|
return documents
|
|
255
321
|
|
|
256
|
-
async def _search_documents_async(self, **kwargs) -> List[Document]:
|
|
322
|
+
async def _search_documents_async(self, **kwargs: Any) -> List[Document]:
|
|
257
323
|
"""
|
|
258
324
|
Asynchronously calls the Elasticsearch client's search method and handles pagination.
|
|
259
325
|
"""
|
|
@@ -379,9 +445,12 @@ class ElasticsearchDocumentStore:
|
|
|
379
445
|
refresh="wait_for",
|
|
380
446
|
index=self._index,
|
|
381
447
|
raise_on_error=False,
|
|
448
|
+
stats_only=False,
|
|
382
449
|
)
|
|
383
450
|
|
|
384
451
|
if errors:
|
|
452
|
+
# with stats_only=False, errors is guaranteed to be a list of dicts
|
|
453
|
+
assert isinstance(errors, list) # noqa: S101
|
|
385
454
|
duplicate_errors_ids = []
|
|
386
455
|
other_errors = []
|
|
387
456
|
for e in errors:
|
|
@@ -451,13 +520,16 @@ class ElasticsearchDocumentStore:
|
|
|
451
520
|
|
|
452
521
|
try:
|
|
453
522
|
success, failed = await helpers.async_bulk(
|
|
454
|
-
client=self.
|
|
523
|
+
client=self.async_client,
|
|
455
524
|
actions=actions,
|
|
456
525
|
index=self._index,
|
|
457
526
|
refresh=True,
|
|
458
527
|
raise_on_error=False,
|
|
528
|
+
stats_only=False,
|
|
459
529
|
)
|
|
460
530
|
if failed:
|
|
531
|
+
# with stats_only=False, failed is guaranteed to be a list of dicts
|
|
532
|
+
assert isinstance(failed, list) # noqa: S101
|
|
461
533
|
if policy == DuplicatePolicy.FAIL:
|
|
462
534
|
for error in failed:
|
|
463
535
|
if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
|
|
@@ -494,7 +566,7 @@ class ElasticsearchDocumentStore:
|
|
|
494
566
|
|
|
495
567
|
try:
|
|
496
568
|
await helpers.async_bulk(
|
|
497
|
-
client=self.
|
|
569
|
+
client=self.async_client,
|
|
498
570
|
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
|
|
499
571
|
index=self._index,
|
|
500
572
|
refresh=True,
|
|
@@ -551,6 +623,8 @@ class ElasticsearchDocumentStore:
|
|
|
551
623
|
|
|
552
624
|
if scale_score:
|
|
553
625
|
for doc in documents:
|
|
626
|
+
if doc.score is None:
|
|
627
|
+
continue
|
|
554
628
|
doc.score = float(1 / (1 + np.exp(-np.asarray(doc.score / BM25_SCALING_FACTOR))))
|
|
555
629
|
|
|
556
630
|
return documents
|
|
File without changes
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
4
5
|
from unittest.mock import Mock, patch
|
|
5
6
|
|
|
6
7
|
import pytest
|
|
@@ -38,6 +39,20 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
38
39
|
"document_store": {
|
|
39
40
|
"init_parameters": {
|
|
40
41
|
"hosts": "some fake host",
|
|
42
|
+
"api_key": {
|
|
43
|
+
"env_vars": [
|
|
44
|
+
"ELASTIC_API_KEY",
|
|
45
|
+
],
|
|
46
|
+
"strict": False,
|
|
47
|
+
"type": "env_var",
|
|
48
|
+
},
|
|
49
|
+
"api_key_id": {
|
|
50
|
+
"env_vars": [
|
|
51
|
+
"ELASTIC_API_KEY_ID",
|
|
52
|
+
],
|
|
53
|
+
"strict": False,
|
|
54
|
+
"type": "env_var",
|
|
55
|
+
},
|
|
41
56
|
"custom_mapping": None,
|
|
42
57
|
"index": "default",
|
|
43
58
|
"embedding_similarity_function": "cosine",
|
|
@@ -13,6 +13,8 @@ from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
|
13
13
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
14
14
|
from haystack.document_stores.types import DuplicatePolicy
|
|
15
15
|
from haystack.testing.document_store import DocumentStoreBaseTests
|
|
16
|
+
from haystack.utils import Secret
|
|
17
|
+
from haystack.utils.auth import TokenSecret
|
|
16
18
|
|
|
17
19
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
18
20
|
|
|
@@ -46,6 +48,20 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
46
48
|
assert res == {
|
|
47
49
|
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
48
50
|
"init_parameters": {
|
|
51
|
+
"api_key": {
|
|
52
|
+
"env_vars": [
|
|
53
|
+
"ELASTIC_API_KEY",
|
|
54
|
+
],
|
|
55
|
+
"strict": False,
|
|
56
|
+
"type": "env_var",
|
|
57
|
+
},
|
|
58
|
+
"api_key_id": {
|
|
59
|
+
"env_vars": [
|
|
60
|
+
"ELASTIC_API_KEY_ID",
|
|
61
|
+
],
|
|
62
|
+
"strict": False,
|
|
63
|
+
"type": "env_var",
|
|
64
|
+
},
|
|
49
65
|
"hosts": "some hosts",
|
|
50
66
|
"custom_mapping": None,
|
|
51
67
|
"index": "default",
|
|
@@ -62,6 +78,8 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
62
78
|
"hosts": "some hosts",
|
|
63
79
|
"custom_mapping": None,
|
|
64
80
|
"index": "default",
|
|
81
|
+
"api_key": None,
|
|
82
|
+
"api_key_id": None,
|
|
65
83
|
"embedding_similarity_function": "cosine",
|
|
66
84
|
},
|
|
67
85
|
}
|
|
@@ -69,9 +87,135 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
69
87
|
assert document_store._hosts == "some hosts"
|
|
70
88
|
assert document_store._index == "default"
|
|
71
89
|
assert document_store._custom_mapping is None
|
|
90
|
+
assert document_store._api_key is None
|
|
91
|
+
assert document_store._api_key_id is None
|
|
72
92
|
assert document_store._embedding_similarity_function == "cosine"
|
|
73
93
|
|
|
74
94
|
|
|
95
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
96
|
+
def test_to_dict_with_api_keys_env_vars(_mock_elasticsearch_client, monkeypatch):
|
|
97
|
+
monkeypatch.setenv("ELASTIC_API_KEY", "test-api-key")
|
|
98
|
+
monkeypatch.setenv("ELASTIC_API_KEY_ID", "test-api-key-id")
|
|
99
|
+
document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200")
|
|
100
|
+
document_store.client()
|
|
101
|
+
res = document_store.to_dict()
|
|
102
|
+
assert res["init_parameters"]["api_key"] == {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False}
|
|
103
|
+
assert res["init_parameters"]["api_key_id"] == {
|
|
104
|
+
"type": "env_var",
|
|
105
|
+
"env_vars": ["ELASTIC_API_KEY_ID"],
|
|
106
|
+
"strict": False,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
111
|
+
def test_to_dict_with_api_keys_as_secret(_mock_elasticsearch_client, monkeypatch):
|
|
112
|
+
monkeypatch.setenv("ELASTIC_API_KEY", "test-api-key")
|
|
113
|
+
monkeypatch.setenv("ELASTIC_API_KEY_ID", "test-api-key-id")
|
|
114
|
+
with pytest.raises(ValueError):
|
|
115
|
+
document_store = ElasticsearchDocumentStore(
|
|
116
|
+
hosts="https://localhost:9200",
|
|
117
|
+
api_key=TokenSecret(_token="test-api-key"),
|
|
118
|
+
api_key_id=TokenSecret(_token="test-api-key-id"),
|
|
119
|
+
)
|
|
120
|
+
document_store.client()
|
|
121
|
+
_ = document_store.to_dict()
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
125
|
+
def test_from_dict_with_api_keys_env_vars(_mock_elasticsearch_client):
|
|
126
|
+
data = {
|
|
127
|
+
"type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore",
|
|
128
|
+
"init_parameters": {
|
|
129
|
+
"hosts": "some hosts",
|
|
130
|
+
"custom_mapping": None,
|
|
131
|
+
"index": "default",
|
|
132
|
+
"api_key": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False},
|
|
133
|
+
"api_key_id": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY_ID"], "strict": False},
|
|
134
|
+
"embedding_similarity_function": "cosine",
|
|
135
|
+
},
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
document_store = ElasticsearchDocumentStore.from_dict(data)
|
|
139
|
+
assert document_store._api_key == {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False}
|
|
140
|
+
assert document_store._api_key_id == {"type": "env_var", "env_vars": ["ELASTIC_API_KEY_ID"], "strict": False}
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
144
|
+
def test_api_key_validation_only_api_key(_mock_elasticsearch_client):
|
|
145
|
+
api_key = Secret.from_token("test_api_key")
|
|
146
|
+
|
|
147
|
+
document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key=api_key)
|
|
148
|
+
document_store.client()
|
|
149
|
+
assert document_store._api_key == api_key
|
|
150
|
+
# not passing the api_key_id makes it default to reading from env var
|
|
151
|
+
assert document_store._api_key_id == Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
155
|
+
def test_api_key_validation_only_api_key_id_raises_error(_mock_elasticsearch_client):
|
|
156
|
+
api_key_id = Secret.from_token("test_api_key_id")
|
|
157
|
+
with pytest.raises(ValueError, match="api_key_id is provided but api_key is missing"):
|
|
158
|
+
es = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key_id=api_key_id)
|
|
159
|
+
es.client()
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
163
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
|
|
164
|
+
def test_client_initialization_with_api_key_tuple(_mock_async_es, _mock_es):
|
|
165
|
+
api_key = Secret.from_token("test_api_key")
|
|
166
|
+
api_key_id = Secret.from_token("test_api_key_id")
|
|
167
|
+
|
|
168
|
+
# Mock the client.info() call to avoid actual connection
|
|
169
|
+
mock_client = Mock()
|
|
170
|
+
mock_client.info.return_value = {"version": {"number": "8.0.0"}}
|
|
171
|
+
_mock_es.return_value = mock_client
|
|
172
|
+
|
|
173
|
+
document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key=api_key, api_key_id=api_key_id)
|
|
174
|
+
|
|
175
|
+
# Access client to trigger initialization
|
|
176
|
+
_ = document_store.client
|
|
177
|
+
|
|
178
|
+
# Check that Elasticsearch was called with the correct api_key tuple
|
|
179
|
+
_mock_es.assert_called_once()
|
|
180
|
+
call_args = _mock_es.call_args
|
|
181
|
+
assert call_args[0][0] == "https://localhost:9200" # hosts
|
|
182
|
+
assert call_args[1]["api_key"] == ("test_api_key_id", "test_api_key")
|
|
183
|
+
|
|
184
|
+
# Check that AsyncElasticsearch was called with the same api_key tuple
|
|
185
|
+
_mock_async_es.assert_called_once()
|
|
186
|
+
async_call_args = _mock_async_es.call_args
|
|
187
|
+
assert async_call_args[0][0] == "https://localhost:9200" # hosts
|
|
188
|
+
assert async_call_args[1]["api_key"] == ("test_api_key_id", "test_api_key")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
192
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
|
|
193
|
+
def test_client_initialization_with_api_key_string(_mock_async_es, _mock_es):
|
|
194
|
+
api_key = Secret.from_token("test_api_key")
|
|
195
|
+
|
|
196
|
+
# Mock the client.info() call to avoid actual connection
|
|
197
|
+
mock_client = Mock()
|
|
198
|
+
mock_client.info.return_value = {"version": {"number": "8.0.0"}}
|
|
199
|
+
_mock_es.return_value = mock_client
|
|
200
|
+
|
|
201
|
+
document_store = ElasticsearchDocumentStore(hosts="testhost", api_key=api_key)
|
|
202
|
+
|
|
203
|
+
# Access client to trigger initialization
|
|
204
|
+
_ = document_store.client
|
|
205
|
+
|
|
206
|
+
# Check that Elasticsearch was called with the correct api_key string
|
|
207
|
+
_mock_es.assert_called_once()
|
|
208
|
+
call_args = _mock_es.call_args
|
|
209
|
+
assert call_args[0][0] == "testhost" # hosts
|
|
210
|
+
assert call_args[1]["api_key"] == "test_api_key"
|
|
211
|
+
|
|
212
|
+
# Check that AsyncElasticsearch was called with the same api_key string
|
|
213
|
+
_mock_async_es.assert_called_once()
|
|
214
|
+
async_call_args = _mock_async_es.call_args
|
|
215
|
+
assert async_call_args[0][0] == "testhost" # hosts
|
|
216
|
+
assert async_call_args[1]["api_key"] == "test_api_key"
|
|
217
|
+
|
|
218
|
+
|
|
75
219
|
@pytest.mark.integration
|
|
76
220
|
class TestDocumentStore(DocumentStoreBaseTests):
|
|
77
221
|
"""
|
|
@@ -342,7 +486,6 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
342
486
|
|
|
343
487
|
@pytest.mark.integration
|
|
344
488
|
class TestElasticsearchDocumentStoreAsync:
|
|
345
|
-
|
|
346
489
|
@pytest.fixture
|
|
347
490
|
async def document_store(self, request):
|
|
348
491
|
"""
|
|
@@ -410,7 +553,6 @@ class TestElasticsearchDocumentStoreAsync:
|
|
|
410
553
|
|
|
411
554
|
@pytest.mark.asyncio
|
|
412
555
|
async def test_embedding_retrieval_async(self, document_store):
|
|
413
|
-
|
|
414
556
|
# init document store
|
|
415
557
|
docs = [
|
|
416
558
|
Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]),
|
{elasticsearch_haystack-3.0.1 → elasticsearch_haystack-4.0.0}/tests/test_embedding_retriever.py
RENAMED
|
@@ -37,6 +37,20 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
37
37
|
"init_parameters": {
|
|
38
38
|
"document_store": {
|
|
39
39
|
"init_parameters": {
|
|
40
|
+
"api_key": {
|
|
41
|
+
"env_vars": [
|
|
42
|
+
"ELASTIC_API_KEY",
|
|
43
|
+
],
|
|
44
|
+
"strict": False,
|
|
45
|
+
"type": "env_var",
|
|
46
|
+
},
|
|
47
|
+
"api_key_id": {
|
|
48
|
+
"env_vars": [
|
|
49
|
+
"ELASTIC_API_KEY_ID",
|
|
50
|
+
],
|
|
51
|
+
"strict": False,
|
|
52
|
+
"type": "env_var",
|
|
53
|
+
},
|
|
40
54
|
"hosts": "some fake host",
|
|
41
55
|
"custom_mapping": None,
|
|
42
56
|
"index": "default",
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
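[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml)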
|
|
2
|
-
|
|
3
|
-
[![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
|
|
4
|
-
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
|
|
5
|
-
|
|
6
|
-
# Elasticsearch Document Store
|
|
7
|
-
|
|
8
|
-
Document Store for Haystack 2.x, supports ElasticSearch 8.
|
|
9
|
-
|
|
10
|
-
## Installation
|
|
11
|
-
|
|
12
|
-
```console
|
|
13
|
-
pip install elasticsearch-haystack
|
|
14
|
-
```
|
|
15
|
-
|
|
16
|
-
## Testing
|
|
17
|
-
|
|
18
|
-
To run tests first start a Docker container running ElasticSearch. We provide a utility `docker-compose.yml` for that:
|
|
19
|
-
|
|
20
|
-
```console
|
|
21
|
-
docker-compose up
|
|
22
|
-
```
|
|
23
|
-
|
|
24
|
-
Then run tests:
|
|
25
|
-
|
|
26
|
-
```console
|
|
27
|
-
hatch run test
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
## License
|
|
31
|
-
|
|
32
|
-
`elasticsearch-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|