elasticsearch-haystack 0.5.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- elasticsearch_haystack-0.7.0/CHANGELOG.md +80 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/PKG-INFO +1 -1
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/pyproject.toml +24 -55
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +13 -3
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +13 -3
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +47 -35
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_bm25_retriever.py +12 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_document_store.py +9 -3
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_embedding_retriever.py +10 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/.gitignore +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/LICENSE +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/README.md +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/pydoc/config.yml +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_filters.py +0 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [unreleased]
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- Defer the database connection to when it's needed (#766)
|
|
8
|
+
|
|
9
|
+
## [integrations/elasticsearch-v0.5.0] - 2024-05-24
|
|
10
|
+
|
|
11
|
+
### 🐛 Bug Fixes
|
|
12
|
+
|
|
13
|
+
- Add support for custom mapping in ElasticsearchDocumentStore (#721)
|
|
14
|
+
|
|
15
|
+
## [integrations/elasticsearch-v0.4.0] - 2024-04-03
|
|
16
|
+
|
|
17
|
+
### 📚 Documentation
|
|
18
|
+
|
|
19
|
+
- Docstring update (#525)
|
|
20
|
+
- Review Elastic (#541)
|
|
21
|
+
- Disable-class-def (#556)
|
|
22
|
+
|
|
23
|
+
## [integrations/elasticsearch-v0.3.0] - 2024-02-23
|
|
24
|
+
|
|
25
|
+
### 🐛 Bug Fixes
|
|
26
|
+
|
|
27
|
+
- Fix order of API docs (#447)
|
|
28
|
+
|
|
29
|
+
This PR will also push the docs to Readme
|
|
30
|
+
|
|
31
|
+
### 📚 Documentation
|
|
32
|
+
|
|
33
|
+
- Update category slug (#442)
|
|
34
|
+
|
|
35
|
+
### Elasticsearch
|
|
36
|
+
|
|
37
|
+
- Add user-agent header (#457)
|
|
38
|
+
|
|
39
|
+
### Feat
|
|
40
|
+
|
|
41
|
+
- Add filters to run function in retrievers of elasticsearch (#440)
|
|
42
|
+
|
|
43
|
+
### Elasticsearch
|
|
44
|
+
|
|
45
|
+
- Generate api docs (#322)
|
|
46
|
+
|
|
47
|
+
## [integrations/elasticsearch-v0.2.0] - 2024-01-19
|
|
48
|
+
|
|
49
|
+
## [integrations/elasticsearch-v0.1.3] - 2024-01-18
|
|
50
|
+
|
|
51
|
+
## [integrations/elasticsearch-v0.1.2] - 2023-12-20
|
|
52
|
+
|
|
53
|
+
### 🐛 Bug Fixes
|
|
54
|
+
|
|
55
|
+
- Fix project urls (#96)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
### 🚜 Refactor
|
|
60
|
+
|
|
61
|
+
- Use `hatch_vcs` to manage integrations versioning (#103)
|
|
62
|
+
|
|
63
|
+
## [integrations/elasticsearch-v0.1.1] - 2023-12-05
|
|
64
|
+
|
|
65
|
+
### 🐛 Bug Fixes
|
|
66
|
+
|
|
67
|
+
- Fix import and increase version (#77)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
## [integrations/elasticsearch-v0.1.0] - 2023-12-04
|
|
72
|
+
|
|
73
|
+
### 🐛 Bug Fixes
|
|
74
|
+
|
|
75
|
+
- Fix license headers
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
## [integrations/elasticsearch-v0.0.2] - 2023-11-29
|
|
79
|
+
|
|
80
|
+
<!-- generated by git-cliff -->
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -10,9 +10,7 @@ readme = "README.md"
|
|
|
10
10
|
requires-python = ">=3.8"
|
|
11
11
|
license = "Apache-2.0"
|
|
12
12
|
keywords = []
|
|
13
|
-
authors = [
|
|
14
|
-
{ name = "Silvano Cerza", email = "silvanocerza@gmail.com" },
|
|
15
|
-
]
|
|
13
|
+
authors = [{ name = "Silvano Cerza", email = "silvanocerza@gmail.com" }]
|
|
16
14
|
classifiers = [
|
|
17
15
|
"License :: OSI Approved :: Apache Software License",
|
|
18
16
|
"Development Status :: 4 - Beta",
|
|
@@ -24,10 +22,7 @@ classifiers = [
|
|
|
24
22
|
"Programming Language :: Python :: Implementation :: CPython",
|
|
25
23
|
"Programming Language :: Python :: Implementation :: PyPy",
|
|
26
24
|
]
|
|
27
|
-
dependencies = [
|
|
28
|
-
"haystack-ai",
|
|
29
|
-
"elasticsearch>=8,<9",
|
|
30
|
-
]
|
|
25
|
+
dependencies = ["haystack-ai", "elasticsearch>=8,<9"]
|
|
31
26
|
|
|
32
27
|
[project.urls]
|
|
33
28
|
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme"
|
|
@@ -49,49 +44,28 @@ git_describe_command = 'git describe --tags --match="integrations/elasticsearch-
|
|
|
49
44
|
dependencies = [
|
|
50
45
|
"coverage[toml]>=6.5",
|
|
51
46
|
"pytest",
|
|
47
|
+
"pytest-rerunfailures",
|
|
52
48
|
"pytest-xdist",
|
|
53
49
|
"haystack-pydoc-tools",
|
|
54
50
|
]
|
|
55
51
|
[tool.hatch.envs.default.scripts]
|
|
56
|
-
test = "pytest {args:tests}"
|
|
57
|
-
test-cov = "coverage run -m pytest {args:tests}"
|
|
58
|
-
cov-report = [
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
]
|
|
62
|
-
cov = [
|
|
63
|
-
"test-cov",
|
|
64
|
-
"cov-report",
|
|
65
|
-
]
|
|
66
|
-
docs = [
|
|
67
|
-
"pydoc-markdown pydoc/config.yml"
|
|
68
|
-
]
|
|
52
|
+
test = "pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
|
|
53
|
+
test-cov = "coverage run -m pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
|
|
54
|
+
cov-report = ["- coverage combine", "coverage report"]
|
|
55
|
+
cov = ["test-cov", "cov-report"]
|
|
56
|
+
docs = ["pydoc-markdown pydoc/config.yml"]
|
|
69
57
|
|
|
70
58
|
[[tool.hatch.envs.all.matrix]]
|
|
71
59
|
python = ["3.8", "3.9", "3.10", "3.11"]
|
|
72
60
|
|
|
73
61
|
[tool.hatch.envs.lint]
|
|
74
62
|
detached = true
|
|
75
|
-
dependencies = [
|
|
76
|
-
"black>=23.1.0",
|
|
77
|
-
"mypy>=1.0.0",
|
|
78
|
-
"ruff>=0.0.243",
|
|
79
|
-
]
|
|
63
|
+
dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
|
|
80
64
|
[tool.hatch.envs.lint.scripts]
|
|
81
65
|
typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
|
|
82
|
-
style = [
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
]
|
|
86
|
-
fmt = [
|
|
87
|
-
"black {args:.}",
|
|
88
|
-
"ruff --fix {args:.}",
|
|
89
|
-
"style",
|
|
90
|
-
]
|
|
91
|
-
all = [
|
|
92
|
-
"style",
|
|
93
|
-
"typing",
|
|
94
|
-
]
|
|
66
|
+
style = ["ruff check {args:.}", "black --check --diff {args:.}"]
|
|
67
|
+
fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
|
|
68
|
+
all = ["style", "typing"]
|
|
95
69
|
|
|
96
70
|
[tool.hatch.metadata]
|
|
97
71
|
allow-direct-references = true
|
|
@@ -137,9 +111,15 @@ ignore = [
|
|
|
137
111
|
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
|
138
112
|
"FBT003",
|
|
139
113
|
# Ignore checks for possible passwords
|
|
140
|
-
"S105",
|
|
114
|
+
"S105",
|
|
115
|
+
"S106",
|
|
116
|
+
"S107",
|
|
141
117
|
# Ignore complexity
|
|
142
|
-
"C901",
|
|
118
|
+
"C901",
|
|
119
|
+
"PLR0911",
|
|
120
|
+
"PLR0912",
|
|
121
|
+
"PLR0913",
|
|
122
|
+
"PLR0915",
|
|
143
123
|
]
|
|
144
124
|
unfixable = [
|
|
145
125
|
# Don't touch unused imports
|
|
@@ -164,25 +144,14 @@ parallel = false
|
|
|
164
144
|
|
|
165
145
|
[tool.coverage.report]
|
|
166
146
|
omit = ["*/tests/*", "*/__init__.py"]
|
|
167
|
-
show_missing=true
|
|
168
|
-
exclude_lines = [
|
|
169
|
-
"no cov",
|
|
170
|
-
"if __name__ == .__main__.:",
|
|
171
|
-
"if TYPE_CHECKING:",
|
|
172
|
-
]
|
|
147
|
+
show_missing = true
|
|
148
|
+
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
|
|
173
149
|
|
|
174
150
|
|
|
175
151
|
[tool.pytest.ini_options]
|
|
176
152
|
minversion = "6.0"
|
|
177
|
-
markers = [
|
|
178
|
-
"unit: unit tests",
|
|
179
|
-
"integration: integration tests"
|
|
180
|
-
]
|
|
153
|
+
markers = ["unit: unit tests", "integration: integration tests"]
|
|
181
154
|
|
|
182
155
|
[[tool.mypy.overrides]]
|
|
183
|
-
module = [
|
|
184
|
-
"haystack.*",
|
|
185
|
-
"haystack_integrations.*",
|
|
186
|
-
"pytest.*"
|
|
187
|
-
]
|
|
156
|
+
module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
|
|
188
157
|
ignore_missing_imports = true
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from haystack import component, default_from_dict, default_to_dict
|
|
7
7
|
from haystack.dataclasses import Document
|
|
8
|
+
from haystack.document_stores.types import FilterPolicy
|
|
9
|
+
from haystack.document_stores.types.filter_policy import apply_filter_policy
|
|
8
10
|
from haystack_integrations.document_stores.elasticsearch.document_store import ElasticsearchDocumentStore
|
|
9
11
|
|
|
10
12
|
|
|
@@ -48,6 +50,7 @@ class ElasticsearchBM25Retriever:
|
|
|
48
50
|
fuzziness: str = "AUTO",
|
|
49
51
|
top_k: int = 10,
|
|
50
52
|
scale_score: bool = False,
|
|
53
|
+
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
51
54
|
):
|
|
52
55
|
"""
|
|
53
56
|
Initialize ElasticsearchBM25Retriever with an instance ElasticsearchDocumentStore.
|
|
@@ -60,6 +63,7 @@ class ElasticsearchBM25Retriever:
|
|
|
60
63
|
for more details.
|
|
61
64
|
:param top_k: Maximum number of Documents to return.
|
|
62
65
|
:param scale_score: If `True` scales the Document`s scores between 0 and 1.
|
|
66
|
+
:param filter_policy: Policy to determine how filters are applied.
|
|
63
67
|
:raises ValueError: If `document_store` is not an instance of `ElasticsearchDocumentStore`.
|
|
64
68
|
"""
|
|
65
69
|
|
|
@@ -72,6 +76,7 @@ class ElasticsearchBM25Retriever:
|
|
|
72
76
|
self._fuzziness = fuzziness
|
|
73
77
|
self._top_k = top_k
|
|
74
78
|
self._scale_score = scale_score
|
|
79
|
+
self._filter_policy = FilterPolicy.from_str(filter_policy) if isinstance(filter_policy, str) else filter_policy
|
|
75
80
|
|
|
76
81
|
def to_dict(self) -> Dict[str, Any]:
|
|
77
82
|
"""
|
|
@@ -86,6 +91,7 @@ class ElasticsearchBM25Retriever:
|
|
|
86
91
|
fuzziness=self._fuzziness,
|
|
87
92
|
top_k=self._top_k,
|
|
88
93
|
scale_score=self._scale_score,
|
|
94
|
+
filter_policy=self._filter_policy.value,
|
|
89
95
|
document_store=self._document_store.to_dict(),
|
|
90
96
|
)
|
|
91
97
|
|
|
@@ -102,6 +108,7 @@ class ElasticsearchBM25Retriever:
|
|
|
102
108
|
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
|
|
103
109
|
data["init_parameters"]["document_store"]
|
|
104
110
|
)
|
|
111
|
+
data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
|
|
105
112
|
return default_from_dict(cls, data)
|
|
106
113
|
|
|
107
114
|
@component.output_types(documents=List[Document])
|
|
@@ -110,14 +117,17 @@ class ElasticsearchBM25Retriever:
|
|
|
110
117
|
Retrieve documents using the BM25 keyword-based algorithm.
|
|
111
118
|
|
|
112
119
|
:param query: String to search in `Document`s' text.
|
|
113
|
-
:param filters: Filters applied to the retrieved `Document`s.
|
|
120
|
+
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
121
|
+
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
122
|
+
details.
|
|
114
123
|
:param top_k: Maximum number of `Document` to return.
|
|
115
124
|
:returns: A dictionary with the following keys:
|
|
116
125
|
- `documents`: List of `Document`s that match the query.
|
|
117
126
|
"""
|
|
127
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
118
128
|
docs = self._document_store._bm25_retrieval(
|
|
119
129
|
query=query,
|
|
120
|
-
filters=filters or self._filters,
|
|
130
|
+
filters=filters,
|
|
121
131
|
fuzziness=self._fuzziness,
|
|
122
132
|
top_k=top_k or self._top_k,
|
|
123
133
|
scale_score=self._scale_score,
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from haystack import component, default_from_dict, default_to_dict
|
|
7
7
|
from haystack.dataclasses import Document
|
|
8
|
+
from haystack.document_stores.types import FilterPolicy
|
|
9
|
+
from haystack.document_stores.types.filter_policy import apply_filter_policy
|
|
8
10
|
from haystack_integrations.document_stores.elasticsearch.document_store import ElasticsearchDocumentStore
|
|
9
11
|
|
|
10
12
|
|
|
@@ -49,6 +51,7 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
49
51
|
filters: Optional[Dict[str, Any]] = None,
|
|
50
52
|
top_k: int = 10,
|
|
51
53
|
num_candidates: Optional[int] = None,
|
|
54
|
+
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
52
55
|
):
|
|
53
56
|
"""
|
|
54
57
|
Create the ElasticsearchEmbeddingRetriever component.
|
|
@@ -61,6 +64,7 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
61
64
|
Increasing this value will improve search accuracy at the cost of slower search speeds.
|
|
62
65
|
You can read more about it in the Elasticsearch
|
|
63
66
|
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
|
|
67
|
+
:param filter_policy: Policy to determine how filters are applied.
|
|
64
68
|
:raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
|
|
65
69
|
"""
|
|
66
70
|
if not isinstance(document_store, ElasticsearchDocumentStore):
|
|
@@ -71,6 +75,7 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
71
75
|
self._filters = filters or {}
|
|
72
76
|
self._top_k = top_k
|
|
73
77
|
self._num_candidates = num_candidates
|
|
78
|
+
self._filter_policy = FilterPolicy.from_str(filter_policy) if isinstance(filter_policy, str) else filter_policy
|
|
74
79
|
|
|
75
80
|
def to_dict(self) -> Dict[str, Any]:
|
|
76
81
|
"""
|
|
@@ -84,6 +89,7 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
84
89
|
filters=self._filters,
|
|
85
90
|
top_k=self._top_k,
|
|
86
91
|
num_candidates=self._num_candidates,
|
|
92
|
+
filter_policy=self._filter_policy.value,
|
|
87
93
|
document_store=self._document_store.to_dict(),
|
|
88
94
|
)
|
|
89
95
|
|
|
@@ -100,6 +106,7 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
100
106
|
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
|
|
101
107
|
data["init_parameters"]["document_store"]
|
|
102
108
|
)
|
|
109
|
+
data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
|
|
103
110
|
return default_from_dict(cls, data)
|
|
104
111
|
|
|
105
112
|
@component.output_types(documents=List[Document])
|
|
@@ -108,14 +115,17 @@ class ElasticsearchEmbeddingRetriever:
|
|
|
108
115
|
Retrieve documents using a vector similarity metric.
|
|
109
116
|
|
|
110
117
|
:param query_embedding: Embedding of the query.
|
|
111
|
-
:param filters: Filters applied to the retrieved `Document`s.
|
|
118
|
+
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
119
|
+
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
120
|
+
details.
|
|
112
121
|
:param top_k: Maximum number of `Document`s to return.
|
|
113
122
|
:returns: A dictionary with the following keys:
|
|
114
123
|
- `documents`: List of `Document`s most similar to the given `query_embedding`
|
|
115
124
|
"""
|
|
125
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
116
126
|
docs = self._document_store._embedding_retrieval(
|
|
117
127
|
query_embedding=query_embedding,
|
|
118
|
-
filters=filters or self._filters,
|
|
128
|
+
filters=filters,
|
|
119
129
|
top_k=top_k or self._top_k,
|
|
120
130
|
num_candidates=self._num_candidates,
|
|
121
131
|
)
|
|
@@ -93,48 +93,60 @@ class ElasticsearchDocumentStore:
|
|
|
93
93
|
:param **kwargs: Optional arguments that `Elasticsearch` takes.
|
|
94
94
|
"""
|
|
95
95
|
self._hosts = hosts
|
|
96
|
-
self._client = Elasticsearch(
|
|
97
|
-
hosts,
|
|
98
|
-
headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
|
|
99
|
-
**kwargs,
|
|
100
|
-
)
|
|
96
|
+
self._client = None
|
|
101
97
|
self._index = index
|
|
102
98
|
self._embedding_similarity_function = embedding_similarity_function
|
|
103
99
|
self._custom_mapping = custom_mapping
|
|
104
100
|
self._kwargs = kwargs
|
|
105
101
|
|
|
106
|
-
# Check client connection, this will raise if not connected
|
|
107
|
-
self._client.info()
|
|
108
|
-
|
|
109
102
|
if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
|
|
110
103
|
msg = "custom_mapping must be a dictionary"
|
|
111
104
|
raise ValueError(msg)
|
|
112
105
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
"
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
106
|
+
@property
|
|
107
|
+
def client(self) -> Elasticsearch:
|
|
108
|
+
if self._client is None:
|
|
109
|
+
client = Elasticsearch(
|
|
110
|
+
self._hosts,
|
|
111
|
+
headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
|
|
112
|
+
**self._kwargs,
|
|
113
|
+
)
|
|
114
|
+
# Check client connection, this will raise if not connected
|
|
115
|
+
client.info()
|
|
116
|
+
|
|
117
|
+
if self._custom_mapping:
|
|
118
|
+
mappings = self._custom_mapping
|
|
119
|
+
else:
|
|
120
|
+
# Configure mapping for the embedding field if none is provided
|
|
121
|
+
mappings = {
|
|
122
|
+
"properties": {
|
|
123
|
+
"embedding": {
|
|
124
|
+
"type": "dense_vector",
|
|
125
|
+
"index": True,
|
|
126
|
+
"similarity": self._embedding_similarity_function,
|
|
127
|
+
},
|
|
128
|
+
"content": {"type": "text"},
|
|
129
|
+
},
|
|
130
|
+
"dynamic_templates": [
|
|
131
|
+
{
|
|
132
|
+
"strings": {
|
|
133
|
+
"path_match": "*",
|
|
134
|
+
"match_mapping_type": "string",
|
|
135
|
+
"mapping": {
|
|
136
|
+
"type": "keyword",
|
|
137
|
+
},
|
|
138
|
+
}
|
|
130
139
|
}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
140
|
+
],
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
# Create the index if it doesn't exist
|
|
144
|
+
if not client.indices.exists(index=self._index):
|
|
145
|
+
client.indices.create(index=self._index, mappings=mappings)
|
|
146
|
+
|
|
147
|
+
self._client = client
|
|
134
148
|
|
|
135
|
-
|
|
136
|
-
if not self._client.indices.exists(index=index):
|
|
137
|
-
self._client.indices.create(index=index, mappings=mappings)
|
|
149
|
+
return self._client
|
|
138
150
|
|
|
139
151
|
def to_dict(self) -> Dict[str, Any]:
|
|
140
152
|
"""
|
|
@@ -172,7 +184,7 @@ class ElasticsearchDocumentStore:
|
|
|
172
184
|
Returns how many documents are present in the document store.
|
|
173
185
|
:returns: Number of documents in the document store.
|
|
174
186
|
"""
|
|
175
|
-
return self._client.count(index=self._index)["count"]
|
|
187
|
+
return self.client.count(index=self._index)["count"]
|
|
176
188
|
|
|
177
189
|
def _search_documents(self, **kwargs) -> List[Document]:
|
|
178
190
|
"""
|
|
@@ -187,7 +199,7 @@ class ElasticsearchDocumentStore:
|
|
|
187
199
|
from_ = 0
|
|
188
200
|
# Handle pagination
|
|
189
201
|
while True:
|
|
190
|
-
res = self._client.search(
|
|
202
|
+
res = self.client.search(
|
|
191
203
|
index=self._index,
|
|
192
204
|
from_=from_,
|
|
193
205
|
**kwargs,
|
|
@@ -261,7 +273,7 @@ class ElasticsearchDocumentStore:
|
|
|
261
273
|
)
|
|
262
274
|
|
|
263
275
|
documents_written, errors = helpers.bulk(
|
|
264
|
-
client=self._client,
|
|
276
|
+
client=self.client,
|
|
265
277
|
actions=elasticsearch_actions,
|
|
266
278
|
refresh="wait_for",
|
|
267
279
|
index=self._index,
|
|
@@ -317,7 +329,7 @@ class ElasticsearchDocumentStore:
|
|
|
317
329
|
"""
|
|
318
330
|
|
|
319
331
|
helpers.bulk(
|
|
320
|
-
client=self._client,
|
|
332
|
+
client=self.client,
|
|
321
333
|
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
|
|
322
334
|
refresh="wait_for",
|
|
323
335
|
index=self._index,
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
from unittest.mock import Mock, patch
|
|
5
5
|
|
|
6
|
+
import pytest
|
|
6
7
|
from haystack.dataclasses import Document
|
|
8
|
+
from haystack.document_stores.types import FilterPolicy
|
|
7
9
|
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
|
|
8
10
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
9
11
|
|
|
@@ -14,8 +16,15 @@ def test_init_default():
|
|
|
14
16
|
assert retriever._document_store == mock_store
|
|
15
17
|
assert retriever._filters == {}
|
|
16
18
|
assert retriever._top_k == 10
|
|
19
|
+
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
17
20
|
assert not retriever._scale_score
|
|
18
21
|
|
|
22
|
+
retriever = ElasticsearchBM25Retriever(document_store=mock_store, filter_policy="replace")
|
|
23
|
+
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
24
|
+
|
|
25
|
+
with pytest.raises(ValueError):
|
|
26
|
+
ElasticsearchBM25Retriever(document_store=mock_store, filter_policy="keep")
|
|
27
|
+
|
|
19
28
|
|
|
20
29
|
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
21
30
|
def test_to_dict(_mock_elasticsearch_client):
|
|
@@ -38,6 +47,7 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
38
47
|
"fuzziness": "AUTO",
|
|
39
48
|
"top_k": 10,
|
|
40
49
|
"scale_score": False,
|
|
50
|
+
"filter_policy": "replace",
|
|
41
51
|
},
|
|
42
52
|
}
|
|
43
53
|
|
|
@@ -55,6 +65,7 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
55
65
|
"fuzziness": "AUTO",
|
|
56
66
|
"top_k": 10,
|
|
57
67
|
"scale_score": True,
|
|
68
|
+
"filter_policy": "replace",
|
|
58
69
|
},
|
|
59
70
|
}
|
|
60
71
|
retriever = ElasticsearchBM25Retriever.from_dict(data)
|
|
@@ -63,6 +74,7 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
63
74
|
assert retriever._fuzziness == "AUTO"
|
|
64
75
|
assert retriever._top_k == 10
|
|
65
76
|
assert retriever._scale_score
|
|
77
|
+
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
66
78
|
|
|
67
79
|
|
|
68
80
|
def test_run():
|
|
@@ -15,6 +15,12 @@ from haystack.testing.document_store import DocumentStoreBaseTests
|
|
|
15
15
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
19
|
+
def test_init_is_lazy(_mock_es_client):
|
|
20
|
+
ElasticsearchDocumentStore(hosts="testhost")
|
|
21
|
+
_mock_es_client.assert_not_called()
|
|
22
|
+
|
|
23
|
+
|
|
18
24
|
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
19
25
|
def test_to_dict(_mock_elasticsearch_client):
|
|
20
26
|
document_store = ElasticsearchDocumentStore(hosts="some hosts")
|
|
@@ -73,7 +79,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
73
79
|
hosts=hosts, index=index, embedding_similarity_function=embedding_similarity_function
|
|
74
80
|
)
|
|
75
81
|
yield store
|
|
76
|
-
store._client.options(ignore_status=[400, 404]).indices.delete(index=index)
|
|
82
|
+
store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
|
|
77
83
|
|
|
78
84
|
def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
|
|
79
85
|
"""
|
|
@@ -101,7 +107,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
101
107
|
super().assert_documents_are_equal(received, expected)
|
|
102
108
|
|
|
103
109
|
def test_user_agent_header(self, document_store: ElasticsearchDocumentStore):
|
|
104
|
-
assert document_store._client._headers["user-agent"].startswith("haystack-py-ds/")
|
|
110
|
+
assert document_store.client._headers["user-agent"].startswith("haystack-py-ds/")
|
|
105
111
|
|
|
106
112
|
def test_write_documents(self, document_store: ElasticsearchDocumentStore):
|
|
107
113
|
docs = [Document(id="1")]
|
|
@@ -308,7 +314,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
308
314
|
)
|
|
309
315
|
mock_elasticsearch.return_value = mock_client
|
|
310
316
|
|
|
311
|
-
ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping)
|
|
317
|
+
_ = ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping).client
|
|
312
318
|
mock_client.indices.create.assert_called_once_with(
|
|
313
319
|
index="default",
|
|
314
320
|
mappings=custom_mapping,
|
{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_embedding_retriever.py
RENAMED
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
from unittest.mock import Mock, patch
|
|
5
5
|
|
|
6
|
+
import pytest
|
|
6
7
|
from haystack.dataclasses import Document
|
|
8
|
+
from haystack.document_stores.types import FilterPolicy
|
|
7
9
|
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
|
|
8
10
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
9
11
|
|
|
@@ -16,6 +18,12 @@ def test_init_default():
|
|
|
16
18
|
assert retriever._top_k == 10
|
|
17
19
|
assert retriever._num_candidates is None
|
|
18
20
|
|
|
21
|
+
retriever = ElasticsearchEmbeddingRetriever(document_store=mock_store, filter_policy="replace")
|
|
22
|
+
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
23
|
+
|
|
24
|
+
with pytest.raises(ValueError):
|
|
25
|
+
ElasticsearchEmbeddingRetriever(document_store=mock_store, filter_policy="keep")
|
|
26
|
+
|
|
19
27
|
|
|
20
28
|
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
21
29
|
def test_to_dict(_mock_elasticsearch_client):
|
|
@@ -37,6 +45,7 @@ def test_to_dict(_mock_elasticsearch_client):
|
|
|
37
45
|
},
|
|
38
46
|
"filters": {},
|
|
39
47
|
"top_k": 10,
|
|
48
|
+
"filter_policy": "replace",
|
|
40
49
|
"num_candidates": None,
|
|
41
50
|
},
|
|
42
51
|
}
|
|
@@ -54,6 +63,7 @@ def test_from_dict(_mock_elasticsearch_client):
|
|
|
54
63
|
},
|
|
55
64
|
"filters": {},
|
|
56
65
|
"top_k": 10,
|
|
66
|
+
"filter_policy": "replace",
|
|
57
67
|
"num_candidates": None,
|
|
58
68
|
},
|
|
59
69
|
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|