elasticsearch-haystack 1.0.0__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of elasticsearch-haystack might be problematic. Click here for more details.
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/CHANGELOG.md +14 -6
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/PKG-INFO +2 -2
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/pyproject.toml +12 -8
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +22 -1
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +6 -9
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_document_store.py +40 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/.gitignore +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/LICENSE +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/README.md +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/docker-compose.yml +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/pydoc/config.yml +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/__init__.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_bm25_retriever.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_embedding_retriever.py +0 -0
- {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_filters.py +0 -0
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
## [
|
|
3
|
+
## [integrations/elasticsearch-v1.0.1] - 2024-10-28
|
|
4
|
+
|
|
5
|
+
### ⚙️ Miscellaneous Tasks
|
|
6
|
+
|
|
7
|
+
- Update changelog after removing legacy filters (#1083)
|
|
8
|
+
- Update ruff linting scripts and settings (#1105)
|
|
9
|
+
- Adopt uv as installer (#1142)
|
|
10
|
+
|
|
11
|
+
## [integrations/elasticsearch-v1.0.0] - 2024-09-12
|
|
4
12
|
|
|
5
13
|
### 🚀 Features
|
|
6
14
|
|
|
@@ -11,10 +19,15 @@
|
|
|
11
19
|
|
|
12
20
|
- `ElasticSearch` - Fallback to default filter policy when deserializing retrievers without the init parameter (#898)
|
|
13
21
|
|
|
22
|
+
### 🧪 Testing
|
|
23
|
+
|
|
24
|
+
- Do not retry tests in `hatch run test` command (#954)
|
|
25
|
+
|
|
14
26
|
### ⚙️ Miscellaneous Tasks
|
|
15
27
|
|
|
16
28
|
- Retry tests to reduce flakyness (#836)
|
|
17
29
|
- Update ruff invocation to include check parameter (#853)
|
|
30
|
+
- ElasticSearch - remove legacy filters elasticsearch (#1078)
|
|
18
31
|
|
|
19
32
|
## [integrations/elasticsearch-v0.5.0] - 2024-05-24
|
|
20
33
|
|
|
@@ -64,8 +77,6 @@ This PR will also push the docs to Readme
|
|
|
64
77
|
|
|
65
78
|
- Fix project urls (#96)
|
|
66
79
|
|
|
67
|
-
|
|
68
|
-
|
|
69
80
|
### 🚜 Refactor
|
|
70
81
|
|
|
71
82
|
- Use `hatch_vcs` to manage integrations versioning (#103)
|
|
@@ -76,15 +87,12 @@ This PR will also push the docs to Readme
|
|
|
76
87
|
|
|
77
88
|
- Fix import and increase version (#77)
|
|
78
89
|
|
|
79
|
-
|
|
80
|
-
|
|
81
90
|
## [integrations/elasticsearch-v0.1.0] - 2023-12-04
|
|
82
91
|
|
|
83
92
|
### 🐛 Bug Fixes
|
|
84
93
|
|
|
85
94
|
- Fix license headers
|
|
86
95
|
|
|
87
|
-
|
|
88
96
|
## [integrations/elasticsearch-v0.0.2] - 2023-11-29
|
|
89
97
|
|
|
90
98
|
<!-- generated by git-cliff -->
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: elasticsearch-haystack
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: Haystack 2.x Document Store for ElasticSearch
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -41,6 +41,7 @@ root = "../.."
|
|
|
41
41
|
git_describe_command = 'git describe --tags --match="integrations/elasticsearch-v[0-9]*"'
|
|
42
42
|
|
|
43
43
|
[tool.hatch.envs.default]
|
|
44
|
+
installer = "uv"
|
|
44
45
|
dependencies = [
|
|
45
46
|
"coverage[toml]>=6.5",
|
|
46
47
|
"pytest",
|
|
@@ -61,12 +62,13 @@ docs = ["pydoc-markdown pydoc/config.yml"]
|
|
|
61
62
|
python = ["3.8", "3.9", "3.10", "3.11"]
|
|
62
63
|
|
|
63
64
|
[tool.hatch.envs.lint]
|
|
65
|
+
installer = "uv"
|
|
64
66
|
detached = true
|
|
65
|
-
dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
|
|
67
|
+
dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
|
|
66
68
|
[tool.hatch.envs.lint.scripts]
|
|
67
69
|
typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
|
|
68
|
-
style = ["ruff check {args
|
|
69
|
-
fmt = ["black {args:.}", "ruff --fix {args
|
|
70
|
+
style = ["ruff check {args:}", "black --check --diff {args:.}"]
|
|
71
|
+
fmt = ["black {args:.}", "ruff check --fix {args:}", "style"]
|
|
70
72
|
all = ["style", "typing"]
|
|
71
73
|
|
|
72
74
|
[tool.hatch.metadata]
|
|
@@ -80,6 +82,8 @@ skip-string-normalization = true
|
|
|
80
82
|
[tool.ruff]
|
|
81
83
|
target-version = "py38"
|
|
82
84
|
line-length = 120
|
|
85
|
+
|
|
86
|
+
[tool.ruff.lint]
|
|
83
87
|
select = [
|
|
84
88
|
"A",
|
|
85
89
|
"ARG",
|
|
@@ -128,13 +132,13 @@ unfixable = [
|
|
|
128
132
|
"F401",
|
|
129
133
|
]
|
|
130
134
|
|
|
131
|
-
[tool.ruff.isort]
|
|
132
|
-
known-first-party = ["
|
|
135
|
+
[tool.ruff.lint.isort]
|
|
136
|
+
known-first-party = ["haystack_integrations"]
|
|
133
137
|
|
|
134
|
-
[tool.ruff.flake8-tidy-imports]
|
|
138
|
+
[tool.ruff.lint.flake8-tidy-imports]
|
|
135
139
|
ban-relative-imports = "parents"
|
|
136
140
|
|
|
137
|
-
[tool.ruff.per-file-ignores]
|
|
141
|
+
[tool.ruff.lint.per-file-ignores]
|
|
138
142
|
# Tests can use magic values, assertions, and relative imports
|
|
139
143
|
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
|
140
144
|
|
|
@@ -155,5 +159,5 @@ minversion = "6.0"
|
|
|
155
159
|
markers = ["unit: unit tests", "integration: integration tests"]
|
|
156
160
|
|
|
157
161
|
[[tool.mypy.overrides]]
|
|
158
|
-
module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
|
|
162
|
+
module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
|
|
159
163
|
ignore_missing_imports = true
|
|
@@ -105,9 +105,12 @@ class ElasticsearchDocumentStore:
|
|
|
105
105
|
@property
|
|
106
106
|
def client(self) -> Elasticsearch:
|
|
107
107
|
if self._client is None:
|
|
108
|
+
headers = self._kwargs.pop("headers", {})
|
|
109
|
+
headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
|
|
110
|
+
|
|
108
111
|
client = Elasticsearch(
|
|
109
112
|
self._hosts,
|
|
110
|
-
headers=
|
|
113
|
+
headers=headers,
|
|
111
114
|
**self._kwargs,
|
|
112
115
|
)
|
|
113
116
|
# Check client connection, this will raise if not connected
|
|
@@ -255,6 +258,15 @@ class ElasticsearchDocumentStore:
|
|
|
255
258
|
elasticsearch_actions = []
|
|
256
259
|
for doc in documents:
|
|
257
260
|
doc_dict = doc.to_dict()
|
|
261
|
+
if "dataframe" in doc_dict:
|
|
262
|
+
dataframe = doc_dict.pop("dataframe")
|
|
263
|
+
if dataframe:
|
|
264
|
+
logger.warning(
|
|
265
|
+
"Document %s has the `dataframe` field set,"
|
|
266
|
+
"ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
|
|
267
|
+
"The `dataframe` field will soon be removed from Haystack Document.",
|
|
268
|
+
doc.id,
|
|
269
|
+
)
|
|
258
270
|
if "sparse_embedding" in doc_dict:
|
|
259
271
|
sparse_embedding = doc_dict.pop("sparse_embedding", None)
|
|
260
272
|
if sparse_embedding:
|
|
@@ -319,6 +331,15 @@ class ElasticsearchDocumentStore:
|
|
|
319
331
|
data["metadata"]["highlighted"] = hit["highlight"]
|
|
320
332
|
data["score"] = hit["_score"]
|
|
321
333
|
|
|
334
|
+
if "dataframe" in data:
|
|
335
|
+
dataframe = data.pop("dataframe")
|
|
336
|
+
if dataframe:
|
|
337
|
+
logger.warning(
|
|
338
|
+
"Document %s has the `dataframe` field set,"
|
|
339
|
+
"ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
|
|
340
|
+
"The `dataframe` field will soon be removed from Haystack Document.",
|
|
341
|
+
data["id"],
|
|
342
|
+
)
|
|
322
343
|
return Document.from_dict(data)
|
|
323
344
|
|
|
324
345
|
def delete_documents(self, document_ids: List[str]) -> None:
|
|
@@ -5,7 +5,6 @@ from datetime import datetime
|
|
|
5
5
|
from typing import Any, Dict, List
|
|
6
6
|
|
|
7
7
|
from haystack.errors import FilterError
|
|
8
|
-
from pandas import DataFrame
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
|
|
@@ -57,7 +56,7 @@ def _equal(field: str, value: Any) -> Dict[str, Any]:
|
|
|
57
56
|
}
|
|
58
57
|
}
|
|
59
58
|
}
|
|
60
|
-
if field
|
|
59
|
+
if field == "text":
|
|
61
60
|
# We want to fully match the text field.
|
|
62
61
|
return {"match": {field: {"query": value, "minimum_should_match": "100%"}}}
|
|
63
62
|
return {"term": {field: value}}
|
|
@@ -69,7 +68,7 @@ def _not_equal(field: str, value: Any) -> Dict[str, Any]:
|
|
|
69
68
|
|
|
70
69
|
if isinstance(value, list):
|
|
71
70
|
return {"bool": {"must_not": {"terms": {field: value}}}}
|
|
72
|
-
if field
|
|
71
|
+
if field == "text":
|
|
73
72
|
# We want to fully match the text field.
|
|
74
73
|
return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}}
|
|
75
74
|
|
|
@@ -92,7 +91,7 @@ def _greater_than(field: str, value: Any) -> Dict[str, Any]:
|
|
|
92
91
|
"Strings are only comparable if they are ISO formatted dates."
|
|
93
92
|
)
|
|
94
93
|
raise FilterError(msg) from exc
|
|
95
|
-
if
|
|
94
|
+
if isinstance(value, list):
|
|
96
95
|
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
97
96
|
raise FilterError(msg)
|
|
98
97
|
return {"range": {field: {"gt": value}}}
|
|
@@ -114,7 +113,7 @@ def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
|
|
|
114
113
|
"Strings are only comparable if they are ISO formatted dates."
|
|
115
114
|
)
|
|
116
115
|
raise FilterError(msg) from exc
|
|
117
|
-
if
|
|
116
|
+
if isinstance(value, list):
|
|
118
117
|
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
119
118
|
raise FilterError(msg)
|
|
120
119
|
return {"range": {field: {"gte": value}}}
|
|
@@ -136,7 +135,7 @@ def _less_than(field: str, value: Any) -> Dict[str, Any]:
|
|
|
136
135
|
"Strings are only comparable if they are ISO formatted dates."
|
|
137
136
|
)
|
|
138
137
|
raise FilterError(msg) from exc
|
|
139
|
-
if
|
|
138
|
+
if isinstance(value, list):
|
|
140
139
|
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
141
140
|
raise FilterError(msg)
|
|
142
141
|
return {"range": {field: {"lt": value}}}
|
|
@@ -158,7 +157,7 @@ def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
|
|
|
158
157
|
"Strings are only comparable if they are ISO formatted dates."
|
|
159
158
|
)
|
|
160
159
|
raise FilterError(msg) from exc
|
|
161
|
-
if
|
|
160
|
+
if isinstance(value, list):
|
|
162
161
|
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
|
|
163
162
|
raise FilterError(msg)
|
|
164
163
|
return {"range": {field: {"lte": value}}}
|
|
@@ -212,8 +211,6 @@ def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
212
211
|
raise FilterError(msg)
|
|
213
212
|
operator: str = condition["operator"]
|
|
214
213
|
value: Any = condition["value"]
|
|
215
|
-
if isinstance(value, DataFrame):
|
|
216
|
-
value = value.to_json()
|
|
217
214
|
|
|
218
215
|
return COMPARISON_OPERATORS[operator](field, value)
|
|
219
216
|
|
|
@@ -12,6 +12,7 @@ from haystack.dataclasses.document import Document
|
|
|
12
12
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
13
13
|
from haystack.document_stores.types import DuplicatePolicy
|
|
14
14
|
from haystack.testing.document_store import DocumentStoreBaseTests
|
|
15
|
+
from pandas import DataFrame
|
|
15
16
|
|
|
16
17
|
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
|
|
17
18
|
|
|
@@ -22,6 +23,20 @@ def test_init_is_lazy(_mock_es_client):
|
|
|
22
23
|
_mock_es_client.assert_not_called()
|
|
23
24
|
|
|
24
25
|
|
|
26
|
+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
27
|
+
def test_headers_are_supported(_mock_es_client):
|
|
28
|
+
_ = ElasticsearchDocumentStore(hosts="testhost", headers={"header1": "value1", "header2": "value2"}).client
|
|
29
|
+
|
|
30
|
+
assert _mock_es_client.call_count == 1
|
|
31
|
+
_, kwargs = _mock_es_client.call_args
|
|
32
|
+
|
|
33
|
+
headers_found = kwargs["headers"]
|
|
34
|
+
assert headers_found["header1"] == "value1"
|
|
35
|
+
assert headers_found["header2"] == "value2"
|
|
36
|
+
|
|
37
|
+
assert headers_found["user-agent"].startswith("haystack-py-ds/")
|
|
38
|
+
|
|
39
|
+
|
|
25
40
|
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
|
|
26
41
|
def test_to_dict(_mock_elasticsearch_client):
|
|
27
42
|
document_store = ElasticsearchDocumentStore(hosts="some hosts")
|
|
@@ -116,6 +131,31 @@ class TestDocumentStore(DocumentStoreBaseTests):
|
|
|
116
131
|
with pytest.raises(DuplicateDocumentError):
|
|
117
132
|
document_store.write_documents(docs, DuplicatePolicy.FAIL)
|
|
118
133
|
|
|
134
|
+
def test_write_documents_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
|
|
135
|
+
doc = Document(id="1", content="test")
|
|
136
|
+
doc.dataframe = DataFrame({"a": [1, 2, 3]})
|
|
137
|
+
|
|
138
|
+
document_store.write_documents([doc])
|
|
139
|
+
|
|
140
|
+
res = document_store.filter_documents()
|
|
141
|
+
assert len(res) == 1
|
|
142
|
+
|
|
143
|
+
assert res[0].id == "1"
|
|
144
|
+
assert res[0].content == "test"
|
|
145
|
+
|
|
146
|
+
assert not hasattr(res[0], "dataframe") or res[0].dataframe is None
|
|
147
|
+
|
|
148
|
+
def test_deserialize_document_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
|
|
149
|
+
hit = {
|
|
150
|
+
"_source": {"id": "1", "content": "test", "dataframe": {"a": [1, 2, 3]}},
|
|
151
|
+
"_score": 1.0,
|
|
152
|
+
}
|
|
153
|
+
doc = document_store._deserialize_document(hit)
|
|
154
|
+
assert doc.id == "1"
|
|
155
|
+
assert doc.content == "test"
|
|
156
|
+
assert doc.score == 1.0
|
|
157
|
+
assert not hasattr(doc, "dataframe") or doc.dataframe is None
|
|
158
|
+
|
|
119
159
|
def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore):
|
|
120
160
|
document_store.write_documents(
|
|
121
161
|
[
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_embedding_retriever.py
RENAMED
|
File without changes
|
|
File without changes
|