elasticsearch-haystack 1.0.0__tar.gz → 2.0.0__tar.gz

This diff shows the contents of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.

Files changed (19)
  1. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/CHANGELOG.md +14 -6
  2. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/PKG-INFO +2 -2
  3. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/pyproject.toml +12 -8
  4. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +22 -1
  5. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +6 -9
  6. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_document_store.py +40 -0
  7. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/.gitignore +0 -0
  8. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/LICENSE +0 -0
  9. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/README.md +0 -0
  10. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/docker-compose.yml +0 -0
  11. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/pydoc/config.yml +0 -0
  12. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
  13. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +0 -0
  14. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +0 -0
  15. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
  16. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/__init__.py +0 -0
  17. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_bm25_retriever.py +0 -0
  18. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_embedding_retriever.py +0 -0
  19. {elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_filters.py +0 -0
--- elasticsearch_haystack-1.0.0/CHANGELOG.md
+++ elasticsearch_haystack-2.0.0/CHANGELOG.md
@@ -1,6 +1,14 @@
 # Changelog
 
-## [unreleased]
+## [integrations/elasticsearch-v1.0.1] - 2024-10-28
+
+### ⚙️ Miscellaneous Tasks
+
+- Update changelog after removing legacy filters (#1083)
+- Update ruff linting scripts and settings (#1105)
+- Adopt uv as installer (#1142)
+
+## [integrations/elasticsearch-v1.0.0] - 2024-09-12
 
 ### 🚀 Features
 
@@ -11,10 +19,15 @@
 
 - `ElasticSearch` - Fallback to default filter policy when deserializing retrievers without the init parameter (#898)
 
+### 🧪 Testing
+
+- Do not retry tests in `hatch run test` command (#954)
+
 ### ⚙️ Miscellaneous Tasks
 
 - Retry tests to reduce flakyness (#836)
 - Update ruff invocation to include check parameter (#853)
+- ElasticSearch - remove legacy filters elasticsearch (#1078)
 
 ## [integrations/elasticsearch-v0.5.0] - 2024-05-24
 
@@ -64,8 +77,6 @@ This PR will also push the docs to Readme
 
 - Fix project urls (#96)
 
-
-
 ### 🚜 Refactor
 
 - Use `hatch_vcs` to manage integrations versioning (#103)
@@ -76,15 +87,12 @@ This PR will also push the docs to Readme
 
 - Fix import and increase version (#77)
 
-
-
 ## [integrations/elasticsearch-v0.1.0] - 2023-12-04
 
 ### 🐛 Bug Fixes
 
 - Fix license headers
 
-
 ## [integrations/elasticsearch-v0.0.2] - 2023-11-29
 
 <!-- generated by git-cliff -->
--- elasticsearch_haystack-1.0.0/PKG-INFO
+++ elasticsearch_haystack-2.0.0/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: elasticsearch-haystack
-Version: 1.0.0
+Version: 2.0.0
 Summary: Haystack 2.x Document Store for ElasticSearch
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
--- elasticsearch_haystack-1.0.0/pyproject.toml
+++ elasticsearch_haystack-2.0.0/pyproject.toml
@@ -41,6 +41,7 @@ root = "../.."
 git_describe_command = 'git describe --tags --match="integrations/elasticsearch-v[0-9]*"'
 
 [tool.hatch.envs.default]
+installer = "uv"
 dependencies = [
   "coverage[toml]>=6.5",
   "pytest",
@@ -61,12 +62,13 @@ docs = ["pydoc-markdown pydoc/config.yml"]
 python = ["3.8", "3.9", "3.10", "3.11"]
 
 [tool.hatch.envs.lint]
+installer = "uv"
 detached = true
-dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
+dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
 [tool.hatch.envs.lint.scripts]
 typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
-style = ["ruff check {args:. --exclude tests/}", "black --check --diff {args:.}"]
-fmt = ["black {args:.}", "ruff --fix {args:. --exclude tests/}", "style"]
+style = ["ruff check {args:}", "black --check --diff {args:.}"]
+fmt = ["black {args:.}", "ruff check --fix {args:}", "style"]
 all = ["style", "typing"]
 
 [tool.hatch.metadata]
@@ -80,6 +82,8 @@ skip-string-normalization = true
 [tool.ruff]
 target-version = "py38"
 line-length = 120
+
+[tool.ruff.lint]
 select = [
   "A",
   "ARG",
@@ -128,13 +132,13 @@ unfixable = [
   "F401",
 ]
 
-[tool.ruff.isort]
-known-first-party = ["src"]
+[tool.ruff.lint.isort]
+known-first-party = ["haystack_integrations"]
 
-[tool.ruff.flake8-tidy-imports]
+[tool.ruff.lint.flake8-tidy-imports]
 ban-relative-imports = "parents"
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
 
@@ -155,5 +159,5 @@ minversion = "6.0"
 markers = ["unit: unit tests", "integration: integration tests"]
 
 [[tool.mypy.overrides]]
-module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
+module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
 ignore_missing_imports = true
--- elasticsearch_haystack-1.0.0/src/haystack_integrations/document_stores/elasticsearch/document_store.py
+++ elasticsearch_haystack-2.0.0/src/haystack_integrations/document_stores/elasticsearch/document_store.py
@@ -105,9 +105,12 @@ class ElasticsearchDocumentStore:
     @property
     def client(self) -> Elasticsearch:
         if self._client is None:
+            headers = self._kwargs.pop("headers", {})
+            headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
+
             client = Elasticsearch(
                 self._hosts,
-                headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
+                headers=headers,
                 **self._kwargs,
             )
            # Check client connection, this will raise if not connected
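The lazy `client` property now merges any caller-supplied `headers` with the default Haystack user-agent instead of overwriting them. A minimal sketch of what this enables, mirroring the new `test_headers_are_supported` test further down; the host URL and header values are placeholders, and accessing `.client` assumes a reachable Elasticsearch instance since the store pings the cluster on first use:

```python
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

# Custom headers passed as kwargs now survive client creation; the store only
# adds (or overrides) the "user-agent" entry on top of them.
store = ElasticsearchDocumentStore(
    hosts="https://localhost:9200",  # placeholder host
    headers={"Authorization": "ApiKey <key>"},  # placeholder credential, merged with the haystack user-agent
)
client = store.client  # lazily builds Elasticsearch(..., headers={..., "user-agent": "haystack-py-ds/<version>"})
```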
@@ -255,6 +258,15 @@ class ElasticsearchDocumentStore:
         elasticsearch_actions = []
         for doc in documents:
             doc_dict = doc.to_dict()
+            if "dataframe" in doc_dict:
+                dataframe = doc_dict.pop("dataframe")
+                if dataframe:
+                    logger.warning(
+                        "Document %s has the `dataframe` field set,"
+                        "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                        "The `dataframe` field will soon be removed from Haystack Document.",
+                        doc.id,
+                    )
             if "sparse_embedding" in doc_dict:
                 sparse_embedding = doc_dict.pop("sparse_embedding", None)
                 if sparse_embedding:
@@ -319,6 +331,15 @@ class ElasticsearchDocumentStore:
             data["metadata"]["highlighted"] = hit["highlight"]
         data["score"] = hit["_score"]
 
+        if "dataframe" in data:
+            dataframe = data.pop("dataframe")
+            if dataframe:
+                logger.warning(
+                    "Document %s has the `dataframe` field set,"
+                    "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                    "The `dataframe` field will soon be removed from Haystack Document.",
+                    data["id"],
+                )
         return Document.from_dict(data)
 
     def delete_documents(self, document_ids: List[str]) -> None:
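Both the write path and the deserialization path now drop the legacy `dataframe` field with a warning instead of indexing it. A short sketch of the resulting behaviour, assuming a connected `ElasticsearchDocumentStore` named `document_store` (as in the test fixture below); the id and values are illustrative:

```python
import pandas as pd
from haystack import Document

# Writing a document that still carries a dataframe only logs a warning;
# the field is stripped before the document is sent to Elasticsearch.
doc = Document(id="1", content="test")
doc.dataframe = pd.DataFrame({"a": [1, 2, 3]})  # legacy field, no longer stored

document_store.write_documents([doc])

retrieved = document_store.filter_documents()[0]
assert retrieved.content == "test"
assert getattr(retrieved, "dataframe", None) is None  # the dataframe never reaches the index
```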
--- elasticsearch_haystack-1.0.0/src/haystack_integrations/document_stores/elasticsearch/filters.py
+++ elasticsearch_haystack-2.0.0/src/haystack_integrations/document_stores/elasticsearch/filters.py
@@ -5,7 +5,6 @@ from datetime import datetime
 from typing import Any, Dict, List
 
 from haystack.errors import FilterError
-from pandas import DataFrame
 
 
 def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
@@ -57,7 +56,7 @@ def _equal(field: str, value: Any) -> Dict[str, Any]:
                 }
             }
         }
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"match": {field: {"query": value, "minimum_should_match": "100%"}}}
     return {"term": {field: value}}
@@ -69,7 +68,7 @@ def _not_equal(field: str, value: Any) -> Dict[str, Any]:
 
     if isinstance(value, list):
         return {"bool": {"must_not": {"terms": {field: value}}}}
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}}
 
@@ -92,7 +91,7 @@ def _greater_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gt": value}}}
@@ -114,7 +113,7 @@ def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
            )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gte": value}}}
@@ -136,7 +135,7 @@ def _less_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lt": value}}}
@@ -158,7 +157,7 @@ def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lte": value}}}
@@ -212,8 +211,6 @@ def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
         raise FilterError(msg)
     operator: str = condition["operator"]
     value: Any = condition["value"]
-    if isinstance(value, DataFrame):
-        value = value.to_json()
 
     return COMPARISON_OPERATORS[operator](field, value)
 
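With the `pandas` import removed from `filters.py`, list values are still rejected for range operators, while DataFrame values are simply no longer converted with `.to_json()` before comparison. A minimal sketch against one of the private helpers shown above, imported here only for illustration; `meta.likes` is an illustrative field name:

```python
from haystack.errors import FilterError

# Private helper from filters.py, used here only to illustrate the change.
from haystack_integrations.document_stores.elasticsearch.filters import _greater_than

# Scalar comparisons still translate into Elasticsearch range clauses.
print(_greater_than("meta.likes", 10))  # {'range': {'meta.likes': {'gt': 10}}}

# Lists remain invalid for range operators; DataFrames are no longer special-cased
# anywhere in this module, which is what allowed the pandas import to be dropped.
try:
    _greater_than("meta.likes", [1, 2])
except FilterError as err:
    print(err)  # Filter value can't be of type <class 'list'> using operators '>', '>=', '<', '<='
```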
--- elasticsearch_haystack-1.0.0/tests/test_document_store.py
+++ elasticsearch_haystack-2.0.0/tests/test_document_store.py
@@ -12,6 +12,7 @@ from haystack.dataclasses.document import Document
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.testing.document_store import DocumentStoreBaseTests
+from pandas import DataFrame
 
 from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
 
@@ -22,6 +23,20 @@ def test_init_is_lazy(_mock_es_client):
     _mock_es_client.assert_not_called()
 
 
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
+def test_headers_are_supported(_mock_es_client):
+    _ = ElasticsearchDocumentStore(hosts="testhost", headers={"header1": "value1", "header2": "value2"}).client
+
+    assert _mock_es_client.call_count == 1
+    _, kwargs = _mock_es_client.call_args
+
+    headers_found = kwargs["headers"]
+    assert headers_found["header1"] == "value1"
+    assert headers_found["header2"] == "value2"
+
+    assert headers_found["user-agent"].startswith("haystack-py-ds/")
+
+
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
 def test_to_dict(_mock_elasticsearch_client):
     document_store = ElasticsearchDocumentStore(hosts="some hosts")
@@ -116,6 +131,31 @@ class TestDocumentStore(DocumentStoreBaseTests):
         with pytest.raises(DuplicateDocumentError):
             document_store.write_documents(docs, DuplicatePolicy.FAIL)
 
+    def test_write_documents_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
+        doc = Document(id="1", content="test")
+        doc.dataframe = DataFrame({"a": [1, 2, 3]})
+
+        document_store.write_documents([doc])
+
+        res = document_store.filter_documents()
+        assert len(res) == 1
+
+        assert res[0].id == "1"
+        assert res[0].content == "test"
+
+        assert not hasattr(res[0], "dataframe") or res[0].dataframe is None
+
+    def test_deserialize_document_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
+        hit = {
+            "_source": {"id": "1", "content": "test", "dataframe": {"a": [1, 2, 3]}},
+            "_score": 1.0,
+        }
+        doc = document_store._deserialize_document(hit)
+        assert doc.id == "1"
+        assert doc.content == "test"
+        assert doc.score == 1.0
+        assert not hasattr(doc, "dataframe") or doc.dataframe is None
+
     def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore):
         document_store.write_documents(
             [