elasticsearch-haystack 1.0.1__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of elasticsearch-haystack might be problematic.

Files changed (19)
  1. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/CHANGELOG.md +8 -5
  2. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/PKG-INFO +2 -2
  3. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/pyproject.toml +1 -1
  4. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py +18 -0
  5. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py +6 -9
  6. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/tests/test_document_store.py +26 -0
  7. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/.gitignore +0 -0
  8. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/LICENSE +0 -0
  9. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/README.md +0 -0
  10. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/docker-compose.yml +0 -0
  11. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/pydoc/config.yml +0 -0
  12. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py +0 -0
  13. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +0 -0
  14. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +0 -0
  15. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/__init__.py +0 -0
  16. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/tests/__init__.py +0 -0
  17. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/tests/test_bm25_retriever.py +0 -0
  18. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/tests/test_embedding_retriever.py +0 -0
  19. {elasticsearch_haystack-1.0.1 → elasticsearch_haystack-2.0.0}/tests/test_filters.py +0 -0

CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## [integrations/elasticsearch-v1.0.1] - 2024-10-28
+
+### ⚙️ Miscellaneous Tasks
+
+- Update changelog after removing legacy filters (#1083)
+- Update ruff linting scripts and settings (#1105)
+- Adopt uv as installer (#1142)
+
 ## [integrations/elasticsearch-v1.0.0] - 2024-09-12
 
 ### 🚀 Features
@@ -69,8 +77,6 @@ This PR will also push the docs to Readme
 
 - Fix project urls (#96)
 
-
-
 ### 🚜 Refactor
 
 - Use `hatch_vcs` to manage integrations versioning (#103)
@@ -81,15 +87,12 @@ This PR will also push the docs to Readme
 
 - Fix import and increase version (#77)
 
-
-
 ## [integrations/elasticsearch-v0.1.0] - 2023-12-04
 
 ### 🐛 Bug Fixes
 
 - Fix license headers
 
-
 ## [integrations/elasticsearch-v0.0.2] - 2023-11-29
 
 <!-- generated by git-cliff -->

PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: elasticsearch-haystack
-Version: 1.0.1
+Version: 2.0.0
 Summary: Haystack 2.x Document Store for ElasticSearch
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues

pyproject.toml
@@ -159,5 +159,5 @@ minversion = "6.0"
 markers = ["unit: unit tests", "integration: integration tests"]
 
 [[tool.mypy.overrides]]
-module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
+module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
 ignore_missing_imports = true

src/haystack_integrations/document_stores/elasticsearch/document_store.py
@@ -258,6 +258,15 @@ class ElasticsearchDocumentStore:
         elasticsearch_actions = []
         for doc in documents:
             doc_dict = doc.to_dict()
+            if "dataframe" in doc_dict:
+                dataframe = doc_dict.pop("dataframe")
+                if dataframe:
+                    logger.warning(
+                        "Document %s has the `dataframe` field set,"
+                        "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                        "The `dataframe` field will soon be removed from Haystack Document.",
+                        doc.id,
+                    )
             if "sparse_embedding" in doc_dict:
                 sparse_embedding = doc_dict.pop("sparse_embedding", None)
                 if sparse_embedding:
@@ -322,6 +331,15 @@ class ElasticsearchDocumentStore:
             data["metadata"]["highlighted"] = hit["highlight"]
         data["score"] = hit["_score"]
 
+        if "dataframe" in data:
+            dataframe = data.pop("dataframe")
+            if dataframe:
+                logger.warning(
+                    "Document %s has the `dataframe` field set,"
+                    "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                    "The `dataframe` field will soon be removed from Haystack Document.",
+                    data["id"],
+                )
         return Document.from_dict(data)
 
     def delete_documents(self, document_ids: List[str]) -> None:
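
Taken together, the two document_store.py hunks drop the legacy `dataframe` field on both the write path and the read path. A minimal sketch of the resulting behavior, assuming a local Elasticsearch at http://localhost:9200 (the URL is an assumption, and the warning only fires if the installed haystack-ai release still serializes `dataframe`):

# Sketch only: illustrates the 2.0.0 dataframe handling; localhost:9200 is an assumption.
from haystack import Document
from pandas import DataFrame

from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

store = ElasticsearchDocumentStore(hosts="http://localhost:9200")

doc = Document(id="1", content="test")
doc.dataframe = DataFrame({"a": [1, 2, 3]})  # mirrors the new test; may trigger the warning, is never indexed

store.write_documents([doc])

retrieved = store.filter_documents()[0]
assert retrieved.content == "test"
assert getattr(retrieved, "dataframe", None) is None  # the field is dropped on the read path as well
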

src/haystack_integrations/document_stores/elasticsearch/filters.py
@@ -5,7 +5,6 @@ from datetime import datetime
 from typing import Any, Dict, List
 
 from haystack.errors import FilterError
-from pandas import DataFrame
 
 
 def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
@@ -57,7 +56,7 @@ def _equal(field: str, value: Any) -> Dict[str, Any]:
                 }
             }
         }
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"match": {field: {"query": value, "minimum_should_match": "100%"}}}
     return {"term": {field: value}}
@@ -69,7 +68,7 @@ def _not_equal(field: str, value: Any) -> Dict[str, Any]:
 
     if isinstance(value, list):
         return {"bool": {"must_not": {"terms": {field: value}}}}
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}}
 
@@ -92,7 +91,7 @@ def _greater_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gt": value}}}
@@ -114,7 +113,7 @@ def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
            )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gte": value}}}
@@ -136,7 +135,7 @@ def _less_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lt": value}}}
@@ -158,7 +157,7 @@ def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lte": value}}}
@@ -212,8 +211,6 @@ def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
         raise FilterError(msg)
     operator: str = condition["operator"]
     value: Any = condition["value"]
-    if isinstance(value, DataFrame):
-        value = value.to_json()
 
     return COMPARISON_OPERATORS[operator](field, value)
 
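
With the DataFrame special-casing gone, `_parse_comparison_condition` now passes filter values straight through, and the range helpers above only reject list values. A minimal sketch of how that surfaces through the internal `_normalize_filters` helper; the field names are illustrative, not part of the diff:

# Sketch only: the filter fields below are illustrative, not part of the diff.
from haystack.errors import FilterError

from haystack_integrations.document_stores.elasticsearch.filters import _normalize_filters

# Ordinary comparison values still translate into Elasticsearch range queries.
print(_normalize_filters({"field": "meta.date", "operator": ">=", "value": "2024-01-01"}))

# List values (and, previously, DataFrames) are rejected for range operators.
try:
    _normalize_filters({"field": "meta.number", "operator": ">", "value": [1, 2]})
except FilterError as err:
    print(err)
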

tests/test_document_store.py
@@ -12,6 +12,7 @@ from haystack.dataclasses.document import Document
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.testing.document_store import DocumentStoreBaseTests
+from pandas import DataFrame
 
 from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
 
@@ -130,6 +131,31 @@ class TestDocumentStore(DocumentStoreBaseTests):
         with pytest.raises(DuplicateDocumentError):
             document_store.write_documents(docs, DuplicatePolicy.FAIL)
 
+    def test_write_documents_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
+        doc = Document(id="1", content="test")
+        doc.dataframe = DataFrame({"a": [1, 2, 3]})
+
+        document_store.write_documents([doc])
+
+        res = document_store.filter_documents()
+        assert len(res) == 1
+
+        assert res[0].id == "1"
+        assert res[0].content == "test"
+
+        assert not hasattr(res[0], "dataframe") or res[0].dataframe is None
+
+    def test_deserialize_document_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
+        hit = {
+            "_source": {"id": "1", "content": "test", "dataframe": {"a": [1, 2, 3]}},
+            "_score": 1.0,
+        }
+        doc = document_store._deserialize_document(hit)
+        assert doc.id == "1"
+        assert doc.content == "test"
+        assert doc.score == 1.0
+        assert not hasattr(doc, "dataframe") or doc.dataframe is None
+
     def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore):
         document_store.write_documents(
             [