PyPI - elasticsearch-haystack - Versions diffs - 1.0.0__tar.gz → 2.0.0__tar.gz - Mend

elasticsearch-haystack 1.0.0.tar.gz → 2.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of elasticsearch-haystack might be problematic. Click here for more details.

Files changed (19)

{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/CHANGELOG.md RENAMED Viewed

@@ -1,6 +1,14 @@
 # Changelog
-## [unreleased]
+## [integrations/elasticsearch-v1.0.1] - 2024-10-28
+### ⚙️ Miscellaneous Tasks
+- Update changelog after removing legacy filters (#1083)
+- Update ruff linting scripts and settings (#1105)
+- Adopt uv as installer (#1142)
+## [integrations/elasticsearch-v1.0.0] - 2024-09-12
 ### 🚀 Features
@@ -11,10 +19,15 @@
 - `ElasticSearch` - Fallback to default filter policy when deserializing retrievers without the init parameter (#898)
+### 🧪 Testing
+- Do not retry tests in `hatch run test` command (#954)
 ### ⚙️ Miscellaneous Tasks
 - Retry tests to reduce flakyness (#836)
 - Update ruff invocation to include check parameter (#853)
+- ElasticSearch - remove legacy filters elasticsearch (#1078)
 ## [integrations/elasticsearch-v0.5.0] - 2024-05-24
@@ -64,8 +77,6 @@ This PR will also push the docs to Readme
 - Fix project urls (#96)
 ### 🚜 Refactor
 - Use `hatch_vcs` to manage integrations versioning (#103)
@@ -76,15 +87,12 @@ This PR will also push the docs to Readme
 - Fix import and increase version (#77)
 ## [integrations/elasticsearch-v0.1.0] - 2023-12-04
 ### 🐛 Bug Fixes
 - Fix license headers
 ## [integrations/elasticsearch-v0.0.2] - 2023-11-29
 <!-- generated by git-cliff -->

{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: elasticsearch-haystack
-Version: 1.0.0
+Version: 2.0.0
 Summary: Haystack 2.x Document Store for ElasticSearch
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues

{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/pyproject.toml RENAMED Viewed

@@ -41,6 +41,7 @@ root = "../.."
 git_describe_command = 'git describe --tags --match="integrations/elasticsearch-v[0-9]*"'
 [tool.hatch.envs.default]
+installer = "uv"
 dependencies = [
   "coverage[toml]>=6.5",
   "pytest",
@@ -61,12 +62,13 @@ docs = ["pydoc-markdown pydoc/config.yml"]
 python = ["3.8", "3.9", "3.10", "3.11"]
 [tool.hatch.envs.lint]
+installer = "uv"
 detached = true
-dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
+dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
 [tool.hatch.envs.lint.scripts]
 typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
-style = ["ruff check {args:. --exclude tests/}", "black --check --diff {args:.}"]
-fmt = ["black {args:.}", "ruff --fix {args:. --exclude tests/}", "style"]
+style = ["ruff check {args:}", "black --check --diff {args:.}"]
+fmt = ["black {args:.}", "ruff check --fix {args:}", "style"]
 all = ["style", "typing"]
 [tool.hatch.metadata]
@@ -80,6 +82,8 @@ skip-string-normalization = true
 [tool.ruff]
 target-version = "py38"
 line-length = 120
+[tool.ruff.lint]
 select = [
   "A",
   "ARG",
@@ -128,13 +132,13 @@ unfixable = [
   "F401",
 ]
-[tool.ruff.isort]
-known-first-party = ["src"]
+[tool.ruff.lint.isort]
+known-first-party = ["haystack_integrations"]
-[tool.ruff.flake8-tidy-imports]
+[tool.ruff.lint.flake8-tidy-imports]
 ban-relative-imports = "parents"
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
@@ -155,5 +159,5 @@ minversion = "6.0"
 markers = ["unit: unit tests", "integration: integration tests"]
 [[tool.mypy.overrides]]
-module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
+module = ["haystack.*", "haystack_integrations.*", "numpy.*", "pytest.*"]
 ignore_missing_imports = true

{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py RENAMED Viewed

@@ -105,9 +105,12 @@ class ElasticsearchDocumentStore:
     @property
     def client(self) -> Elasticsearch:
         if self._client is None:
+            headers = self._kwargs.pop("headers", {})
+            headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
             client = Elasticsearch(
                 self._hosts,
-                headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
+                headers=headers,
                 **self._kwargs,
             )
             # Check client connection, this will raise if not connected
@@ -255,6 +258,15 @@ class ElasticsearchDocumentStore:
         elasticsearch_actions = []
         for doc in documents:
             doc_dict = doc.to_dict()
+            if "dataframe" in doc_dict:
+                dataframe = doc_dict.pop("dataframe")
+                if dataframe:
+                    logger.warning(
+                        "Document %s has the `dataframe` field set,"
+                        "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                        "The `dataframe` field will soon be removed from Haystack Document.",
+                        doc.id,
+                    )
             if "sparse_embedding" in doc_dict:
                 sparse_embedding = doc_dict.pop("sparse_embedding", None)
                 if sparse_embedding:
@@ -319,6 +331,15 @@ class ElasticsearchDocumentStore:
             data["metadata"]["highlighted"] = hit["highlight"]
         data["score"] = hit["_score"]
+        if "dataframe" in data:
+            dataframe = data.pop("dataframe")
+            if dataframe:
+                logger.warning(
+                    "Document %s has the `dataframe` field set,"
+                    "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                    "The `dataframe` field will soon be removed from Haystack Document.",
+                    data["id"],
+                )
         return Document.from_dict(data)
     def delete_documents(self, document_ids: List[str]) -> None:

{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/src/haystack_integrations/document_stores/elasticsearch/filters.py RENAMED Viewed

@@ -5,7 +5,6 @@ from datetime import datetime
 from typing import Any, Dict, List
 from haystack.errors import FilterError
-from pandas import DataFrame
 def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
@@ -57,7 +56,7 @@ def _equal(field: str, value: Any) -> Dict[str, Any]:
                 }
             }
         }
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"match": {field: {"query": value, "minimum_should_match": "100%"}}}
     return {"term": {field: value}}
@@ -69,7 +68,7 @@ def _not_equal(field: str, value: Any) -> Dict[str, Any]:
     if isinstance(value, list):
         return {"bool": {"must_not": {"terms": {field: value}}}}
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}}
@@ -92,7 +91,7 @@ def _greater_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gt": value}}}
@@ -114,7 +113,7 @@ def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gte": value}}}
@@ -136,7 +135,7 @@ def _less_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lt": value}}}
@@ -158,7 +157,7 @@ def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lte": value}}}
@@ -212,8 +211,6 @@ def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
         raise FilterError(msg)
     operator: str = condition["operator"]
     value: Any = condition["value"]
-    if isinstance(value, DataFrame):
-        value = value.to_json()
     return COMPARISON_OPERATORS[operator](field, value)

{elasticsearch_haystack-1.0.0 → elasticsearch_haystack-2.0.0}/tests/test_document_store.py RENAMED Viewed

@@ -12,6 +12,7 @@ from haystack.dataclasses.document import Document
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.testing.document_store import DocumentStoreBaseTests
+from pandas import DataFrame
 from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
@@ -22,6 +23,20 @@ def test_init_is_lazy(_mock_es_client):
     _mock_es_client.assert_not_called()
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
+def test_headers_are_supported(_mock_es_client):
+    _ = ElasticsearchDocumentStore(hosts="testhost", headers={"header1": "value1", "header2": "value2"}).client
+    assert _mock_es_client.call_count == 1
+    _, kwargs = _mock_es_client.call_args
+    headers_found = kwargs["headers"]
+    assert headers_found["header1"] == "value1"
+    assert headers_found["header2"] == "value2"
+    assert headers_found["user-agent"].startswith("haystack-py-ds/")
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
 def test_to_dict(_mock_elasticsearch_client):
     document_store = ElasticsearchDocumentStore(hosts="some hosts")
@@ -116,6 +131,31 @@ class TestDocumentStore(DocumentStoreBaseTests):
         with pytest.raises(DuplicateDocumentError):
             document_store.write_documents(docs, DuplicatePolicy.FAIL)
+    def test_write_documents_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
+        doc = Document(id="1", content="test")
+        doc.dataframe = DataFrame({"a": [1, 2, 3]})
+        document_store.write_documents([doc])
+        res = document_store.filter_documents()
+        assert len(res) == 1
+        assert res[0].id == "1"
+        assert res[0].content == "test"
+        assert not hasattr(res[0], "dataframe") or res[0].dataframe is None
+    def test_deserialize_document_dataframe_ignored(self, document_store: ElasticsearchDocumentStore):
+        hit = {
+            "_source": {"id": "1", "content": "test", "dataframe": {"a": [1, 2, 3]}},
+            "_score": 1.0,
+        }
+        doc = document_store._deserialize_document(hit)
+        assert doc.id == "1"
+        assert doc.content == "test"
+        assert doc.score == 1.0
+        assert not hasattr(doc, "dataframe") or doc.dataframe is None
     def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore):
         document_store.write_documents(
             [