PyPI - elasticsearch-haystack - Versions diffs - 0.5.0__tar.gz → 0.7.0__tar.gz - Mend

elasticsearch-haystack 0.5.0 (tar.gz) → 0.7.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of elasticsearch-haystack might be problematic. Click here for more details.

Files changed (19)

elasticsearch_haystack-0.7.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,80 @@
+# Changelog
+## [unreleased]
+### 🚀 Features
+- Defer the database connection to when it's needed (#766)
+## [integrations/elasticsearch-v0.5.0] - 2024-05-24
+### 🐛 Bug Fixes
+- Add support for custom mapping in ElasticsearchDocumentStore (#721)
+## [integrations/elasticsearch-v0.4.0] - 2024-04-03
+### 📚 Documentation
+- Docstring update  (#525)
+- Review Elastic (#541)
+- Disable-class-def (#556)
+## [integrations/elasticsearch-v0.3.0] - 2024-02-23
+### 🐛 Bug Fixes
+- Fix order of API docs (#447)
+This PR will also push the docs to Readme
+### 📚 Documentation
+- Update category slug (#442)
+### Elasticsearch
+- Add user-agent header (#457)
+### Feat
+- Add filters to run function in retrievers of elasticsearch (#440)
+### Elasticsearch
+- Generate api docs (#322)
+## [integrations/elasticsearch-v0.2.0] - 2024-01-19
+## [integrations/elasticsearch-v0.1.3] - 2024-01-18
+## [integrations/elasticsearch-v0.1.2] - 2023-12-20
+### 🐛 Bug Fixes
+- Fix project urls (#96)
+### 🚜 Refactor
+- Use `hatch_vcs` to manage integrations versioning (#103)
+## [integrations/elasticsearch-v0.1.1] - 2023-12-05
+### 🐛 Bug Fixes
+- Fix import and increase version (#77)
+## [integrations/elasticsearch-v0.1.0] - 2023-12-04
+### 🐛 Bug Fixes
+- Fix license headers
+## [integrations/elasticsearch-v0.0.2] - 2023-11-29
+<!-- generated by git-cliff -->

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: elasticsearch-haystack
-Version: 0.5.0
+Version: 0.7.0
 Summary: Haystack 2.x Document Store for ElasticSearch
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/pyproject.toml RENAMED Viewed

@@ -10,9 +10,7 @@ readme = "README.md"
 requires-python = ">=3.8"
 license = "Apache-2.0"
 keywords = []
-authors = [
-  { name = "Silvano Cerza", email = "silvanocerza@gmail.com" },
-]
+authors = [{ name = "Silvano Cerza", email = "silvanocerza@gmail.com" }]
 classifiers = [
   "License :: OSI Approved :: Apache Software License",
   "Development Status :: 4 - Beta",
@@ -24,10 +22,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = [
-  "haystack-ai",
-  "elasticsearch>=8,<9",
-]
+dependencies = ["haystack-ai", "elasticsearch>=8,<9"]
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme"
@@ -49,49 +44,28 @@ git_describe_command = 'git describe --tags --match="integrations/elasticsearch-
 dependencies = [
   "coverage[toml]>=6.5",
   "pytest",
+  "pytest-rerunfailures",
   "pytest-xdist",
   "haystack-pydoc-tools",
 ]
 [tool.hatch.envs.default.scripts]
-test = "pytest {args:tests}"
-test-cov = "coverage run -m pytest {args:tests}"
-cov-report = [
-  "- coverage combine",
-  "coverage report",
-]
-cov = [
-  "test-cov",
-  "cov-report",
-]
-docs = [
-  "pydoc-markdown pydoc/config.yml"
-]
+test = "pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
+test-cov = "coverage run -m pytest --reruns 3 --reruns-delay 30 -x {args:tests}"
+cov-report = ["- coverage combine", "coverage report"]
+cov = ["test-cov", "cov-report"]
+docs = ["pydoc-markdown pydoc/config.yml"]
 [[tool.hatch.envs.all.matrix]]
 python = ["3.8", "3.9", "3.10", "3.11"]
 [tool.hatch.envs.lint]
 detached = true
-dependencies = [
-  "black>=23.1.0",
-  "mypy>=1.0.0",
-  "ruff>=0.0.243",
-]
+dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
 [tool.hatch.envs.lint.scripts]
 typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
-style = [
-  "ruff {args:.}",
-  "black --check --diff {args:.}",
-]
-fmt = [
-  "black {args:.}",
-  "ruff --fix {args:.}",
-  "style",
-]
-all = [
-  "style",
-  "typing",
-]
+style = ["ruff check {args:.}", "black --check --diff {args:.}"]
+fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
+all = ["style", "typing"]
 [tool.hatch.metadata]
 allow-direct-references = true
@@ -137,9 +111,15 @@ ignore = [
   # Allow boolean positional values in function calls, like `dict.get(... True)`
   "FBT003",
   # Ignore checks for possible passwords
-  "S105", "S106", "S107",
+  "S105",
+  "S106",
+  "S107",
   # Ignore complexity
-  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
+  "C901",
+  "PLR0911",
+  "PLR0912",
+  "PLR0913",
+  "PLR0915",
 ]
 unfixable = [
   # Don't touch unused imports
@@ -164,25 +144,14 @@ parallel = false
 [tool.coverage.report]
 omit = ["*/tests/*", "*/__init__.py"]
-show_missing=true
-exclude_lines = [
-  "no cov",
-  "if __name__ == .__main__.:",
-  "if TYPE_CHECKING:",
-]
+show_missing = true
+exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
 [tool.pytest.ini_options]
 minversion = "6.0"
-markers = [
-  "unit: unit tests",
-  "integration: integration tests"
-]
+markers = ["unit: unit tests", "integration: integration tests"]
 [[tool.mypy.overrides]]
-module = [
-  "haystack.*",
-  "haystack_integrations.*",
-  "pytest.*"
-]
+module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
 ignore_missing_imports = true

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py RENAMED Viewed

@@ -1,10 +1,12 @@
 # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 from haystack import component, default_from_dict, default_to_dict
 from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
+from haystack.document_stores.types.filter_policy import apply_filter_policy
 from haystack_integrations.document_stores.elasticsearch.document_store import ElasticsearchDocumentStore
@@ -48,6 +50,7 @@ class ElasticsearchBM25Retriever:
         fuzziness: str = "AUTO",
         top_k: int = 10,
         scale_score: bool = False,
+        filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
     ):
         """
         Initialize ElasticsearchBM25Retriever with an instance ElasticsearchDocumentStore.
@@ -60,6 +63,7 @@ class ElasticsearchBM25Retriever:
             for more details.
         :param top_k: Maximum number of Documents to return.
         :param scale_score: If `True` scales the Document`s scores between 0 and 1.
+        :param filter_policy: Policy to determine how filters are applied.
         :raises ValueError: If `document_store` is not an instance of `ElasticsearchDocumentStore`.
         """
@@ -72,6 +76,7 @@ class ElasticsearchBM25Retriever:
         self._fuzziness = fuzziness
         self._top_k = top_k
         self._scale_score = scale_score
+        self._filter_policy = FilterPolicy.from_str(filter_policy) if isinstance(filter_policy, str) else filter_policy
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -86,6 +91,7 @@ class ElasticsearchBM25Retriever:
             fuzziness=self._fuzziness,
             top_k=self._top_k,
             scale_score=self._scale_score,
+            filter_policy=self._filter_policy.value,
             document_store=self._document_store.to_dict(),
         )
@@ -102,6 +108,7 @@ class ElasticsearchBM25Retriever:
         data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
             data["init_parameters"]["document_store"]
         )
+        data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
         return default_from_dict(cls, data)
     @component.output_types(documents=List[Document])
@@ -110,14 +117,17 @@ class ElasticsearchBM25Retriever:
         Retrieve documents using the BM25 keyword-based algorithm.
         :param query: String to search in `Document`s' text.
-        :param filters: Filters applied to the retrieved `Document`s.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
+                        details.
         :param top_k: Maximum number of `Document` to return.
         :returns: A dictionary with the following keys:
             - `documents`: List of `Document`s that match the query.
         """
+        filters = apply_filter_policy(self._filter_policy, self._filters, filters)
         docs = self._document_store._bm25_retrieval(
             query=query,
-            filters=filters or self._filters,
+            filters=filters,
             fuzziness=self._fuzziness,
             top_k=top_k or self._top_k,
             scale_score=self._scale_score,

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py RENAMED Viewed

@@ -1,10 +1,12 @@
 # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 from haystack import component, default_from_dict, default_to_dict
 from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
+from haystack.document_stores.types.filter_policy import apply_filter_policy
 from haystack_integrations.document_stores.elasticsearch.document_store import ElasticsearchDocumentStore
@@ -49,6 +51,7 @@ class ElasticsearchEmbeddingRetriever:
         filters: Optional[Dict[str, Any]] = None,
         top_k: int = 10,
         num_candidates: Optional[int] = None,
+        filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
     ):
         """
         Create the ElasticsearchEmbeddingRetriever component.
@@ -61,6 +64,7 @@ class ElasticsearchEmbeddingRetriever:
             Increasing this value will improve search accuracy at the cost of slower search speeds.
             You can read more about it in the Elasticsearch
             [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
+        :param filter_policy: Policy to determine how filters are applied.
         :raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore.
         """
         if not isinstance(document_store, ElasticsearchDocumentStore):
@@ -71,6 +75,7 @@ class ElasticsearchEmbeddingRetriever:
         self._filters = filters or {}
         self._top_k = top_k
         self._num_candidates = num_candidates
+        self._filter_policy = FilterPolicy.from_str(filter_policy) if isinstance(filter_policy, str) else filter_policy
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -84,6 +89,7 @@ class ElasticsearchEmbeddingRetriever:
             filters=self._filters,
             top_k=self._top_k,
             num_candidates=self._num_candidates,
+            filter_policy=self._filter_policy.value,
             document_store=self._document_store.to_dict(),
         )
@@ -100,6 +106,7 @@ class ElasticsearchEmbeddingRetriever:
         data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
             data["init_parameters"]["document_store"]
         )
+        data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
         return default_from_dict(cls, data)
     @component.output_types(documents=List[Document])
@@ -108,14 +115,17 @@ class ElasticsearchEmbeddingRetriever:
         Retrieve documents using a vector similarity metric.
         :param query_embedding: Embedding of the query.
-        :param filters: Filters applied to the retrieved `Document`s.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
+                        details.
         :param top_k: Maximum number of `Document`s to return.
         :returns: A dictionary with the following keys:
             - `documents`: List of `Document`s most similar to the given `query_embedding`
         """
+        filters = apply_filter_policy(self._filter_policy, self._filters, filters)
         docs = self._document_store._embedding_retrieval(
             query_embedding=query_embedding,
-            filters=filters or self._filters,
+            filters=filters,
             top_k=top_k or self._top_k,
             num_candidates=self._num_candidates,
         )

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/src/haystack_integrations/document_stores/elasticsearch/document_store.py RENAMED Viewed

@@ -93,48 +93,60 @@ class ElasticsearchDocumentStore:
         :param **kwargs: Optional arguments that `Elasticsearch` takes.
         """
         self._hosts = hosts
-        self._client = Elasticsearch(
-            hosts,
-            headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
-            **kwargs,
-        )
+        self._client = None
         self._index = index
         self._embedding_similarity_function = embedding_similarity_function
         self._custom_mapping = custom_mapping
         self._kwargs = kwargs
-        # Check client connection, this will raise if not connected
-        self._client.info()
         if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
             msg = "custom_mapping must be a dictionary"
             raise ValueError(msg)
-        if self._custom_mapping:
-            mappings = self._custom_mapping
-        else:
-            # Configure mapping for the embedding field if none is provided
-            mappings = {
-                "properties": {
-                    "embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function},
-                    "content": {"type": "text"},
-                },
-                "dynamic_templates": [
-                    {
-                        "strings": {
-                            "path_match": "*",
-                            "match_mapping_type": "string",
-                            "mapping": {
-                                "type": "keyword",
-                            },
+    @property
+    def client(self) -> Elasticsearch:
+        if self._client is None:
+            client = Elasticsearch(
+                self._hosts,
+                headers={"user-agent": f"haystack-py-ds/{haystack_version}"},
+                **self._kwargs,
+            )
+            # Check client connection, this will raise if not connected
+            client.info()
+            if self._custom_mapping:
+                mappings = self._custom_mapping
+            else:
+                # Configure mapping for the embedding field if none is provided
+                mappings = {
+                    "properties": {
+                        "embedding": {
+                            "type": "dense_vector",
+                            "index": True,
+                            "similarity": self._embedding_similarity_function,
+                        },
+                        "content": {"type": "text"},
+                    },
+                    "dynamic_templates": [
+                        {
+                            "strings": {
+                                "path_match": "*",
+                                "match_mapping_type": "string",
+                                "mapping": {
+                                    "type": "keyword",
+                                },
+                            }
                         }
-                    }
-                ],
-            }
+                    ],
+                }
+            # Create the index if it doesn't exist
+            if not client.indices.exists(index=self._index):
+                client.indices.create(index=self._index, mappings=mappings)
+            self._client = client
-        # Create the index if it doesn't exist
-        if not self._client.indices.exists(index=index):
-            self._client.indices.create(index=index, mappings=mappings)
+        return self._client
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -172,7 +184,7 @@ class ElasticsearchDocumentStore:
         Returns how many documents are present in the document store.
         :returns: Number of documents in the document store.
         """
-        return self._client.count(index=self._index)["count"]
+        return self.client.count(index=self._index)["count"]
     def _search_documents(self, **kwargs) -> List[Document]:
         """
@@ -187,7 +199,7 @@ class ElasticsearchDocumentStore:
         from_ = 0
         # Handle pagination
         while True:
-            res = self._client.search(
+            res = self.client.search(
                 index=self._index,
                 from_=from_,
                 **kwargs,
@@ -261,7 +273,7 @@ class ElasticsearchDocumentStore:
             )
         documents_written, errors = helpers.bulk(
-            client=self._client,
+            client=self.client,
             actions=elasticsearch_actions,
             refresh="wait_for",
             index=self._index,
@@ -317,7 +329,7 @@ class ElasticsearchDocumentStore:
         """
         helpers.bulk(
-            client=self._client,
+            client=self.client,
             actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
             refresh="wait_for",
             index=self._index,

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_bm25_retriever.py RENAMED Viewed

@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from unittest.mock import Mock, patch
+import pytest
 from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
 from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
 from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
@@ -14,8 +16,15 @@ def test_init_default():
     assert retriever._document_store == mock_store
     assert retriever._filters == {}
     assert retriever._top_k == 10
+    assert retriever._filter_policy == FilterPolicy.REPLACE
     assert not retriever._scale_score
+    retriever = ElasticsearchBM25Retriever(document_store=mock_store, filter_policy="replace")
+    assert retriever._filter_policy == FilterPolicy.REPLACE
+    with pytest.raises(ValueError):
+        ElasticsearchBM25Retriever(document_store=mock_store, filter_policy="keep")
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
 def test_to_dict(_mock_elasticsearch_client):
@@ -38,6 +47,7 @@ def test_to_dict(_mock_elasticsearch_client):
             "fuzziness": "AUTO",
             "top_k": 10,
             "scale_score": False,
+            "filter_policy": "replace",
         },
     }
@@ -55,6 +65,7 @@ def test_from_dict(_mock_elasticsearch_client):
             "fuzziness": "AUTO",
             "top_k": 10,
             "scale_score": True,
+            "filter_policy": "replace",
         },
     }
     retriever = ElasticsearchBM25Retriever.from_dict(data)
@@ -63,6 +74,7 @@ def test_from_dict(_mock_elasticsearch_client):
     assert retriever._fuzziness == "AUTO"
     assert retriever._top_k == 10
     assert retriever._scale_score
+    assert retriever._filter_policy == FilterPolicy.REPLACE
 def test_run():

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_document_store.py RENAMED Viewed

@@ -15,6 +15,12 @@ from haystack.testing.document_store import DocumentStoreBaseTests
 from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
+def test_init_is_lazy(_mock_es_client):
+    ElasticsearchDocumentStore(hosts="testhost")
+    _mock_es_client.assert_not_called()
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
 def test_to_dict(_mock_elasticsearch_client):
     document_store = ElasticsearchDocumentStore(hosts="some hosts")
@@ -73,7 +79,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
             hosts=hosts, index=index, embedding_similarity_function=embedding_similarity_function
         )
         yield store
-        store._client.options(ignore_status=[400, 404]).indices.delete(index=index)
+        store.client.options(ignore_status=[400, 404]).indices.delete(index=index)
     def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
         """
@@ -101,7 +107,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
         super().assert_documents_are_equal(received, expected)
     def test_user_agent_header(self, document_store: ElasticsearchDocumentStore):
-        assert document_store._client._headers["user-agent"].startswith("haystack-py-ds/")
+        assert document_store.client._headers["user-agent"].startswith("haystack-py-ds/")
     def test_write_documents(self, document_store: ElasticsearchDocumentStore):
         docs = [Document(id="1")]
@@ -308,7 +314,7 @@ class TestDocumentStore(DocumentStoreBaseTests):
         )
         mock_elasticsearch.return_value = mock_client
-        ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping)
+        _ = ElasticsearchDocumentStore(hosts="some hosts", custom_mapping=custom_mapping).client
         mock_client.indices.create.assert_called_once_with(
             index="default",
             mappings=custom_mapping,

{elasticsearch_haystack-0.5.0 → elasticsearch_haystack-0.7.0}/tests/test_embedding_retriever.py RENAMED Viewed

@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from unittest.mock import Mock, patch
+import pytest
 from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
 from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
 from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
@@ -16,6 +18,12 @@ def test_init_default():
     assert retriever._top_k == 10
     assert retriever._num_candidates is None
+    retriever = ElasticsearchEmbeddingRetriever(document_store=mock_store, filter_policy="replace")
+    assert retriever._filter_policy == FilterPolicy.REPLACE
+    with pytest.raises(ValueError):
+        ElasticsearchEmbeddingRetriever(document_store=mock_store, filter_policy="keep")
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
 def test_to_dict(_mock_elasticsearch_client):
@@ -37,6 +45,7 @@ def test_to_dict(_mock_elasticsearch_client):
             },
             "filters": {},
             "top_k": 10,
+            "filter_policy": "replace",
             "num_candidates": None,
         },
     }
@@ -54,6 +63,7 @@ def test_from_dict(_mock_elasticsearch_client):
             },
             "filters": {},
             "top_k": 10,
+            "filter_policy": "replace",
             "num_candidates": None,
         },
     }