PyPI - elasticsearch-haystack - Versions diffs - 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

elasticsearch-haystack 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of elasticsearch-haystack might be problematic. Click here for more details.

Files changed (9) hide show

{elasticsearch_haystack-1.0.1.dist-info → elasticsearch_haystack-2.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: elasticsearch-haystack
-Version: 1.0.1
+Version: 2.1.0
 Summary: Haystack 2.x Document Store for ElasticSearch
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -11,13 +11,12 @@ License-File: LICENSE
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Requires-Dist: elasticsearch<9,>=8
 Requires-Dist: haystack-ai
 Description-Content-Type: text/markdown

elasticsearch_haystack-2.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+haystack_integrations/components/retrievers/elasticsearch/__init__.py,sha256=cSJBsYjz_T4kK-M-auAHVUnYIcgUqqwwQe_hsF0_IG4,307
+haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha256=ISHc6elYXoDXDvC62_3bMMCk_Dv67jvZIgQBCZ1ZHdw,7012
+haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=jHDLMeecpf-DhvbRM1AAq2kIJn7xMNTR9vkm-FhHH7k,7332
+haystack_integrations/document_stores/elasticsearch/__init__.py,sha256=YTfu94dtVUBogbJFr1aJrKuaI6-Bw9VuHfPoyU7M8os,207
+haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=xzMcKhWfVBZUxzzpchcsAf8qWjux-PfZ4zqa8kd4Hcg,28825
+haystack_integrations/document_stores/elasticsearch/filters.py,sha256=Umip-PP4uFjuWeB1JWkKhaKClQ0VpiykoDlDu99wIV0,9759
+elasticsearch_haystack-2.1.0.dist-info/METADATA,sha256=nemE4-8L0_hMZTDkLk6ubi2p4kT1ESaX_OLHD_8QQnQ,2118
+elasticsearch_haystack-2.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+elasticsearch_haystack-2.1.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
+elasticsearch_haystack-2.1.0.dist-info/RECORD,,

{elasticsearch_haystack-1.0.1.dist-info → elasticsearch_haystack-2.1.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.27.0
 Root-Is-Purelib: true
 Tag: py3-none-any

haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py CHANGED Viewed

@@ -120,7 +120,7 @@ class ElasticsearchBM25Retriever:
         """
         Retrieve documents using the BM25 keyword-based algorithm.
-        :param query: String to search in `Document`s' text.
+        :param query: String to search in the `Document`s text.
         :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
                         the `filter_policy` chosen at retriever initialization. See init method docstring for more
                         details.
@@ -137,3 +137,26 @@ class ElasticsearchBM25Retriever:
             scale_score=self._scale_score,
         )
         return {"documents": docs}
+    @component.output_types(documents=List[Document])
+    async def run_async(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
+        """
+        Asynchronously retrieve documents using the BM25 keyword-based algorithm.
+        :param query: String to search in the `Document` text.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
+                        details.
+        :param top_k: Maximum number of `Document` to return.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of `Document`s that match the query.
+        """
+        filters = apply_filter_policy(self._filter_policy, self._filters, filters)
+        docs = await self._document_store._bm25_retrieval_async(
+            query=query,
+            filters=filters,
+            fuzziness=self._fuzziness,
+            top_k=top_k or self._top_k,
+            scale_score=self._scale_score,
+        )
+        return {"documents": docs}

haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py CHANGED Viewed

@@ -119,10 +119,11 @@ class ElasticsearchEmbeddingRetriever:
         Retrieve documents using a vector similarity metric.
         :param query_embedding: Embedding of the query.
-        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
-                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
-                        details.
-        :param top_k: Maximum number of `Document`s to return.
+        :param filters: Filters applied when fetching documents from the Document Store.
+            Filters are applied during the approximate kNN search to ensure the Retriever returns
+              `top_k` matching documents.
+            The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
+        :param top_k: Maximum number of documents to return.
         :returns: A dictionary with the following keys:
             - `documents`: List of `Document`s most similar to the given `query_embedding`
         """
@@ -134,3 +135,28 @@ class ElasticsearchEmbeddingRetriever:
             num_candidates=self._num_candidates,
         )
         return {"documents": docs}
+    @component.output_types(documents=List[Document])
+    async def run_async(
+        self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None
+    ):
+        """
+        Asynchronously retrieve documents using a vector similarity metric.
+        :param query_embedding: Embedding of the query.
+        :param filters: Filters applied when fetching documents from the Document Store.
+            Filters are applied during the approximate kNN search to ensure the Retriever returns
+              `top_k` matching documents.
+            The way runtime filters are applied depends on the `filter_policy` selected when initializing the Retriever.
+        :param top_k: Maximum number of documents to return.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of `Document`s that match the query.
+        """
+        filters = apply_filter_policy(self._filter_policy, self._filters, filters)
+        docs = await self._document_store._embedding_retrieval_async(
+            query_embedding=query_embedding,
+            filters=filters,
+            top_k=top_k or self._top_k,
+            num_candidates=self._num_candidates,
+        )
+        return {"documents": docs}

haystack_integrations/document_stores/elasticsearch/document_store.py CHANGED Viewed

@@ -2,7 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
-from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+from collections.abc import Mapping
+from typing import Any, Dict, List, Literal, Optional, Union
 import numpy as np
@@ -14,7 +15,7 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumen
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.version import __version__ as haystack_version
-from elasticsearch import Elasticsearch, helpers  # type: ignore[import-not-found]
+from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers  # type: ignore[import-not-found]
 from .filters import _normalize_filters
@@ -30,6 +31,7 @@ Hosts = Union[str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]]]
 # Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly
 # all be mapped to scores ~1.
 BM25_SCALING_FACTOR = 8
+DOC_ALREADY_EXISTS = 409
 class ElasticsearchDocumentStore:
@@ -93,28 +95,39 @@ class ElasticsearchDocumentStore:
         """
         self._hosts = hosts
         self._client = None
+        self._async_client = None
         self._index = index
         self._embedding_similarity_function = embedding_similarity_function
         self._custom_mapping = custom_mapping
         self._kwargs = kwargs
+        self._initialized = False
         if self._custom_mapping and not isinstance(self._custom_mapping, Dict):
             msg = "custom_mapping must be a dictionary"
             raise ValueError(msg)
-    @property
-    def client(self) -> Elasticsearch:
-        if self._client is None:
+    def _ensure_initialized(self):
+        """
+        Ensures both sync and async clients are initialized and the index exists.
+        """
+        if not self._initialized:
             headers = self._kwargs.pop("headers", {})
             headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
-            client = Elasticsearch(
+            # Initialize both sync and async clients
+            self._client = Elasticsearch(
                 self._hosts,
                 headers=headers,
                 **self._kwargs,
             )
+            self._async_client = AsyncElasticsearch(
+                self._hosts,
+                headers=headers,
+                **self._kwargs,
+            )
             # Check client connection, this will raise if not connected
-            client.info()
+            self._client.info()
             if self._custom_mapping:
                 mappings = self._custom_mapping
@@ -143,13 +156,27 @@ class ElasticsearchDocumentStore:
                 }
             # Create the index if it doesn't exist
-            if not client.indices.exists(index=self._index):
-                client.indices.create(index=self._index, mappings=mappings)
+            if not self._client.indices.exists(index=self._index):
+                self._client.indices.create(index=self._index, mappings=mappings)
-            self._client = client
+            self._initialized = True
+    @property
+    def client(self) -> Elasticsearch:
+        """
+        Returns the synchronous Elasticsearch client, initializing it if necessary.
+        """
+        self._ensure_initialized()
         return self._client
+    @property
+    def async_client(self) -> AsyncElasticsearch:
+        """
+        Returns the asynchronous Elasticsearch client, initializing it if necessary.
+        """
+        self._ensure_initialized()
+        return self._async_client
     def to_dict(self) -> Dict[str, Any]:
         """
         Serializes the component to a dictionary.
@@ -184,15 +211,26 @@ class ElasticsearchDocumentStore:
     def count_documents(self) -> int:
         """
         Returns how many documents are present in the document store.
-        :returns: Number of documents in the document store.
+        :returns:
+            Number of documents in the document store.
         """
+        self._ensure_initialized()
         return self.client.count(index=self._index)["count"]
+    async def count_documents_async(self) -> int:
+        """
+        Asynchronously returns how many documents are present in the document store.
+        :returns: Number of documents in the document store.
+        """
+        self._ensure_initialized()
+        result = await self._async_client.count(index=self._index)  # type: ignore
+        return result["count"]
     def _search_documents(self, **kwargs) -> List[Document]:
         """
         Calls the Elasticsearch client's search method and handles pagination.
         """
         top_k = kwargs.get("size")
         if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
             top_k = kwargs["knn"]["k"]
@@ -207,13 +245,38 @@ class ElasticsearchDocumentStore:
                 **kwargs,
             )
-            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
+            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])  # type: ignore
+            from_ = len(documents)
+            if top_k is not None and from_ >= top_k:
+                break
+            if from_ >= res["hits"]["total"]["value"]:
+                break
+        return documents
+    async def _search_documents_async(self, **kwargs) -> List[Document]:
+        """
+        Asynchronously calls the Elasticsearch client's search method and handles pagination.
+        """
+        top_k = kwargs.get("size")
+        if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
+            top_k = kwargs["knn"]["k"]
+        documents: List[Document] = []
+        from_ = 0
+        # handle pagination
+        while True:
+            res = await self._async_client.search(index=self._index, from_=from_, **kwargs)  # type: ignore
+            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])  # type: ignore
             from_ = len(documents)
             if top_k is not None and from_ >= top_k:
                 break
             if from_ >= res["hits"]["total"]["value"]:
                 break
         return documents
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
@@ -229,10 +292,54 @@ class ElasticsearchDocumentStore:
             msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
             raise ValueError(msg)
+        self._ensure_initialized()
         query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
         documents = self._search_documents(query=query)
         return documents
+    async def filter_documents_async(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """
+        Asynchronously retrieves all documents that match the filters.
+        :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
+            see the official Elasticsearch
+            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
+        :returns: List of `Document`s that match the filters.
+        """
+        if filters and "operator" not in filters and "conditions" not in filters:
+            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
+            raise ValueError(msg)
+        self._ensure_initialized()
+        query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
+        documents = await self._search_documents_async(query=query)
+        return documents
+    @staticmethod
+    def _deserialize_document(hit: Dict[str, Any]) -> Document:
+        """
+        Creates a `Document` from the search hit provided.
+        This is mostly useful in self.filter_documents().
+        :param hit: A search hit from Elasticsearch.
+        :returns: `Document` created from the search hit.
+        """
+        data = hit["_source"]
+        if "highlight" in hit:
+            data["metadata"]["highlighted"] = hit["highlight"]
+        data["score"] = hit["_score"]
+        if "dataframe" in data:
+            dataframe = data.pop("dataframe")
+            if dataframe:
+                logger.warning(
+                    "Document %s has the `dataframe` field set,"
+                    "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                    "The `dataframe` field will soon be removed from Haystack Document.",
+                    data["id"],
+                )
+        return Document.from_dict(data)
     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
         """
         Writes `Document`s to Elasticsearch.
@@ -258,6 +365,15 @@ class ElasticsearchDocumentStore:
         elasticsearch_actions = []
         for doc in documents:
             doc_dict = doc.to_dict()
+            if "dataframe" in doc_dict:
+                dataframe = doc_dict.pop("dataframe")
+                if dataframe:
+                    logger.warning(
+                        "Document %s has the `dataframe` field set,"
+                        "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                        "The `dataframe` field will soon be removed from Haystack Document.",
+                        doc.id,
+                    )
             if "sparse_embedding" in doc_dict:
                 sparse_embedding = doc_dict.pop("sparse_embedding", None)
                 if sparse_embedding:
@@ -306,31 +422,86 @@ class ElasticsearchDocumentStore:
         return documents_written
-    @staticmethod
-    def _deserialize_document(hit: Dict[str, Any]) -> Document:
+    async def write_documents_async(
+        self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE
+    ) -> int:
         """
-        Creates a `Document` from the search hit provided.
+        Asynchronously writes `Document`s to Elasticsearch.
-        This is mostly useful in self.filter_documents().
-        :param hit: A search hit from Elasticsearch.
-        :returns: `Document` created from the search hit.
+        :param documents: List of Documents to write to the document store.
+        :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
+        :raises ValueError: If `documents` is not a list of `Document`s.
+        :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
+            `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
+        :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
+        :returns: Number of documents written to the document store.
         """
-        data = hit["_source"]
+        self._ensure_initialized()
-        if "highlight" in hit:
-            data["metadata"]["highlighted"] = hit["highlight"]
-        data["score"] = hit["_score"]
+        if len(documents) > 0:
+            if not isinstance(documents[0], Document):
+                msg = "param 'documents' must contain a list of objects of type Document"
+                raise ValueError(msg)
-        return Document.from_dict(data)
+        if policy == DuplicatePolicy.NONE:
+            policy = DuplicatePolicy.FAIL
+        actions = []
+        for doc in documents:
+            doc_dict = doc.to_dict()
+            if "dataframe" in doc_dict:
+                dataframe = doc_dict.pop("dataframe")
+                if dataframe:
+                    logger.warning(
+                        "Document {id} has the `dataframe` field set,"
+                        "ElasticsearchDocumentStore no longer supports dataframes and this field will be ignored. "
+                        "The `dataframe` field will soon be removed from Haystack Document.",
+                    )
+            if "sparse_embedding" in doc_dict:
+                sparse_embedding = doc_dict.pop("sparse_embedding", None)
+                if sparse_embedding:
+                    logger.warning(
+                        "Document %s has the `sparse_embedding` field set,"
+                        "but storing sparse embeddings in Elasticsearch is not currently supported."
+                        "The `sparse_embedding` field will be ignored.",
+                        doc.id,
+                    )
+            action = {
+                "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
+                "_id": doc.id,
+                "_source": doc_dict,
+            }
+            actions.append(action)
+        try:
+            success, failed = await helpers.async_bulk(
+                client=self._async_client,
+                actions=actions,
+                index=self._index,
+                refresh=True,
+                raise_on_error=False,
+            )
+            if failed:
+                if policy == DuplicatePolicy.FAIL:
+                    for error in failed:
+                        if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
+                            msg = f"ID '{error['create']['_id']}' already exists in the document store"
+                            raise DuplicateDocumentError(msg)
+                msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
+                raise DocumentStoreError(msg)
+            return success
+        except Exception as e:
+            msg = f"Failed to write documents to Elasticsearch: {e!s}"
+            raise DocumentStoreError(msg) from e
     def delete_documents(self, document_ids: List[str]) -> None:
         """
-        Deletes all `Document`s with a matching `document_ids` from the document store.
+        Deletes all documents with a matching document_ids from the document store.
-        :param document_ids: the object IDs to delete
+        :param document_ids: the document ids to delete
         """
         helpers.bulk(
             client=self.client,
             actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
@@ -339,6 +510,25 @@ class ElasticsearchDocumentStore:
             raise_on_error=False,
         )
+    async def delete_documents_async(self, document_ids: List[str]) -> None:
+        """
+        Asynchronously deletes all documents with a matching document_ids from the document store.
+        :param document_ids: the document ids to delete
+        """
+        self._ensure_initialized()
+        try:
+            await helpers.async_bulk(
+                client=self._async_client,
+                actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
+                index=self._index,
+                refresh=True,
+            )
+        except Exception as e:
+            msg = f"Failed to delete documents from Elasticsearch: {e!s}"
+            raise DocumentStoreError(msg) from e
     def _bm25_retrieval(
         self,
         query: str,
@@ -349,27 +539,15 @@ class ElasticsearchDocumentStore:
         scale_score: bool = False,
     ) -> List[Document]:
         """
-        Retrieves `Document`s from Elasticsearch using the BM25 search algorithm.
-        Even though this method is called `bm25_retrieval` it searches for `query`
-        using the search algorithm `_client` was configured with.
-        This method is not meant to be part of the public interface of
-        `ElasticsearchDocumentStore` nor called directly.
-        `ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
-        :param query: String to search in saved `Document`s' text.
-        :param filters: Filters applied to the retrieved `Document`s, for more info
-                        see `ElasticsearchDocumentStore.filter_documents`.
-        :param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
-            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
-            for valid values.
-        :param top_k: Maximum number of `Document`s to return.
-        :param scale_score: If `True` scales the `Document``s scores between 0 and 1.
-        :raises ValueError: If `query` is an empty string
-        :returns: List of `Document` that match `query`
+        Retrieves documents using BM25 retrieval.
+        :param query: The query string to search for
+        :param filters: Optional filters to narrow down the search space
+        :param fuzziness: Fuzziness parameter for the search query
+        :param top_k: Maximum number of documents to return
+        :param scale_score: Whether to scale the similarity score to the range [0,1]
+        :returns: List of Documents that match the query
         """
         if not query:
             msg = "query must be a non empty string"
             raise ValueError(msg)
@@ -403,35 +581,79 @@ class ElasticsearchDocumentStore:
         return documents
-    def _embedding_retrieval(
+    async def _bm25_retrieval_async(
         self,
-        query_embedding: List[float],
+        query: str,
         *,
         filters: Optional[Dict[str, Any]] = None,
+        fuzziness: str = "AUTO",
         top_k: int = 10,
-        num_candidates: Optional[int] = None,
+        scale_score: bool = False,
     ) -> List[Document]:
         """
-        Retrieves documents that are most similar to the query embedding using a vector similarity metric.
+        Asynchronously retrieves documents using BM25 retrieval.
+        :param query: The query string to search for
+        :param filters: Optional filters to narrow down the search space
+        :param fuzziness: Fuzziness parameter for the search query
+        :param top_k: Maximum number of documents to return
+        :param scale_score: Whether to scale the similarity score to the range [0,1]
+        :returns: List of Documents that match the query
+        """
+        self._ensure_initialized()
+        if not query:
+            msg = "query must be a non empty string"
+            raise ValueError(msg)
+        # Prepare the search body
+        search_body = {
+            "size": top_k,
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "multi_match": {
+                                "query": query,
+                                "type": "most_fields",
+                                "operator": "OR",
+                                "fuzziness": fuzziness,
+                            }
+                        }
+                    ]
+                }
+            },
+        }
+        if filters:
+            search_body["query"]["bool"]["filter"] = _normalize_filters(filters)  # type:ignore
-        It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm.
+        documents = await self._search_documents_async(**search_body)
+        if scale_score:
+            for doc in documents:
+                if doc.score is not None:
+                    doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))
-        This method is not meant to be part of the public interface of
-        `ElasticsearchDocumentStore` nor called directly.
-        `ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
+        return documents
-        :param query_embedding: Embedding of the query.
-        :param filters: Filters applied to the retrieved `Document`s.
-            Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
-        :param top_k: Maximum number of `Document`s to return.
-        :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
-            Increasing this value will improve search accuracy at the cost of slower search speeds.
-            You can read more about it in the Elasticsearch
-            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
-        :raises ValueError: If `query_embedding` is an empty list.
-        :returns: List of `Document` that are most similar to `query_embedding`.
+    def _embedding_retrieval(
+        self,
+        query_embedding: List[float],
+        *,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10,
+        num_candidates: Optional[int] = None,
+    ) -> List[Document]:
         """
+        Retrieves documents using dense vector similarity search.
+        :param query_embedding: Embedding vector to search for
+        :param filters: Optional filters to narrow down the search space
+        :param top_k: Maximum number of documents to return
+        :param num_candidates: Number of candidates to consider in the search
+        :returns: List of Documents most similar to query_embedding
+        """
         if not query_embedding:
             msg = "query_embedding must be a non-empty list of floats"
             raise ValueError(msg)
@@ -453,3 +675,45 @@ class ElasticsearchDocumentStore:
         docs = self._search_documents(**body)
         return docs
+    async def _embedding_retrieval_async(
+        self,
+        query_embedding: List[float],
+        *,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10,
+        num_candidates: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously retrieves documents using dense vector similarity search.
+        :param query_embedding: Embedding vector to search for
+        :param filters: Optional filters to narrow down the search space
+        :param top_k: Maximum number of documents to return
+        :param num_candidates: Number of candidates to consider in the search
+        :returns: List of Documents most similar to query_embedding
+        """
+        self._ensure_initialized()
+        if not query_embedding:
+            msg = "query_embedding must be a non-empty list of floats"
+            raise ValueError(msg)
+        # If num_candidates is not set, use top_k * 10 as default
+        if num_candidates is None:
+            num_candidates = top_k * 10
+        # Prepare the search body
+        search_body = {
+            "knn": {
+                "field": "embedding",
+                "query_vector": query_embedding,
+                "k": top_k,
+                "num_candidates": num_candidates,
+            },
+        }
+        if filters:
+            search_body["knn"]["filter"] = _normalize_filters(filters)
+        return await self._search_documents_async(**search_body)

haystack_integrations/document_stores/elasticsearch/filters.py CHANGED Viewed

@@ -5,7 +5,6 @@ from datetime import datetime
 from typing import Any, Dict, List
 from haystack.errors import FilterError
-from pandas import DataFrame
 def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
@@ -57,7 +56,7 @@ def _equal(field: str, value: Any) -> Dict[str, Any]:
                 }
             }
         }
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"match": {field: {"query": value, "minimum_should_match": "100%"}}}
     return {"term": {field: value}}
@@ -69,7 +68,7 @@ def _not_equal(field: str, value: Any) -> Dict[str, Any]:
     if isinstance(value, list):
         return {"bool": {"must_not": {"terms": {field: value}}}}
-    if field in ["text", "dataframe"]:
+    if field == "text":
         # We want to fully match the text field.
         return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}}
@@ -92,7 +91,7 @@ def _greater_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gt": value}}}
@@ -114,7 +113,7 @@ def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"gte": value}}}
@@ -136,7 +135,7 @@ def _less_than(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lt": value}}}
@@ -158,7 +157,7 @@ def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(value) in [list, DataFrame]:
+    if isinstance(value, list):
         msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
         raise FilterError(msg)
     return {"range": {field: {"lte": value}}}
@@ -212,8 +211,6 @@ def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
         raise FilterError(msg)
     operator: str = condition["operator"]
     value: Any = condition["value"]
-    if isinstance(value, DataFrame):
-        value = value.to_json()
     return COMPARISON_OPERATORS[operator](field, value)

elasticsearch_haystack-1.0.1.dist-info/RECORD DELETED Viewed

@@ -1,10 +0,0 @@
-haystack_integrations/components/retrievers/elasticsearch/__init__.py,sha256=cSJBsYjz_T4kK-M-auAHVUnYIcgUqqwwQe_hsF0_IG4,307
-haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha256=XA6UiNFb59CMM5LSoPmNDe3IzZ7ty7HViSaU2ZT4--w,5851
-haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=ZL9kHi6tCzks1_GXoOIRVLcN4BWnaMqN6t-JcwdTfao,5992
-haystack_integrations/document_stores/elasticsearch/__init__.py,sha256=YTfu94dtVUBogbJFr1aJrKuaI6-Bw9VuHfPoyU7M8os,207
-haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=B2B0F2AHsoP1-BykF_xqfRAYeQPmsiBn0QCIfTqk-pc,18871
-haystack_integrations/document_stores/elasticsearch/filters.py,sha256=L1tN7YCIDuNdhGrBQdPoqXFk37x__2-K038xZ6PRdNQ,9923
-elasticsearch_haystack-1.0.1.dist-info/METADATA,sha256=hTImF5-zddncU9m31MLAS4eDtlShxI_gb5lgQFMlCbI,2168
-elasticsearch_haystack-1.0.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-elasticsearch_haystack-1.0.1.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
-elasticsearch_haystack-1.0.1.dist-info/RECORD,,

{elasticsearch_haystack-1.0.1.dist-info → elasticsearch_haystack-2.1.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

elasticsearch-haystack 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

Potentially problematic release.

elasticsearch-haystack 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl