qdrant-haystack 9.0.0__py3-none-any.whl → 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_integrations/components/retrievers/qdrant/retriever.py +142 -0
- haystack_integrations/document_stores/qdrant/document_store.py +901 -227
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/METADATA +2 -1
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/RECORD +6 -6
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/WHEEL +0 -0
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/licenses/LICENSE.txt +0 -0
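
The substance of the release is in `haystack_integrations/document_stores/qdrant/document_store.py`, which is where all of the hunks below come from: the Qdrant client is now built lazily from `_prepare_client_params()` via `_initialize_client()`, an `AsyncQdrantClient` counterpart is added, and the public API gains async variants (`count_documents_async`, `filter_documents_async`, `write_documents_async`, `delete_documents_async`, `get_documents_by_id_async`). A minimal usage sketch of the new async surface, assuming it mirrors the synchronous methods; the in-memory location, embedding size, and document content here are illustrative and not taken from the diff:

```python
import asyncio

from haystack import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


async def main():
    # Illustrative local configuration; any QdrantDocumentStore settings apply.
    store = QdrantDocumentStore(location=":memory:", embedding_dim=4, recreate_index=True)

    # The AsyncQdrantClient is created lazily on the first *_async call.
    await store.write_documents_async([Document(content="hello", embedding=[0.1, 0.2, 0.3, 0.4])])
    print(await store.count_documents_async())  # expected: 1


asyncio.run(main())
```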
@@ -1,6 +1,6 @@
 import inspect
 from itertools import islice
-from typing import Any, ClassVar, Dict, Generator, List, Optional, Set, Union
+from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Union

 import numpy as np
 import qdrant_client
@@ -216,6 +216,7 @@ class QdrantDocumentStore:
         """

         self._client = None
+        self._async_client = None

         # Store the Qdrant client specific attributes
         self.location = location
@@ -257,24 +258,10 @@ class QdrantDocumentStore:
         self.write_batch_size = write_batch_size
         self.scroll_size = scroll_size

-
-
-
-        self._client = qdrant_client.QdrantClient(
-            location=self.location,
-            url=self.url,
-            port=self.port,
-            grpc_port=self.grpc_port,
-            prefer_grpc=self.prefer_grpc,
-            https=self.https,
-            api_key=self.api_key.resolve_value() if self.api_key else None,
-            prefix=self.prefix,
-            timeout=self.timeout,
-            host=self.host,
-            path=self.path,
-            metadata=self.metadata,
-            force_disable_check_same_thread=self.force_disable_check_same_thread,
-        )
+    def _initialize_client(self):
+        if self._client is None:
+            client_params = self._prepare_client_params()
+            self._client = qdrant_client.QdrantClient(**client_params)
         # Make sure the collection is properly set up
         self._set_up_collection(
             self.index,
@@ -286,14 +273,52 @@ class QdrantDocumentStore:
             self.on_disk,
             self.payload_fields_to_index,
         )
-
+
+    async def _initialize_async_client(self):
+        """
+        Returns the asynchronous Qdrant client, initializing it if necessary.
+        """
+        if self._async_client is None:
+            client_params = self._prepare_client_params()
+            self._async_client = qdrant_client.AsyncQdrantClient(
+                **client_params,
+            )
+            await self._set_up_collection_async(
+                self.index,
+                self.embedding_dim,
+                self.recreate_index,
+                self.similarity,
+                self.use_sparse_embeddings,
+                self.sparse_idf,
+                self.on_disk,
+                self.payload_fields_to_index,
+            )

     def count_documents(self) -> int:
         """
         Returns the number of documents present in the Document Store.
         """
+        self._initialize_client()
+        assert self._client is not None
+        try:
+            response = self._client.count(
+                collection_name=self.index,
+            )
+            return response.count
+        except (UnexpectedResponse, ValueError):
+            # Qdrant local raises ValueError if the collection is not found, but
+            # with the remote server UnexpectedResponse is raised. Until that's unified,
+            # we need to catch both.
+            return 0
+
+    async def count_documents_async(self) -> int:
+        """
+        Asynchronously returns the number of documents present in the document store.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
         try:
-            response = self.
+            response = await self._async_client.count(
                 collection_name=self.index,
             )
             return response.count
@@ -316,19 +341,29 @@ class QdrantDocumentStore:
         :param filters: The filters to apply to the document list.
         :returns: A list of documents that match the given filters.
         """
-
-
-            raise ValueError(msg)
+        # No need to initialize client here as _get_documents_generator
+        # will handle client initialization internally

-
-            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
-            raise ValueError(msg)
+        self._validate_filters(filters)
         return list(
-            self.
+            self._get_documents_generator(
                 filters,
             )
         )

+    async def filter_documents_async(
+        self,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously returns the documents that match the provided filters.
+        """
+        # No need to initialize client here as _get_documents_generator_async
+        # will handle client initialization internally
+
+        self._validate_filters(filters)
+        return [doc async for doc in self._get_documents_generator_async(filters)]
+
     def write_documents(
         self,
         documents: List[Document],
@@ -347,13 +382,14 @@ class QdrantDocumentStore:

         :returns: The number of documents written to the document store.
         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         for doc in documents:
             if not isinstance(doc, Document):
                 msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
                 raise ValueError(msg)
-        self._set_up_collection(
-            self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
-        )

         if len(documents) == 0:
             logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
@@ -372,7 +408,61 @@ class QdrantDocumentStore:
                     use_sparse_embeddings=self.use_sparse_embeddings,
                 )

-                self.
+                self._client.upsert(
+                    collection_name=self.index,
+                    points=batch,
+                    wait=self.wait_result_from_api,
+                )
+
+                progress_bar.update(self.write_batch_size)
+        return len(document_objects)
+
+    async def write_documents_async(
+        self,
+        documents: List[Document],
+        policy: DuplicatePolicy = DuplicatePolicy.FAIL,
+    ) -> int:
+        """
+        Asynchronously writes documents to Qdrant using the specified policy.
+        The QdrantDocumentStore can handle duplicate documents based on the given policy.
+        The available policies are:
+        - `FAIL`: The operation will raise an error if any document already exists.
+        - `OVERWRITE`: Existing documents will be overwritten with the new ones.
+        - `SKIP`: Existing documents will be skipped, and only new documents will be added.
+
+        :param documents: A list of Document objects to write to Qdrant.
+        :param policy: The policy for handling duplicate documents.
+
+        :returns: The number of documents written to the document store.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        for doc in documents:
+            if not isinstance(doc, Document):
+                msg = f"""DocumentStore.write_documents_async() expects a list of
+                Documents but got an element of {type(doc)}."""
+                raise ValueError(msg)
+
+        if len(documents) == 0:
+            logger.warning("Calling QdrantDocumentStore.write_documents_async() with empty list")
+            return 0
+
+        document_objects = await self._handle_duplicate_documents_async(
+            documents=documents,
+            policy=policy,
+        )
+
+        batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
+        with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
+            for document_batch in batched_documents:
+                batch = convert_haystack_documents_to_qdrant_points(
+                    document_batch,
+                    use_sparse_embeddings=self.use_sparse_embeddings,
+                )
+
+                await self._async_client.upsert(
                     collection_name=self.index,
                     points=batch,
                     wait=self.wait_result_from_api,
@@ -387,9 +477,13 @@ class QdrantDocumentStore:

         :param document_ids: the document ids to delete
         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         ids = [convert_id(_id) for _id in document_ids]
         try:
-            self.
+            self._client.delete(
                 collection_name=self.index,
                 points_selector=ids,
                 wait=self.wait_result_from_api,
@@ -399,6 +493,28 @@ class QdrantDocumentStore:
                 "Called QdrantDocumentStore.delete_documents() on a non-existing ID",
             )

+    async def delete_documents_async(self, document_ids: List[str]) -> None:
+        """
+        Asynchronously deletes documents that match the provided `document_ids` from the document store.
+
+        :param document_ids: the document ids to delete
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        ids = [convert_id(_id) for _id in document_ids]
+        try:
+            await self._async_client.delete(
+                collection_name=self.index,
+                points_selector=ids,
+                wait=self.wait_result_from_api,
+            )
+        except KeyError:
+            logger.warning(
+                "Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
+            )
+
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore":
         """
@@ -429,7 +545,7 @@ class QdrantDocumentStore:
             **init_params,
         )

-    def
+    def _get_documents_generator(
         self,
         filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
     ) -> Generator[Document, None, None]:
@@ -440,13 +556,53 @@ class QdrantDocumentStore:
         :returns: A generator that yields documents retrieved from Qdrant.
         """

+        self._initialize_client()
+        assert self._client is not None
+
+        index = self.index
+        qdrant_filters = convert_filters_to_qdrant(filters)
+
+        next_offset = None
+        stop_scrolling = False
+        while not stop_scrolling:
+            records, next_offset = self._client.scroll(
+                collection_name=index,
+                scroll_filter=qdrant_filters,
+                limit=self.scroll_size,
+                offset=next_offset,
+                with_payload=True,
+                with_vectors=True,
+            )
+            stop_scrolling = next_offset is None or (
+                isinstance(next_offset, grpc.PointId) and next_offset.num == 0 and next_offset.uuid == ""
+            )
+
+            for record in records:
+                yield convert_qdrant_point_to_haystack_document(
+                    record, use_sparse_embeddings=self.use_sparse_embeddings
+                )
+
+    async def _get_documents_generator_async(
+        self,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+    ) -> AsyncGenerator[Document, None]:
+        """
+        Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
+
+        :param filters: Filters applied to the retrieved documents.
+        :returns: An asynchronous generator that yields documents retrieved from Qdrant.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
         index = self.index
         qdrant_filters = convert_filters_to_qdrant(filters)

         next_offset = None
         stop_scrolling = False
         while not stop_scrolling:
-            records, next_offset = self.
+            records, next_offset = await self._async_client.scroll(
                 collection_name=index,
                 scroll_filter=qdrant_filters,
                 limit=self.scroll_size,
@@ -479,8 +635,44 @@ class QdrantDocumentStore:
         """
         documents: List[Document] = []

+        self._initialize_client()
+        assert self._client is not None
+
+        ids = [convert_id(_id) for _id in ids]
+        records = self._client.retrieve(
+            collection_name=self.index,
+            ids=ids,
+            with_payload=True,
+            with_vectors=True,
+        )
+
+        for record in records:
+            documents.append(
+                convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
+            )
+        return documents
+
+    async def get_documents_by_id_async(
+        self,
+        ids: List[str],
+    ) -> List[Document]:
+        """
+        Retrieves documents from Qdrant by their IDs.
+
+        :param ids:
+            A list of document IDs to retrieve.
+        :param index:
+            The name of the index to retrieve documents from.
+        :returns:
+            A list of documents.
+        """
+        documents: List[Document] = []
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
         ids = [convert_id(_id) for _id in ids]
-        records = self.
+        records = await self._async_client.retrieve(
             collection_name=self.index,
             ids=ids,
             with_payload=True,
@@ -526,6 +718,8 @@ class QdrantDocumentStore:
         :raises QdrantStoreError:
             If the Document Store was initialized with `use_sparse_embeddings=False`.
         """
+        self._initialize_client()
+        assert self._client is not None

         if not self.use_sparse_embeddings:
             message = (
@@ -538,7 +732,7 @@ class QdrantDocumentStore:
         query_indices = query_sparse_embedding.indices
         query_values = query_sparse_embedding.values
         if group_by:
-            groups = self.
+            groups = self._client.query_points_groups(
                 collection_name=self.index,
                 query=rest.SparseVector(
                     indices=query_indices,
@@ -552,17 +746,9 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).groups
-
-                [
-                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                    for group in groups
-                    for point in group.hits
-                ]
-                if groups
-                else []
-            )
+            return self._process_group_results(groups)
         else:
-            points = self.
+            points = self._client.query_points(
                 collection_name=self.index,
                 query=rest.SparseVector(
                     indices=query_indices,
@@ -574,16 +760,7 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).points
-
-                convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                for point in points
-            ]
-            if scale_score:
-                for document in results:
-                    score = document.score
-                    score = float(1 / (1 + np.exp(-score / 100)))
-                    document.score = score
-            return results
+            return self._process_query_point_results(points, scale_score=scale_score)

     def _query_by_embedding(
         self,
@@ -615,9 +792,12 @@ class QdrantDocumentStore:

         :returns: List of documents that are most similar to `query_embedding`.
         """
+        self._initialize_client()
+        assert self._client is not None
+
         qdrant_filters = convert_filters_to_qdrant(filters)
         if group_by:
-            groups = self.
+            groups = self._client.query_points_groups(
                 collection_name=self.index,
                 query=query_embedding,
                 using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -628,17 +808,10 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).groups
-
-
-                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                    for group in groups
-                    for point in group.hits
-                ]
-                if groups
-                else []
-            )
+            return self._process_group_results(groups)
+
         else:
-            points = self.
+            points = self._client.query_points(
                 collection_name=self.index,
                 query=query_embedding,
                 using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -647,20 +820,7 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).points
-
-                convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                for point in points
-            ]
-
-            if scale_score:
-                for document in results:
-                    score = document.score
-                    if self.similarity == "cosine":
-                        score = (score + 1) / 2
-                    else:
-                        score = float(1 / (1 + np.exp(-score / 100)))
-                    document.score = score
-            return results
+            return self._process_query_point_results(points, scale_score=scale_score)

     def _query_hybrid(
         self,
@@ -701,6 +861,10 @@ class QdrantDocumentStore:

         # This implementation is based on the code from the Python Qdrant client:
         # https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
+
+        self._initialize_client()
+        assert self._client is not None
+
         if not self.use_sparse_embeddings:
             message = (
                 "You are trying to query using sparse embeddings, but the Document Store "
@@ -712,7 +876,7 @@ class QdrantDocumentStore:

         try:
             if group_by:
-                groups = self.
+                groups = self._client.query_points_groups(
                     collection_name=self.index,
                     prefetch=[
                         rest.Prefetch(
@@ -738,7 +902,7 @@ class QdrantDocumentStore:
                     with_vectors=return_embedding,
                 ).groups
             else:
-                points = self.
+                points = self._client.query_points(
                     collection_name=self.index,
                     prefetch=[
                         rest.Prefetch(
@@ -767,71 +931,339 @@ class QdrantDocumentStore:
             raise QdrantStoreError(msg) from e

         if group_by:
-
-                [
-                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                    for group in groups
-                    for point in group.hits
-                ]
-                if groups
-                else []
-            )
+            return self._process_group_results(groups)
         else:
-
+            return self._process_query_point_results(points)

-
-
-
+    async def _query_by_sparse_async(
+        self,
+        query_sparse_embedding: SparseEmbedding,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+        top_k: int = 10,
+        scale_score: bool = False,
+        return_embedding: bool = False,
+        score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
+    ) -> List[Document]:
         """
-
+        Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
+
+        :param query_sparse_embedding: Sparse embedding of the query.
+        :param filters: Filters applied to the retrieved documents.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+            groups to return.
+        :param scale_score: Whether to scale the scores of the retrieved documents.
+        :param return_embedding: Whether to return the embeddings of the retrieved documents.
+        :param score_threshold: A minimal score threshold for the result.
+            Score of the returned result might be higher or smaller than the threshold
+            depending on the Distance function used.
+            E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
+
+        :returns: List of documents that are most similar to `query_sparse_embedding`.

-        :param similarity:
-            The similarity measure to retrieve the distance.
-        :returns:
-            The corresponding rest.Distance object.
         :raises QdrantStoreError:
-            If the
+            If the Document Store was initialized with `use_sparse_embeddings=False`.
         """
-        try:
-            return self.SIMILARITY[similarity]
-        except KeyError as ke:
-            msg = (
-                f"Provided similarity '{similarity}' is not supported by Qdrant "
-                f"document store. Please choose one of the options: "
-                f"{', '.join(self.SIMILARITY.keys())}"
-            )
-            raise QdrantStoreError(msg) from ke

-
-
-        Create payload index for the collection if payload_fields_to_index is provided
-        See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
-        """
-        if payload_fields_to_index is not None:
-            for payload_index in payload_fields_to_index:
-                self.client.create_payload_index(
-                    collection_name=collection_name,
-                    field_name=payload_index["field_name"],
-                    field_schema=payload_index["field_schema"],
-                )
+        await self._initialize_async_client()
+        assert self._async_client is not None

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not self.use_sparse_embeddings:
+            message = (
+                "You are trying to query using sparse embeddings, but the Document Store "
+                "was initialized with `use_sparse_embeddings=False`. "
+            )
+            raise QdrantStoreError(message)
+
+        qdrant_filters = convert_filters_to_qdrant(filters)
+        query_indices = query_sparse_embedding.indices
+        query_values = query_sparse_embedding.values
+        if group_by:
+            response = await self._async_client.query_points_groups(
+                collection_name=self.index,
+                query=rest.SparseVector(
+                    indices=query_indices,
+                    values=query_values,
+                ),
+                using=SPARSE_VECTORS_NAME,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                group_by=group_by,
+                group_size=group_size,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            groups = response.groups
+            return self._process_group_results(groups)
+        else:
+            response = await self._async_client.query_points(
+                collection_name=self.index,
+                query=rest.SparseVector(
+                    indices=query_indices,
+                    values=query_values,
+                ),
+                using=SPARSE_VECTORS_NAME,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            points = response.points
+            return self._process_query_point_results(points, scale_score=scale_score)
+
+    async def _query_by_embedding_async(
+        self,
+        query_embedding: List[float],
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+        top_k: int = 10,
+        scale_score: bool = False,
+        return_embedding: bool = False,
+        score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
+
+        :param query_embedding: Dense embedding of the query.
+        :param filters: Filters applied to the retrieved documents.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+            groups to return.
+        :param scale_score: Whether to scale the scores of the retrieved documents.
+        :param return_embedding: Whether to return the embeddings of the retrieved documents.
+        :param score_threshold: A minimal score threshold for the result.
+            Score of the returned result might be higher or smaller than the threshold
+            depending on the Distance function used.
+            E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
+
+        :returns: List of documents that are most similar to `query_embedding`.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        qdrant_filters = convert_filters_to_qdrant(filters)
+        if group_by:
+            response = await self._async_client.query_points_groups(
+                collection_name=self.index,
+                query=query_embedding,
+                using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                group_by=group_by,
+                group_size=group_size,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            groups = response.groups
+            return self._process_group_results(groups)
+        else:
+            response = await self._async_client.query_points(
+                collection_name=self.index,
+                query=query_embedding,
+                using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            points = response.points
+            return self._process_query_point_results(points, scale_score=scale_score)
+
+    async def _query_hybrid_async(
+        self,
+        query_embedding: List[float],
+        query_sparse_embedding: SparseEmbedding,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+        top_k: int = 10,
+        return_embedding: bool = False,
+        score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously retrieves documents based on dense and sparse embeddings and fuses
+        the results using Reciprocal Rank Fusion.
+
+        This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
+        Use the `QdrantHybridRetriever` instead.
+
+        :param query_embedding: Dense embedding of the query.
+        :param query_sparse_embedding: Sparse embedding of the query.
+        :param filters: Filters applied to the retrieved documents.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+            groups to return.
+        :param return_embedding: Whether to return the embeddings of the retrieved documents.
+        :param score_threshold: A minimal score threshold for the result.
+            Score of the returned result might be higher or smaller than the threshold
+            depending on the Distance function used.
+            E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
+
+        :returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
+
+        :raises QdrantStoreError:
+            If the Document Store was initialized with `use_sparse_embeddings=False`.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        if not self.use_sparse_embeddings:
+            message = (
+                "You are trying to query using sparse embeddings, but the Document Store "
+                "was initialized with `use_sparse_embeddings=False`. "
+            )
+            raise QdrantStoreError(message)
+
+        qdrant_filters = convert_filters_to_qdrant(filters)
+
+        try:
+            if group_by:
+                response = await self._async_client.query_points_groups(
+                    collection_name=self.index,
+                    prefetch=[
+                        rest.Prefetch(
+                            query=rest.SparseVector(
+                                indices=query_sparse_embedding.indices,
+                                values=query_sparse_embedding.values,
+                            ),
+                            using=SPARSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                        rest.Prefetch(
+                            query=query_embedding,
+                            using=DENSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                    ],
+                    query=rest.FusionQuery(fusion=rest.Fusion.RRF),
+                    limit=top_k,
+                    group_by=group_by,
+                    group_size=group_size,
+                    score_threshold=score_threshold,
+                    with_payload=True,
+                    with_vectors=return_embedding,
+                )
+                groups = response.groups
+            else:
+                response = await self._async_client.query_points(
+                    collection_name=self.index,
+                    prefetch=[
+                        rest.Prefetch(
+                            query=rest.SparseVector(
+                                indices=query_sparse_embedding.indices,
+                                values=query_sparse_embedding.values,
+                            ),
+                            using=SPARSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                        rest.Prefetch(
+                            query=query_embedding,
+                            using=DENSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                    ],
+                    query=rest.FusionQuery(fusion=rest.Fusion.RRF),
+                    limit=top_k,
+                    score_threshold=score_threshold,
+                    with_payload=True,
+                    with_vectors=return_embedding,
+                )
+                points = response.points
+
+        except Exception as e:
+            msg = "Error during hybrid search"
+            raise QdrantStoreError(msg) from e
+
+        if group_by:
+            return self._process_group_results(groups)
+        else:
+            return self._process_query_point_results(points)
+
+    def get_distance(self, similarity: str) -> rest.Distance:
+        """
+        Retrieves the distance metric for the specified similarity measure.
+
+        :param similarity:
+            The similarity measure to retrieve the distance.
+        :returns:
+            The corresponding rest.Distance object.
+        :raises QdrantStoreError:
+            If the provided similarity measure is not supported.
+        """
+        try:
+            return self.SIMILARITY[similarity]
+        except KeyError as ke:
+            msg = (
+                f"Provided similarity '{similarity}' is not supported by Qdrant "
+                f"document store. Please choose one of the options: "
+                f"{', '.join(self.SIMILARITY.keys())}"
+            )
+            raise QdrantStoreError(msg) from ke
+
+    def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
+        """
+        Create payload index for the collection if payload_fields_to_index is provided
+        See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
+        """
+        if payload_fields_to_index is not None:
+            for payload_index in payload_fields_to_index:
+                # self._client is initialized at this point
+                # since _initialize_client() is called before this method is executed
+
+                assert self._client is not None
+                self._client.create_payload_index(
+                    collection_name=collection_name,
+                    field_name=payload_index["field_name"],
+                    field_schema=payload_index["field_schema"],
+                )
+
+    async def _create_payload_index_async(
+        self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None
+    ):
+        """
+        Asynchronously create payload index for the collection if payload_fields_to_index is provided
+        See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
+        """
+        if payload_fields_to_index is not None:
+            for payload_index in payload_fields_to_index:
+
+                # self._async_client is initialized at this point
+                # since _initialize_async_client() is called before this method is executed
+                assert self._async_client is not None
+
+                await self._async_client.create_payload_index(
+                    collection_name=collection_name,
+                    field_name=payload_index["field_name"],
+                    field_schema=payload_index["field_schema"],
+                )
+
+    def _set_up_collection(
+        self,
+        collection_name: str,
+        embedding_dim: int,
+        recreate_collection: bool,
+        similarity: str,
+        use_sparse_embeddings: bool,
+        sparse_idf: bool,
+        on_disk: bool = False,
+        payload_fields_to_index: Optional[List[dict]] = None,
+    ):
+        """
+        Sets up the Qdrant collection with the specified parameters.
+        :param collection_name:
+            The name of the collection to set up.
+        :param embedding_dim:
+            The dimension of the embeddings.
         :param recreate_collection:
             Whether to recreate the collection if it already exists.
         :param similarity:
@@ -851,9 +1283,13 @@ class QdrantDocumentStore:
             If the collection exists with a different similarity measure or embedding dimension.

         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         distance = self.get_distance(similarity)

-        if recreate_collection or not self.
+        if recreate_collection or not self._client.collection_exists(collection_name):
             # There is no need to verify the current configuration of that
             # collection. It might be just recreated again or does not exist yet.
             self.recreate_collection(
@@ -863,64 +1299,65 @@ class QdrantDocumentStore:
             self._create_payload_index(collection_name, payload_fields_to_index)
             return

-        collection_info = self.
+        collection_info = self._client.get_collection(collection_name)

-
+        self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)

-
-
-
-
-
-
-
-
-
-
+    async def _set_up_collection_async(
+        self,
+        collection_name: str,
+        embedding_dim: int,
+        recreate_collection: bool,
+        similarity: str,
+        use_sparse_embeddings: bool,
+        sparse_idf: bool,
+        on_disk: bool = False,
+        payload_fields_to_index: Optional[List[dict]] = None,
+    ):
+        """
+        Asynchronously sets up the Qdrant collection with the specified parameters.
+        :param collection_name:
+            The name of the collection to set up.
+        :param embedding_dim:
+            The dimension of the embeddings.
+        :param recreate_collection:
+            Whether to recreate the collection if it already exists.
+        :param similarity:
+            The similarity measure to use.
+        :param use_sparse_embeddings:
+            Whether to use sparse embeddings.
+        :param sparse_idf:
+            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
+        :param on_disk:
+            Whether to store the collection on disk.
+        :param payload_fields_to_index:
+            List of payload fields to index.

-
-
-
-
-            f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
-            f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
-            f"See `migrate_to_sparse_embeddings_support` function in "
-            f"`haystack_integrations.document_stores.qdrant`."
-            )
-            raise QdrantStoreError(msg)
+        :raises QdrantStoreError:
+            If the collection exists with incompatible settings.
+        :raises ValueError:
+            If the collection exists with a different similarity measure or embedding dimension.

-
-            msg = (
-                f"Collection '{collection_name}' already exists in Qdrant, "
-                f"but it has been originally created with sparse embedding vectors."
-                f"If you want to use that collection, please set `use_sparse_embeddings=True`."
-            )
-            raise QdrantStoreError(msg)
+        """

-
-
-            current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
-        else:
-            current_distance = collection_info.config.params.vectors.distance
-            current_vector_size = collection_info.config.params.vectors.size
+        await self._initialize_async_client()
+        assert self._async_client is not None

-
-            msg = (
-                f"Collection '{collection_name}' already exists in Qdrant, "
-                f"but it is configured with a similarity '{current_distance.name}'. "
-                f"If you want to use that collection, but with a different "
-                f"similarity, please set `recreate_collection=True` argument."
-            )
-            raise ValueError(msg)
+        distance = self.get_distance(similarity)

-        if
-
-
-
-
-                f"vector size, please set `recreate_collection=True` argument."
+        if recreate_collection or not await self._async_client.collection_exists(collection_name):
+            # There is no need to verify the current configuration of that
+            # collection. It might be just recreated again or does not exist yet.
+            await self.recreate_collection_async(
+                collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
             )
-
+            # Create Payload index if payload_fields_to_index is provided
+            await self._create_payload_index_async(collection_name, payload_fields_to_index)
+            return
+
+        collection_info = await self._async_client.get_collection(collection_name)
+
+        self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)

     def recreate_collection(
         self,
@@ -947,44 +1384,65 @@ class QdrantDocumentStore:
         :param sparse_idf:
            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
         """
-
-        on_disk
+        vectors_config, sparse_vectors_config = self._prepare_collection_config(
+            embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
+        )
+        collection_params = self._prepare_collection_params()

-
-
+        self._initialize_client()
+        assert self._client is not None

-
-
+        if self._client.collection_exists(collection_name):
+            self._client.delete_collection(collection_name)

-
-
-        vectors_config
+        self._client.create_collection(
+            collection_name=collection_name,
+            vectors_config=vectors_config,
+            sparse_vectors_config=sparse_vectors_config,
+            **collection_params,
+        )

-
-
-
-
-
-
-
-
+    async def recreate_collection_async(
+        self,
+        collection_name: str,
+        distance,
+        embedding_dim: int,
+        on_disk: Optional[bool] = None,
+        use_sparse_embeddings: Optional[bool] = None,
+        sparse_idf: bool = False,
+    ):
+        """
+        Asynchronously recreates the Qdrant collection with the specified parameters.
+
+        :param collection_name:
+            The name of the collection to recreate.
+        :param distance:
+            The distance metric to use for the collection.
+        :param embedding_dim:
+            The dimension of the embeddings.
+        :param on_disk:
+            Whether to store the collection on disk.
+        :param use_sparse_embeddings:
+            Whether to use sparse embeddings.
+        :param sparse_idf:
+            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
+        """
+        vectors_config, sparse_vectors_config = self._prepare_collection_config(
+            embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
+        )
+        collection_params = self._prepare_collection_params()
+
+        await self._initialize_async_client()
+        assert self._async_client is not None

-        if self.
-            self.
+        if await self._async_client.collection_exists(collection_name):
+            await self._async_client.delete_collection(collection_name)

-        self.
+        await self._async_client.create_collection(
             collection_name=collection_name,
             vectors_config=vectors_config,
-            sparse_vectors_config=sparse_vectors_config
-
-            replication_factor=self.replication_factor,
-            write_consistency_factor=self.write_consistency_factor,
-            on_disk_payload=self.on_disk_payload,
-            hnsw_config=self.hnsw_config,
-            optimizers_config=self.optimizers_config,
-            wal_config=self.wal_config,
-            quantization_config=self.quantization_config,
-            init_from=self.init_from,
+            sparse_vectors_config=sparse_vectors_config,
+            **collection_params,
         )

     def _handle_duplicate_documents(
@@ -1014,12 +1472,38 @@ class QdrantDocumentStore:

         return documents

-    def
+    async def _handle_duplicate_documents_async(
+        self,
+        documents: List[Document],
+        policy: DuplicatePolicy = None,
+    ):
         """
-
+        Asynchronously checks whether any of the passed documents is already existing
+        in the chosen index and returns a list of
+        documents that are not in the index yet.

         :param documents: A list of Haystack Document objects.
+        :param policy: The duplicate policy to use when writing documents.
         :returns: A list of Haystack Document objects.
+        """
+
+        if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
+            documents = self._drop_duplicate_documents(documents)
+            documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
+            ids_exist_in_db: List[str] = [doc.id for doc in documents_found]
+
+            if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
+                msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
+                raise DuplicateDocumentError(msg)
+
+            documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
+
+        return documents
+
+    def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document]:
+        """
+        Drop duplicate documents based on same hash ID.
+
         """
         _hash_ids: Set = set()
         _documents: List[Document] = []
@@ -1027,12 +1511,202 @@ class QdrantDocumentStore:
         for document in documents:
             if document.id in _hash_ids:
                 logger.info(
-                    "Duplicate Documents: Document with id '
-                    document.id,
-                    self.index,
+                    "Duplicate Documents: Document with id '{document_id}' already exists in index '{index}'",
+                    document_id=document.id,
+                    index=self.index,
                 )
                 continue
             _documents.append(document)
             _hash_ids.add(document.id)

         return _documents
+
+    def _prepare_collection_params(self):
+        """
+        Prepares the common parameters for collection creation.
+        """
+        return {
+            "shard_number": self.shard_number,
+            "replication_factor": self.replication_factor,
+            "write_consistency_factor": self.write_consistency_factor,
+            "on_disk_payload": self.on_disk_payload,
+            "hnsw_config": self.hnsw_config,
+            "optimizers_config": self.optimizers_config,
+            "wal_config": self.wal_config,
+            "quantization_config": self.quantization_config,
+            "init_from": self.init_from,
+        }
+
+    def _prepare_client_params(self):
+        """
+        Prepares the common parameters for client initialization.
+
+        """
+        return {
+            "location": self.location,
+            "url": self.url,
+            "port": self.port,
+            "grpc_port": self.grpc_port,
+            "prefer_grpc": self.prefer_grpc,
+            "https": self.https,
+            "api_key": self.api_key.resolve_value() if self.api_key else None,
+            "prefix": self.prefix,
+            "timeout": self.timeout,
+            "host": self.host,
+            "path": self.path,
+            "metadata": self.metadata,
+            "force_disable_check_same_thread": self.force_disable_check_same_thread,
+        }
+
+    def _prepare_collection_config(
+        self,
+        embedding_dim: int,
+        distance,
+        on_disk: Optional[bool] = None,
+        use_sparse_embeddings: Optional[bool] = None,
+        sparse_idf: bool = False,
+    ):
+        """
+        Prepares the configuration for creating or recreating a Qdrant collection.
+
+        """
+        if on_disk is None:
+            on_disk = self.on_disk
+
+        if use_sparse_embeddings is None:
+            use_sparse_embeddings = self.use_sparse_embeddings
+
+        # dense vectors configuration
+        vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
+        sparse_vectors_config = None
+
+        if use_sparse_embeddings:
+            # in this case, we need to define named vectors
+            vectors_config = {DENSE_VECTORS_NAME: vectors_config}
+
+            sparse_vectors_config = {
+                SPARSE_VECTORS_NAME: rest.SparseVectorParams(
+                    index=rest.SparseIndexParams(
+                        on_disk=on_disk,
+                    ),
+                    modifier=rest.Modifier.IDF if sparse_idf else None,
+                ),
+            }
+
+        return vectors_config, sparse_vectors_config
+
+    def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None):
+        """
+        Validates the filters provided for querying.
+        """
+        if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
+            msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
+            raise ValueError(msg)
+
+        if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
+            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
+            raise ValueError(msg)
+
+    def _process_query_point_results(self, results, scale_score: bool = False):
+        """
+        Processes query results from Qdrant.
+        """
+        documents = [
+            convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+            for point in results
+        ]
+
+        if scale_score:
+            for document in documents:
+                score = document.score
+                if self.similarity == "cosine":
+                    score = (score + 1) / 2
+                else:
+                    score = float(1 / (1 + np.exp(-score / 100)))
+                document.score = score
+
+        return documents
+
+    def _process_group_results(self, groups):
+        """
+        Processes grouped query results from Qdrant.
+
+        """
+        if not groups:
+            return []
+
+        return [
+            convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+            for group in groups
+            for point in group.hits
+        ]
+
+    def _validate_collection_compatibility(
+        self,
+        collection_name: str,
+        collection_info,
+        distance,
+        embedding_dim: int,
+    ):
+        """
+        Validates that an existing collection is compatible with the current configuration.
+        """
+        has_named_vectors = isinstance(collection_info.config.params.vectors, dict)
+
+        if has_named_vectors and DENSE_VECTORS_NAME not in collection_info.config.params.vectors:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it has been originally created outside of Haystack and is not supported. "
+                f"If possible, you should create a new Document Store with Haystack. "
+                f"In case you want to migrate the existing collection, see an example script in "
+                f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
+                f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
+            )
+            raise QdrantStoreError(msg)
+
+        if self.use_sparse_embeddings and not has_named_vectors:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it has been originally created without sparse embedding vectors. "
+                f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
+                f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
+                f"See `migrate_to_sparse_embeddings_support` function in "
+                f"`haystack_integrations.document_stores.qdrant`."
+            )
+            raise QdrantStoreError(msg)
+
+        if not self.use_sparse_embeddings and has_named_vectors:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it has been originally created with sparse embedding vectors."
+                f"If you want to use that collection, please set `use_sparse_embeddings=True`."
+            )
+            raise QdrantStoreError(msg)
+
+        # Get current distance and vector size based on collection configuration
+        if self.use_sparse_embeddings:
+            current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance
+            current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
+        else:
+            current_distance = collection_info.config.params.vectors.distance
+            current_vector_size = collection_info.config.params.vectors.size
+
+        # Validate distance metric
+        if current_distance != distance:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it is configured with a similarity '{current_distance.name}'. "
+                f"If you want to use that collection, but with a different "
+                f"similarity, please set `recreate_collection=True` argument."
+            )
+            raise ValueError(msg)
+
+        # Validate embedding dimension
+        if current_vector_size != embedding_dim:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it is configured with a vector size '{current_vector_size}'. "
+                f"If you want to use that collection, but with a different "
+                f"vector size, please set `recreate_collection=True` argument."
+            )
+            raise ValueError(msg)