PyPI - qdrant-haystack - Versions diffs - 4.2.0__tar.gz → 5.1.0__tar.gz - Mend

qdrant-haystack 4.2.0.tar.gz → 5.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,17 @@
 # Changelog
+## [integrations/qdrant-v5.0.0] - 2024-09-02
+## [integrations/qdrant-v4.2.0] - 2024-08-27
+### 🚜 Refactor
+- Qdrant Query API (#1025)
+### 🧪 Testing
+- Do not retry tests in `hatch run test` command (#954)
 ## [integrations/qdrant-v4.1.2] - 2024-07-15
 ### 🐛 Bug Fixes

{qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: qdrant-haystack
-Version: 4.2.0
+Version: 5.1.0
 Summary: An integration of Qdrant ANN vector database backend with Haystack
 Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md

{qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/components/retrievers/qdrant/retriever.py RENAMED Viewed

@@ -44,13 +44,16 @@ class QdrantEmbeddingRetriever:
         return_embedding: bool = False,
         filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ):
         """
         Create a QdrantEmbeddingRetriever component.
         :param document_store: An instance of QdrantDocumentStore.
         :param filters: A dictionary with filters to narrow down the search space.
-        :param top_k: The maximum number of documents to retrieve.
+        :param top_k: The maximum number of documents to retrieve. If using `group_by` parameters, maximum number of
+             groups to return.
         :param scale_score: Whether to scale the scores of the retrieved documents or not.
         :param return_embedding: Whether to return the embedding of the retrieved Documents.
         :param filter_policy: Policy to determine how filters are applied.
@@ -58,6 +61,9 @@ class QdrantEmbeddingRetriever:
             Score of the returned result might be higher or smaller than the threshold
              depending on the `similarity` function specified in the Document Store.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
         """
@@ -75,6 +81,8 @@ class QdrantEmbeddingRetriever:
             filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
         )
         self._score_threshold = score_threshold
+        self._group_by = group_by
+        self._group_size = group_size
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -92,6 +100,8 @@ class QdrantEmbeddingRetriever:
             scale_score=self._scale_score,
             return_embedding=self._return_embedding,
             score_threshold=self._score_threshold,
+            group_by=self._group_by,
+            group_size=self._group_size,
         )
         d["init_parameters"]["document_store"] = self._document_store.to_dict()
@@ -124,16 +134,22 @@ class QdrantEmbeddingRetriever:
         scale_score: Optional[bool] = None,
         return_embedding: Optional[bool] = None,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ):
         """
         Run the Embedding Retriever on the given input data.
         :param query_embedding: Embedding of the query.
         :param filters: A dictionary with filters to narrow down the search space.
-        :param top_k: The maximum number of documents to return.
+        :param top_k: The maximum number of documents to return. If using `group_by` parameters, maximum number of
+             groups to return.
         :param scale_score: Whether to scale the scores of the retrieved documents or not.
         :param return_embedding: Whether to return the embedding of the retrieved Documents.
         :param score_threshold: A minimal score threshold for the result.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :returns:
             The retrieved documents.
@@ -147,6 +163,8 @@ class QdrantEmbeddingRetriever:
             scale_score=scale_score or self._scale_score,
             return_embedding=return_embedding or self._return_embedding,
             score_threshold=score_threshold or self._score_threshold,
+            group_by=group_by or self._group_by,
+            group_size=group_size or self._group_size,
         )
         return {"documents": docs}
@@ -188,13 +206,16 @@ class QdrantSparseEmbeddingRetriever:
         return_embedding: bool = False,
         filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ):
         """
         Create a QdrantSparseEmbeddingRetriever component.
         :param document_store: An instance of QdrantDocumentStore.
         :param filters: A dictionary with filters to narrow down the search space.
-        :param top_k: The maximum number of documents to retrieve.
+        :param top_k: The maximum number of documents to retrieve. If using `group_by` parameters, maximum number of
+             groups to return.
         :param scale_score: Whether to scale the scores of the retrieved documents or not.
         :param return_embedding: Whether to return the sparse embedding of the retrieved Documents.
         :param filter_policy: Policy to determine how filters are applied. Defaults to "replace".
@@ -202,6 +223,9 @@ class QdrantSparseEmbeddingRetriever:
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
         """
@@ -219,6 +243,8 @@ class QdrantSparseEmbeddingRetriever:
             filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
         )
         self._score_threshold = score_threshold
+        self._group_by = group_by
+        self._group_size = group_size
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -236,6 +262,8 @@ class QdrantSparseEmbeddingRetriever:
             filter_policy=self._filter_policy.value,
             return_embedding=self._return_embedding,
             score_threshold=self._score_threshold,
+            group_by=self._group_by,
+            group_size=self._group_size,
         )
         d["init_parameters"]["document_store"] = self._document_store.to_dict()
@@ -268,6 +296,8 @@ class QdrantSparseEmbeddingRetriever:
         scale_score: Optional[bool] = None,
         return_embedding: Optional[bool] = None,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ):
         """
         Run the Sparse Embedding Retriever on the given input data.
@@ -276,13 +306,17 @@ class QdrantSparseEmbeddingRetriever:
         :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
                         the `filter_policy` chosen at retriever initialization. See init method docstring for more
                         details.
-        :param top_k: The maximum number of documents to return.
+        :param top_k: The maximum number of documents to return. If using `group_by` parameters, maximum number of
+             groups to return.
         :param scale_score: Whether to scale the scores of the retrieved documents or not.
         :param return_embedding: Whether to return the embedding of the retrieved Documents.
         :param score_threshold: A minimal score threshold for the result.
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :returns:
             The retrieved documents.
@@ -296,6 +330,8 @@ class QdrantSparseEmbeddingRetriever:
             scale_score=scale_score or self._scale_score,
             return_embedding=return_embedding or self._return_embedding,
             score_threshold=score_threshold or self._score_threshold,
+            group_by=group_by or self._group_by,
+            group_size=group_size or self._group_size,
         )
         return {"documents": docs}
@@ -342,19 +378,25 @@ class QdrantHybridRetriever:
         return_embedding: bool = False,
         filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ):
         """
         Create a QdrantHybridRetriever component.
         :param document_store: An instance of QdrantDocumentStore.
         :param filters: A dictionary with filters to narrow down the search space.
-        :param top_k: The maximum number of documents to retrieve.
+        :param top_k: The maximum number of documents to retrieve. If using `group_by` parameters, maximum number of
+             groups to return.
         :param return_embedding: Whether to return the embeddings of the retrieved Documents.
         :param filter_policy: Policy to determine how filters are applied.
         :param score_threshold: A minimal score threshold for the result.
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+             value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
         """
@@ -371,6 +413,8 @@ class QdrantHybridRetriever:
             filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
         )
         self._score_threshold = score_threshold
+        self._group_by = group_by
+        self._group_size = group_size
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -387,6 +431,8 @@ class QdrantHybridRetriever:
             filter_policy=self._filter_policy.value,
             return_embedding=self._return_embedding,
             score_threshold=self._score_threshold,
+            group_by=self._group_by,
+            group_size=self._group_size,
         )
     @classmethod
@@ -416,6 +462,8 @@ class QdrantHybridRetriever:
         top_k: Optional[int] = None,
         return_embedding: Optional[bool] = None,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ):
         """
         Run the Sparse Embedding Retriever on the given input data.
@@ -425,12 +473,16 @@ class QdrantHybridRetriever:
         :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
                         the `filter_policy` chosen at retriever initialization. See init method docstring for more
                         details.
-        :param top_k: The maximum number of documents to return.
+        :param top_k: The maximum number of documents to return. If using `group_by` parameters, maximum number of
+             groups to return.
         :param return_embedding: Whether to return the embedding of the retrieved Documents.
         :param score_threshold: A minimal score threshold for the result.
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+             value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :returns:
             The retrieved documents.
@@ -444,6 +496,8 @@ class QdrantHybridRetriever:
             top_k=top_k or self._top_k,
             return_embedding=return_embedding or self._return_embedding,
             score_threshold=score_threshold or self._score_threshold,
+            group_by=group_by or self._group_by,
+            group_size=group_size or self._group_size,
         )
         return {"documents": docs}

{qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/document_stores/qdrant/document_store.py RENAMED Viewed

@@ -334,7 +334,7 @@ class QdrantDocumentStore:
         self,
         documents: List[Document],
         policy: DuplicatePolicy = DuplicatePolicy.FAIL,
-    ):
+    ) -> int:
         """
         Writes documents to Qdrant using the specified policy.
         The QdrantDocumentStore can handle duplicate documents based on the given policy.
@@ -358,7 +358,7 @@ class QdrantDocumentStore:
         if len(documents) == 0:
             logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
-            return
+            return 0
         document_objects = self._handle_duplicate_documents(
             documents=documents,
@@ -383,13 +383,13 @@ class QdrantDocumentStore:
                 progress_bar.update(self.write_batch_size)
         return len(document_objects)
-    def delete_documents(self, ids: List[str]):
+    def delete_documents(self, document_ids: List[str]) -> None:
         """
         Deletes documents that match the provided `document_ids` from the document store.
         :param document_ids: the document ids to delete
         """
-        ids = [convert_id(_id) for _id in ids]
+        ids = [convert_id(_id) for _id in document_ids]
         try:
             self.client.delete(
                 collection_name=self.index,
@@ -506,19 +506,25 @@ class QdrantDocumentStore:
         scale_score: bool = False,
         return_embedding: bool = False,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ) -> List[Document]:
         """
         Queries Qdrant using a sparse embedding and returns the most relevant documents.
         :param query_sparse_embedding: Sparse embedding of the query.
         :param filters: Filters applied to the retrieved documents.
-        :param top_k: Maximum number of documents to return.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+             groups to return.
         :param scale_score: Whether to scale the scores of the retrieved documents.
         :param return_embedding: Whether to return the embeddings of the retrieved documents.
         :param score_threshold: A minimal score threshold for the result.
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+             value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :returns: List of documents that are most similar to `query_sparse_embedding`.
@@ -536,22 +542,47 @@ class QdrantDocumentStore:
         qdrant_filters = convert_filters_to_qdrant(filters)
         query_indices = query_sparse_embedding.indices
         query_values = query_sparse_embedding.values
-        points = self.client.query_points(
-            collection_name=self.index,
-            query=rest.SparseVector(
-                indices=query_indices,
-                values=query_values,
-            ),
-            using=SPARSE_VECTORS_NAME,
-            query_filter=qdrant_filters,
-            limit=top_k,
-            with_vectors=return_embedding,
-            score_threshold=score_threshold,
-        ).points
-        results = [
-            convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-            for point in points
-        ]
+        if group_by:
+            groups = self.client.query_points_groups(
+                collection_name=self.index,
+                query=rest.SparseVector(
+                    indices=query_indices,
+                    values=query_values,
+                ),
+                using=SPARSE_VECTORS_NAME,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                group_by=group_by,
+                group_size=group_size,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            ).groups
+            results = (
+                [
+                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+                    for group in groups
+                    for point in group.hits
+                ]
+                if groups
+                else []
+            )
+        else:
+            points = self.client.query_points(
+                collection_name=self.index,
+                query=rest.SparseVector(
+                    indices=query_indices,
+                    values=query_values,
+                ),
+                using=SPARSE_VECTORS_NAME,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            ).points
+            results = [
+                convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+                for point in points
+            ]
         if scale_score:
             for document in results:
                 score = document.score
@@ -567,37 +598,65 @@ class QdrantDocumentStore:
         scale_score: bool = False,
         return_embedding: bool = False,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ) -> List[Document]:
         """
         Queries Qdrant using a dense embedding and returns the most relevant documents.
         :param query_embedding: Dense embedding of the query.
         :param filters: Filters applied to the retrieved documents.
-        :param top_k: Maximum number of documents to return.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+             groups to return.
         :param scale_score: Whether to scale the scores of the retrieved documents.
         :param return_embedding: Whether to return the embeddings of the retrieved documents.
         :param score_threshold: A minimal score threshold for the result.
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+             value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :returns: List of documents that are most similar to `query_embedding`.
         """
         qdrant_filters = convert_filters_to_qdrant(filters)
+        if group_by:
+            groups = self.client.query_points_groups(
+                collection_name=self.index,
+                query=query_embedding,
+                using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                group_by=group_by,
+                group_size=group_size,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            ).groups
+            results = (
+                [
+                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+                    for group in groups
+                    for point in group.hits
+                ]
+                if groups
+                else []
+            )
+        else:
+            points = self.client.query_points(
+                collection_name=self.index,
+                query=query_embedding,
+                using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            ).points
+            results = [
+                convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+                for point in points
+            ]
-        points = self.client.query_points(
-            collection_name=self.index,
-            query=query_embedding,
-            using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
-            query_filter=qdrant_filters,
-            limit=top_k,
-            with_vectors=return_embedding,
-            score_threshold=score_threshold,
-        ).points
-        results = [
-            convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-            for point in points
-        ]
         if scale_score:
             for document in results:
                 score = document.score
@@ -616,6 +675,8 @@ class QdrantDocumentStore:
         top_k: int = 10,
         return_embedding: bool = False,
         score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
     ) -> List[Document]:
         """
         Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
@@ -626,12 +687,16 @@ class QdrantDocumentStore:
         :param query_embedding: Dense embedding of the query.
         :param query_sparse_embedding: Sparse embedding of the query.
         :param filters: Filters applied to the retrieved documents.
-        :param top_k: Maximum number of documents to return.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+             groups to return.
         :param return_embedding: Whether to return the embeddings of the retrieved documents.
         :param score_threshold: A minimal score threshold for the result.
             Score of the returned result might be higher or smaller than the threshold
              depending on the Distance function used.
             E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+             value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
         :returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
@@ -651,34 +716,73 @@ class QdrantDocumentStore:
         qdrant_filters = convert_filters_to_qdrant(filters)
         try:
-            points = self.client.query_points(
-                collection_name=self.index,
-                prefetch=[
-                    rest.Prefetch(
-                        query=rest.SparseVector(
-                            indices=query_sparse_embedding.indices,
-                            values=query_sparse_embedding.values,
+            if group_by:
+                groups = self.client.query_points_groups(
+                    collection_name=self.index,
+                    prefetch=[
+                        rest.Prefetch(
+                            query=rest.SparseVector(
+                                indices=query_sparse_embedding.indices,
+                                values=query_sparse_embedding.values,
+                            ),
+                            using=SPARSE_VECTORS_NAME,
+                            filter=qdrant_filters,
                         ),
-                        using=SPARSE_VECTORS_NAME,
-                        filter=qdrant_filters,
-                    ),
-                    rest.Prefetch(
-                        query=query_embedding,
-                        using=DENSE_VECTORS_NAME,
-                        filter=qdrant_filters,
-                    ),
-                ],
-                query=rest.FusionQuery(fusion=rest.Fusion.RRF),
-                limit=top_k,
-                score_threshold=score_threshold,
-                with_payload=True,
-                with_vectors=return_embedding,
-            ).points
+                        rest.Prefetch(
+                            query=query_embedding,
+                            using=DENSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                    ],
+                    query=rest.FusionQuery(fusion=rest.Fusion.RRF),
+                    limit=top_k,
+                    group_by=group_by,
+                    group_size=group_size,
+                    score_threshold=score_threshold,
+                    with_payload=True,
+                    with_vectors=return_embedding,
+                ).groups
+            else:
+                points = self.client.query_points(
+                    collection_name=self.index,
+                    prefetch=[
+                        rest.Prefetch(
+                            query=rest.SparseVector(
+                                indices=query_sparse_embedding.indices,
+                                values=query_sparse_embedding.values,
+                            ),
+                            using=SPARSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                        rest.Prefetch(
+                            query=query_embedding,
+                            using=DENSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                    ],
+                    query=rest.FusionQuery(fusion=rest.Fusion.RRF),
+                    limit=top_k,
+                    score_threshold=score_threshold,
+                    with_payload=True,
+                    with_vectors=return_embedding,
+                ).points
         except Exception as e:
             msg = "Error during hybrid search"
             raise QdrantStoreError(msg) from e
-        results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
+        if group_by:
+            results = (
+                [
+                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+                    for group in groups
+                    for point in group.hits
+                ]
+                if groups
+                else []
+            )
+        else:
+            results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
         return results

{qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_document_store.py RENAMED Viewed

@@ -97,6 +97,39 @@ class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocu
             assert document.sparse_embedding
             assert document.embedding
+    def test_query_hybrid_with_group_by(self, generate_sparse_embedding):
+        document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
+        docs = []
+        for i in range(20):
+            docs.append(
+                Document(
+                    content=f"doc {i}",
+                    sparse_embedding=generate_sparse_embedding(),
+                    embedding=_random_embeddings(768),
+                    meta={"group_field": i // 2},
+                )
+            )
+        document_store.write_documents(docs)
+        sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
+        embedding = [0.1] * 768
+        results: List[Document] = document_store._query_hybrid(
+            query_sparse_embedding=sparse_embedding,
+            query_embedding=embedding,
+            top_k=3,
+            return_embedding=True,
+            group_by="meta.group_field",
+            group_size=2,
+        )
+        assert len(results) == 6
+        for document in results:
+            assert document.sparse_embedding
+            assert document.embedding
     def test_query_hybrid_fail_without_sparse_embedding(self, document_store):
         sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
         embedding = [0.1] * 768

{qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_retriever.py RENAMED Viewed

@@ -27,6 +27,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
         assert retriever._filter_policy == FilterPolicy.REPLACE
         assert retriever._return_embedding is False
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
         retriever = QdrantEmbeddingRetriever(document_store=document_store, filter_policy="replace")
         assert retriever._filter_policy == FilterPolicy.REPLACE
@@ -87,6 +89,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
                 "scale_score": False,
                 "return_embedding": False,
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
@@ -104,6 +108,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
                 "scale_score": False,
                 "return_embedding": True,
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
         retriever = QdrantEmbeddingRetriever.from_dict(data)
@@ -115,6 +121,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
         assert retriever._scale_score is False
         assert retriever._return_embedding is True
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
     def test_run(self, filterable_docs: List[Document]):
         document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=False)
@@ -200,6 +208,26 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
         for document in results:
             assert document.embedding is None
+    def test_run_with_group_by(self, filterable_docs: List[Document]):  # dense retriever run() with group_by/group_size
+        document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
+        # Replace each document's meta with a single group_field key used for grouping
+        for index, doc in enumerate(filterable_docs):
+            doc.meta = {"group_field": index // 2}  # index // 2 -> consecutive pairs of documents share a group value
+        document_store.write_documents(filterable_docs)
+        retriever = QdrantEmbeddingRetriever(document_store=document_store)
+        results = retriever.run(
+            query_embedding=_random_embeddings(768),
+            top_k=3,
+            return_embedding=False,
+            group_by="meta.group_field",
+            group_size=2,
+        )["documents"]
+        assert len(results) >= 3  # marked flaky by the author; only bounds are checked (presumably 3 groups, >= 1 doc each)
+        assert len(results) <= 6  # upper bound matches top_k=3 x group_size=2
+        for document in results:
+            assert document.embedding is None  # return_embedding=False was requested
 class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
     def test_init_default(self):
@@ -211,6 +239,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
         assert retriever._filter_policy == FilterPolicy.REPLACE
         assert retriever._return_embedding is False
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
         retriever = QdrantSparseEmbeddingRetriever(document_store=document_store, filter_policy="replace")
         assert retriever._filter_policy == FilterPolicy.REPLACE
@@ -271,6 +301,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
                 "return_embedding": False,
                 "filter_policy": "replace",
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
@@ -288,6 +320,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
                 "return_embedding": True,
                 "filter_policy": "replace",
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
         retriever = QdrantSparseEmbeddingRetriever.from_dict(data)
@@ -299,6 +333,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
         assert retriever._scale_score is False
         assert retriever._return_embedding is True
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
     def test_from_dict_no_filter_policy(self):
         data = {
@@ -313,6 +349,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
                 "scale_score": False,
                 "return_embedding": True,
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
         retriever = QdrantSparseEmbeddingRetriever.from_dict(data)
@@ -324,6 +362,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
         assert retriever._scale_score is False
         assert retriever._return_embedding is True
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
     def test_run(self, filterable_docs: List[Document], generate_sparse_embedding):
         document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
@@ -345,6 +385,29 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
         for document in results:
             assert document.sparse_embedding
+    def test_run_with_group_by(self, filterable_docs: List[Document], generate_sparse_embedding):  # sparse retriever run() with group_by
+        document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
+        # Give every document a generated sparse embedding and replace its meta with a group_field key
+        for index, doc in enumerate(filterable_docs):
+            doc.sparse_embedding = generate_sparse_embedding()
+            doc.meta = {"group_field": index // 2}  # index // 2 -> consecutive pairs of documents share a group value
+        document_store.write_documents(filterable_docs)
+        retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
+        sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
+        results = retriever.run(
+            query_sparse_embedding=sparse_embedding,
+            top_k=3,
+            return_embedding=True,
+            group_by="meta.group_field",
+            group_size=2,
+        )["documents"]
+        assert len(results) >= 3  # marked flaky by the author; only bounds are checked (presumably 3 groups, >= 1 doc each)
+        assert len(results) <= 6  # upper bound matches top_k=3 x group_size=2
+        for document in results:
+            assert document.sparse_embedding  # return_embedding=True: sparse vector must be populated
 class TestQdrantHybridRetriever:
     def test_init_default(self):
@@ -357,6 +420,8 @@ class TestQdrantHybridRetriever:
         assert retriever._filter_policy == FilterPolicy.REPLACE
         assert retriever._return_embedding is False
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
         retriever = QdrantHybridRetriever(document_store=document_store, filter_policy="replace")
         assert retriever._filter_policy == FilterPolicy.REPLACE
@@ -416,6 +481,8 @@ class TestQdrantHybridRetriever:
                 "filter_policy": "replace",
                 "return_embedding": True,
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
@@ -432,6 +499,8 @@ class TestQdrantHybridRetriever:
                 "filter_policy": "replace",
                 "return_embedding": True,
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
         retriever = QdrantHybridRetriever.from_dict(data)
@@ -442,6 +511,8 @@ class TestQdrantHybridRetriever:
         assert retriever._filter_policy == FilterPolicy.REPLACE
         assert retriever._return_embedding
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
     def test_from_dict_no_filter_policy(self):
         data = {
@@ -455,6 +526,8 @@ class TestQdrantHybridRetriever:
                 "top_k": 5,
                 "return_embedding": True,
                 "score_threshold": None,
+                "group_by": None,
+                "group_size": None,
             },
         }
         retriever = QdrantHybridRetriever.from_dict(data)
@@ -465,6 +538,8 @@ class TestQdrantHybridRetriever:
         assert retriever._filter_policy == FilterPolicy.REPLACE  # defaults to REPLACE
         assert retriever._return_embedding
         assert retriever._score_threshold is None
+        assert retriever._group_by is None
+        assert retriever._group_size is None
     def test_run(self):
         mock_store = Mock(spec=QdrantDocumentStore)
@@ -488,3 +563,31 @@ class TestQdrantHybridRetriever:
         assert res["documents"][0].content == "Test doc"
         assert res["documents"][0].embedding == [0.1, 0.2]
         assert res["documents"][0].sparse_embedding == sparse_embedding
+    def test_run_with_group_by(self):  # checks run() forwards group_by/group_size to the store's _query_hybrid
+        mock_store = Mock(spec=QdrantDocumentStore)  # no real Qdrant instance; only the call is inspected
+        sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
+        mock_store._query_hybrid.return_value = [
+            Document(content="Test doc", embedding=[0.1, 0.2], sparse_embedding=sparse_embedding)
+        ]
+        retriever = QdrantHybridRetriever(document_store=mock_store)
+        res = retriever.run(
+            query_embedding=[0.5, 0.7],
+            query_sparse_embedding=SparseEmbedding(indices=[0, 5], values=[0.1, 0.7]),
+            group_by="meta.group_field",
+            group_size=2,
+        )
+        call_args = mock_store._query_hybrid.call_args  # call_args[1] is the dict of keyword arguments of the last call
+        assert call_args[1]["query_embedding"] == [0.5, 0.7]
+        assert call_args[1]["query_sparse_embedding"].indices == [0, 5]
+        assert call_args[1]["query_sparse_embedding"].values == [0.1, 0.7]
+        assert call_args[1]["top_k"] == 10  # not passed to run(); presumably the retriever's default -- confirm
+        assert call_args[1]["return_embedding"] is False  # likewise not passed to run()
+        assert call_args[1]["group_by"] == "meta.group_field"
+        assert call_args[1]["group_size"] == 2
+        assert res["documents"][0].content == "Test doc"  # run() returns the mocked store result under "documents"
+        assert res["documents"][0].embedding == [0.1, 0.2]
+        assert res["documents"][0].sparse_embedding == sparse_embedding