qdrant-haystack 4.2.0__tar.gz → 5.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/CHANGELOG.md +12 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/PKG-INFO +1 -1
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/components/retrievers/qdrant/retriever.py +60 -6
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/document_stores/qdrant/document_store.py +163 -59
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_document_store.py +33 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_retriever.py +103 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/.gitignore +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/LICENSE.txt +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/README.md +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/examples/embedding_retrieval.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/pydoc/config.yml +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/pyproject.toml +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/components/retrievers/qdrant/__init__.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/document_stores/qdrant/converters.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/document_stores/qdrant/filters.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/__init__.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/conftest.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_converters.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_dict_converters.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_filters.py +0 -0
- {qdrant_haystack-4.2.0 → qdrant_haystack-5.1.0}/tests/test_legacy_filters.py +0 -0
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/qdrant-v5.0.0] - 2024-09-02
|
|
4
|
+
|
|
5
|
+
## [integrations/qdrant-v4.2.0] - 2024-08-27
|
|
6
|
+
|
|
7
|
+
### 🚜 Refactor
|
|
8
|
+
|
|
9
|
+
- Qdrant Query API (#1025)
|
|
10
|
+
|
|
11
|
+
### 🧪 Testing
|
|
12
|
+
|
|
13
|
+
- Do not retry tests in `hatch run test` command (#954)
|
|
14
|
+
|
|
3
15
|
## [integrations/qdrant-v4.1.2] - 2024-07-15
|
|
4
16
|
|
|
5
17
|
### 🐛 Bug Fixes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 4.2.0
|
|
3
|
+
Version: 5.1.0
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -44,13 +44,16 @@ class QdrantEmbeddingRetriever:
|
|
|
44
44
|
return_embedding: bool = False,
|
|
45
45
|
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
46
46
|
score_threshold: Optional[float] = None,
|
|
47
|
+
group_by: Optional[str] = None,
|
|
48
|
+
group_size: Optional[int] = None,
|
|
47
49
|
):
|
|
48
50
|
"""
|
|
49
51
|
Create a QdrantEmbeddingRetriever component.
|
|
50
52
|
|
|
51
53
|
:param document_store: An instance of QdrantDocumentStore.
|
|
52
54
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
53
|
-
:param top_k: The maximum number of documents to retrieve.
|
|
55
|
+
:param top_k: The maximum number of documents to retrieve. If using `group_by` parameters, maximum number of
|
|
56
|
+
groups to return.
|
|
54
57
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
55
58
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
56
59
|
:param filter_policy: Policy to determine how filters are applied.
|
|
@@ -58,6 +61,9 @@ class QdrantEmbeddingRetriever:
|
|
|
58
61
|
Score of the returned result might be higher or smaller than the threshold
|
|
59
62
|
depending on the `similarity` function specified in the Document Store.
|
|
60
63
|
E.g. for cosine similarity only higher scores will be returned.
|
|
64
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
65
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
66
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
61
67
|
|
|
62
68
|
:raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
|
|
63
69
|
"""
|
|
@@ -75,6 +81,8 @@ class QdrantEmbeddingRetriever:
|
|
|
75
81
|
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
|
|
76
82
|
)
|
|
77
83
|
self._score_threshold = score_threshold
|
|
84
|
+
self._group_by = group_by
|
|
85
|
+
self._group_size = group_size
|
|
78
86
|
|
|
79
87
|
def to_dict(self) -> Dict[str, Any]:
|
|
80
88
|
"""
|
|
@@ -92,6 +100,8 @@ class QdrantEmbeddingRetriever:
|
|
|
92
100
|
scale_score=self._scale_score,
|
|
93
101
|
return_embedding=self._return_embedding,
|
|
94
102
|
score_threshold=self._score_threshold,
|
|
103
|
+
group_by=self._group_by,
|
|
104
|
+
group_size=self._group_size,
|
|
95
105
|
)
|
|
96
106
|
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
97
107
|
|
|
@@ -124,16 +134,22 @@ class QdrantEmbeddingRetriever:
|
|
|
124
134
|
scale_score: Optional[bool] = None,
|
|
125
135
|
return_embedding: Optional[bool] = None,
|
|
126
136
|
score_threshold: Optional[float] = None,
|
|
137
|
+
group_by: Optional[str] = None,
|
|
138
|
+
group_size: Optional[int] = None,
|
|
127
139
|
):
|
|
128
140
|
"""
|
|
129
141
|
Run the Embedding Retriever on the given input data.
|
|
130
142
|
|
|
131
143
|
:param query_embedding: Embedding of the query.
|
|
132
144
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
133
|
-
:param top_k: The maximum number of documents to return.
|
|
145
|
+
:param top_k: The maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
146
|
+
groups to return.
|
|
134
147
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
135
148
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
136
149
|
:param score_threshold: A minimal score threshold for the result.
|
|
150
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
151
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
152
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
137
153
|
:returns:
|
|
138
154
|
The retrieved documents.
|
|
139
155
|
|
|
@@ -147,6 +163,8 @@ class QdrantEmbeddingRetriever:
|
|
|
147
163
|
scale_score=scale_score or self._scale_score,
|
|
148
164
|
return_embedding=return_embedding or self._return_embedding,
|
|
149
165
|
score_threshold=score_threshold or self._score_threshold,
|
|
166
|
+
group_by=group_by or self._group_by,
|
|
167
|
+
group_size=group_size or self._group_size,
|
|
150
168
|
)
|
|
151
169
|
|
|
152
170
|
return {"documents": docs}
|
|
@@ -188,13 +206,16 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
188
206
|
return_embedding: bool = False,
|
|
189
207
|
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
190
208
|
score_threshold: Optional[float] = None,
|
|
209
|
+
group_by: Optional[str] = None,
|
|
210
|
+
group_size: Optional[int] = None,
|
|
191
211
|
):
|
|
192
212
|
"""
|
|
193
213
|
Create a QdrantSparseEmbeddingRetriever component.
|
|
194
214
|
|
|
195
215
|
:param document_store: An instance of QdrantDocumentStore.
|
|
196
216
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
197
|
-
:param top_k: The maximum number of documents to retrieve.
|
|
217
|
+
:param top_k: The maximum number of documents to retrieve. If using `group_by` parameters, maximum number of
|
|
218
|
+
groups to return.
|
|
198
219
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
199
220
|
:param return_embedding: Whether to return the sparse embedding of the retrieved Documents.
|
|
200
221
|
:param filter_policy: Policy to determine how filters are applied. Defaults to "replace".
|
|
@@ -202,6 +223,9 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
202
223
|
Score of the returned result might be higher or smaller than the threshold
|
|
203
224
|
depending on the Distance function used.
|
|
204
225
|
E.g. for cosine similarity only higher scores will be returned.
|
|
226
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
227
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
228
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
205
229
|
|
|
206
230
|
:raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
|
|
207
231
|
"""
|
|
@@ -219,6 +243,8 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
219
243
|
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
|
|
220
244
|
)
|
|
221
245
|
self._score_threshold = score_threshold
|
|
246
|
+
self._group_by = group_by
|
|
247
|
+
self._group_size = group_size
|
|
222
248
|
|
|
223
249
|
def to_dict(self) -> Dict[str, Any]:
|
|
224
250
|
"""
|
|
@@ -236,6 +262,8 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
236
262
|
filter_policy=self._filter_policy.value,
|
|
237
263
|
return_embedding=self._return_embedding,
|
|
238
264
|
score_threshold=self._score_threshold,
|
|
265
|
+
group_by=self._group_by,
|
|
266
|
+
group_size=self._group_size,
|
|
239
267
|
)
|
|
240
268
|
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
241
269
|
|
|
@@ -268,6 +296,8 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
268
296
|
scale_score: Optional[bool] = None,
|
|
269
297
|
return_embedding: Optional[bool] = None,
|
|
270
298
|
score_threshold: Optional[float] = None,
|
|
299
|
+
group_by: Optional[str] = None,
|
|
300
|
+
group_size: Optional[int] = None,
|
|
271
301
|
):
|
|
272
302
|
"""
|
|
273
303
|
Run the Sparse Embedding Retriever on the given input data.
|
|
@@ -276,13 +306,17 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
276
306
|
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
277
307
|
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
278
308
|
details.
|
|
279
|
-
:param top_k: The maximum number of documents to return.
|
|
309
|
+
:param top_k: The maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
310
|
+
groups to return.
|
|
280
311
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
281
312
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
282
313
|
:param score_threshold: A minimal score threshold for the result.
|
|
283
314
|
Score of the returned result might be higher or smaller than the threshold
|
|
284
315
|
depending on the Distance function used.
|
|
285
316
|
E.g. for cosine similarity only higher scores will be returned.
|
|
317
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
318
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
319
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
286
320
|
:returns:
|
|
287
321
|
The retrieved documents.
|
|
288
322
|
|
|
@@ -296,6 +330,8 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
296
330
|
scale_score=scale_score or self._scale_score,
|
|
297
331
|
return_embedding=return_embedding or self._return_embedding,
|
|
298
332
|
score_threshold=score_threshold or self._score_threshold,
|
|
333
|
+
group_by=group_by or self._group_by,
|
|
334
|
+
group_size=group_size or self._group_size,
|
|
299
335
|
)
|
|
300
336
|
|
|
301
337
|
return {"documents": docs}
|
|
@@ -342,19 +378,25 @@ class QdrantHybridRetriever:
|
|
|
342
378
|
return_embedding: bool = False,
|
|
343
379
|
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
344
380
|
score_threshold: Optional[float] = None,
|
|
381
|
+
group_by: Optional[str] = None,
|
|
382
|
+
group_size: Optional[int] = None,
|
|
345
383
|
):
|
|
346
384
|
"""
|
|
347
385
|
Create a QdrantHybridRetriever component.
|
|
348
386
|
|
|
349
387
|
:param document_store: An instance of QdrantDocumentStore.
|
|
350
388
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
351
|
-
:param top_k: The maximum number of documents to retrieve.
|
|
389
|
+
:param top_k: The maximum number of documents to retrieve. If using `group_by` parameters, maximum number of
|
|
390
|
+
groups to return.
|
|
352
391
|
:param return_embedding: Whether to return the embeddings of the retrieved Documents.
|
|
353
392
|
:param filter_policy: Policy to determine how filters are applied.
|
|
354
393
|
:param score_threshold: A minimal score threshold for the result.
|
|
355
394
|
Score of the returned result might be higher or smaller than the threshold
|
|
356
395
|
depending on the Distance function used.
|
|
357
396
|
E.g. for cosine similarity only higher scores will be returned.
|
|
397
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
398
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
399
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
358
400
|
|
|
359
401
|
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
|
|
360
402
|
"""
|
|
@@ -371,6 +413,8 @@ class QdrantHybridRetriever:
|
|
|
371
413
|
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
|
|
372
414
|
)
|
|
373
415
|
self._score_threshold = score_threshold
|
|
416
|
+
self._group_by = group_by
|
|
417
|
+
self._group_size = group_size
|
|
374
418
|
|
|
375
419
|
def to_dict(self) -> Dict[str, Any]:
|
|
376
420
|
"""
|
|
@@ -387,6 +431,8 @@ class QdrantHybridRetriever:
|
|
|
387
431
|
filter_policy=self._filter_policy.value,
|
|
388
432
|
return_embedding=self._return_embedding,
|
|
389
433
|
score_threshold=self._score_threshold,
|
|
434
|
+
group_by=self._group_by,
|
|
435
|
+
group_size=self._group_size,
|
|
390
436
|
)
|
|
391
437
|
|
|
392
438
|
@classmethod
|
|
@@ -416,6 +462,8 @@ class QdrantHybridRetriever:
|
|
|
416
462
|
top_k: Optional[int] = None,
|
|
417
463
|
return_embedding: Optional[bool] = None,
|
|
418
464
|
score_threshold: Optional[float] = None,
|
|
465
|
+
group_by: Optional[str] = None,
|
|
466
|
+
group_size: Optional[int] = None,
|
|
419
467
|
):
|
|
420
468
|
"""
|
|
421
469
|
Run the Sparse Embedding Retriever on the given input data.
|
|
@@ -425,12 +473,16 @@ class QdrantHybridRetriever:
|
|
|
425
473
|
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
426
474
|
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
427
475
|
details.
|
|
428
|
-
:param top_k: The maximum number of documents to return.
|
|
476
|
+
:param top_k: The maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
477
|
+
groups to return.
|
|
429
478
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
430
479
|
:param score_threshold: A minimal score threshold for the result.
|
|
431
480
|
Score of the returned result might be higher or smaller than the threshold
|
|
432
481
|
depending on the Distance function used.
|
|
433
482
|
E.g. for cosine similarity only higher scores will be returned.
|
|
483
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
484
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
485
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
434
486
|
:returns:
|
|
435
487
|
The retrieved documents.
|
|
436
488
|
|
|
@@ -444,6 +496,8 @@ class QdrantHybridRetriever:
|
|
|
444
496
|
top_k=top_k or self._top_k,
|
|
445
497
|
return_embedding=return_embedding or self._return_embedding,
|
|
446
498
|
score_threshold=score_threshold or self._score_threshold,
|
|
499
|
+
group_by=group_by or self._group_by,
|
|
500
|
+
group_size=group_size or self._group_size,
|
|
447
501
|
)
|
|
448
502
|
|
|
449
503
|
return {"documents": docs}
|
|
@@ -334,7 +334,7 @@ class QdrantDocumentStore:
|
|
|
334
334
|
self,
|
|
335
335
|
documents: List[Document],
|
|
336
336
|
policy: DuplicatePolicy = DuplicatePolicy.FAIL,
|
|
337
|
-
):
|
|
337
|
+
) -> int:
|
|
338
338
|
"""
|
|
339
339
|
Writes documents to Qdrant using the specified policy.
|
|
340
340
|
The QdrantDocumentStore can handle duplicate documents based on the given policy.
|
|
@@ -358,7 +358,7 @@ class QdrantDocumentStore:
|
|
|
358
358
|
|
|
359
359
|
if len(documents) == 0:
|
|
360
360
|
logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
|
|
361
|
-
return
|
|
361
|
+
return 0
|
|
362
362
|
|
|
363
363
|
document_objects = self._handle_duplicate_documents(
|
|
364
364
|
documents=documents,
|
|
@@ -383,13 +383,13 @@ class QdrantDocumentStore:
|
|
|
383
383
|
progress_bar.update(self.write_batch_size)
|
|
384
384
|
return len(document_objects)
|
|
385
385
|
|
|
386
|
-
def delete_documents(self,
|
|
386
|
+
def delete_documents(self, document_ids: List[str]) -> None:
|
|
387
387
|
"""
|
|
388
388
|
Deletes documents that match the provided `document_ids` from the document store.
|
|
389
389
|
|
|
390
390
|
:param document_ids: the document ids to delete
|
|
391
391
|
"""
|
|
392
|
-
ids = [convert_id(_id) for _id in
|
|
392
|
+
ids = [convert_id(_id) for _id in document_ids]
|
|
393
393
|
try:
|
|
394
394
|
self.client.delete(
|
|
395
395
|
collection_name=self.index,
|
|
@@ -506,19 +506,25 @@ class QdrantDocumentStore:
|
|
|
506
506
|
scale_score: bool = False,
|
|
507
507
|
return_embedding: bool = False,
|
|
508
508
|
score_threshold: Optional[float] = None,
|
|
509
|
+
group_by: Optional[str] = None,
|
|
510
|
+
group_size: Optional[int] = None,
|
|
509
511
|
) -> List[Document]:
|
|
510
512
|
"""
|
|
511
513
|
Queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
512
514
|
|
|
513
515
|
:param query_sparse_embedding: Sparse embedding of the query.
|
|
514
516
|
:param filters: Filters applied to the retrieved documents.
|
|
515
|
-
:param top_k: Maximum number of documents to return.
|
|
517
|
+
:param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
518
|
+
groups to return.
|
|
516
519
|
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
517
520
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
518
521
|
:param score_threshold: A minimal score threshold for the result.
|
|
519
522
|
Score of the returned result might be higher or smaller than the threshold
|
|
520
523
|
depending on the Distance function used.
|
|
521
524
|
E.g. for cosine similarity only higher scores will be returned.
|
|
525
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
526
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
527
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
522
528
|
|
|
523
529
|
:returns: List of documents that are most similar to `query_sparse_embedding`.
|
|
524
530
|
|
|
@@ -536,22 +542,47 @@ class QdrantDocumentStore:
|
|
|
536
542
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
537
543
|
query_indices = query_sparse_embedding.indices
|
|
538
544
|
query_values = query_sparse_embedding.values
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
545
|
+
if group_by:
|
|
546
|
+
groups = self.client.query_points_groups(
|
|
547
|
+
collection_name=self.index,
|
|
548
|
+
query=rest.SparseVector(
|
|
549
|
+
indices=query_indices,
|
|
550
|
+
values=query_values,
|
|
551
|
+
),
|
|
552
|
+
using=SPARSE_VECTORS_NAME,
|
|
553
|
+
query_filter=qdrant_filters,
|
|
554
|
+
limit=top_k,
|
|
555
|
+
group_by=group_by,
|
|
556
|
+
group_size=group_size,
|
|
557
|
+
with_vectors=return_embedding,
|
|
558
|
+
score_threshold=score_threshold,
|
|
559
|
+
).groups
|
|
560
|
+
results = (
|
|
561
|
+
[
|
|
562
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
563
|
+
for group in groups
|
|
564
|
+
for point in group.hits
|
|
565
|
+
]
|
|
566
|
+
if groups
|
|
567
|
+
else []
|
|
568
|
+
)
|
|
569
|
+
else:
|
|
570
|
+
points = self.client.query_points(
|
|
571
|
+
collection_name=self.index,
|
|
572
|
+
query=rest.SparseVector(
|
|
573
|
+
indices=query_indices,
|
|
574
|
+
values=query_values,
|
|
575
|
+
),
|
|
576
|
+
using=SPARSE_VECTORS_NAME,
|
|
577
|
+
query_filter=qdrant_filters,
|
|
578
|
+
limit=top_k,
|
|
579
|
+
with_vectors=return_embedding,
|
|
580
|
+
score_threshold=score_threshold,
|
|
581
|
+
).points
|
|
582
|
+
results = [
|
|
583
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
584
|
+
for point in points
|
|
585
|
+
]
|
|
555
586
|
if scale_score:
|
|
556
587
|
for document in results:
|
|
557
588
|
score = document.score
|
|
@@ -567,37 +598,65 @@ class QdrantDocumentStore:
|
|
|
567
598
|
scale_score: bool = False,
|
|
568
599
|
return_embedding: bool = False,
|
|
569
600
|
score_threshold: Optional[float] = None,
|
|
601
|
+
group_by: Optional[str] = None,
|
|
602
|
+
group_size: Optional[int] = None,
|
|
570
603
|
) -> List[Document]:
|
|
571
604
|
"""
|
|
572
605
|
Queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
573
606
|
|
|
574
607
|
:param query_embedding: Dense embedding of the query.
|
|
575
608
|
:param filters: Filters applied to the retrieved documents.
|
|
576
|
-
:param top_k: Maximum number of documents to return.
|
|
609
|
+
:param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
610
|
+
groups to return.
|
|
577
611
|
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
578
612
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
579
613
|
:param score_threshold: A minimal score threshold for the result.
|
|
580
614
|
Score of the returned result might be higher or smaller than the threshold
|
|
581
615
|
depending on the Distance function used.
|
|
582
616
|
E.g. for cosine similarity only higher scores will be returned.
|
|
617
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
618
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
619
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
583
620
|
|
|
584
621
|
:returns: List of documents that are most similar to `query_embedding`.
|
|
585
622
|
"""
|
|
586
623
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
624
|
+
if group_by:
|
|
625
|
+
groups = self.client.query_points_groups(
|
|
626
|
+
collection_name=self.index,
|
|
627
|
+
query=query_embedding,
|
|
628
|
+
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
629
|
+
query_filter=qdrant_filters,
|
|
630
|
+
limit=top_k,
|
|
631
|
+
group_by=group_by,
|
|
632
|
+
group_size=group_size,
|
|
633
|
+
with_vectors=return_embedding,
|
|
634
|
+
score_threshold=score_threshold,
|
|
635
|
+
).groups
|
|
636
|
+
results = (
|
|
637
|
+
[
|
|
638
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
639
|
+
for group in groups
|
|
640
|
+
for point in group.hits
|
|
641
|
+
]
|
|
642
|
+
if groups
|
|
643
|
+
else []
|
|
644
|
+
)
|
|
645
|
+
else:
|
|
646
|
+
points = self.client.query_points(
|
|
647
|
+
collection_name=self.index,
|
|
648
|
+
query=query_embedding,
|
|
649
|
+
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
650
|
+
query_filter=qdrant_filters,
|
|
651
|
+
limit=top_k,
|
|
652
|
+
with_vectors=return_embedding,
|
|
653
|
+
score_threshold=score_threshold,
|
|
654
|
+
).points
|
|
655
|
+
results = [
|
|
656
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
657
|
+
for point in points
|
|
658
|
+
]
|
|
587
659
|
|
|
588
|
-
points = self.client.query_points(
|
|
589
|
-
collection_name=self.index,
|
|
590
|
-
query=query_embedding,
|
|
591
|
-
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
592
|
-
query_filter=qdrant_filters,
|
|
593
|
-
limit=top_k,
|
|
594
|
-
with_vectors=return_embedding,
|
|
595
|
-
score_threshold=score_threshold,
|
|
596
|
-
).points
|
|
597
|
-
results = [
|
|
598
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
599
|
-
for point in points
|
|
600
|
-
]
|
|
601
660
|
if scale_score:
|
|
602
661
|
for document in results:
|
|
603
662
|
score = document.score
|
|
@@ -616,6 +675,8 @@ class QdrantDocumentStore:
|
|
|
616
675
|
top_k: int = 10,
|
|
617
676
|
return_embedding: bool = False,
|
|
618
677
|
score_threshold: Optional[float] = None,
|
|
678
|
+
group_by: Optional[str] = None,
|
|
679
|
+
group_size: Optional[int] = None,
|
|
619
680
|
) -> List[Document]:
|
|
620
681
|
"""
|
|
621
682
|
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
@@ -626,12 +687,16 @@ class QdrantDocumentStore:
|
|
|
626
687
|
:param query_embedding: Dense embedding of the query.
|
|
627
688
|
:param query_sparse_embedding: Sparse embedding of the query.
|
|
628
689
|
:param filters: Filters applied to the retrieved documents.
|
|
629
|
-
:param top_k: Maximum number of documents to return.
|
|
690
|
+
:param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
691
|
+
groups to return.
|
|
630
692
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
631
693
|
:param score_threshold: A minimal score threshold for the result.
|
|
632
694
|
Score of the returned result might be higher or smaller than the threshold
|
|
633
695
|
depending on the Distance function used.
|
|
634
696
|
E.g. for cosine similarity only higher scores will be returned.
|
|
697
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
698
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
699
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
635
700
|
|
|
636
701
|
:returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
|
|
637
702
|
|
|
@@ -651,34 +716,73 @@ class QdrantDocumentStore:
|
|
|
651
716
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
652
717
|
|
|
653
718
|
try:
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
719
|
+
if group_by:
|
|
720
|
+
groups = self.client.query_points_groups(
|
|
721
|
+
collection_name=self.index,
|
|
722
|
+
prefetch=[
|
|
723
|
+
rest.Prefetch(
|
|
724
|
+
query=rest.SparseVector(
|
|
725
|
+
indices=query_sparse_embedding.indices,
|
|
726
|
+
values=query_sparse_embedding.values,
|
|
727
|
+
),
|
|
728
|
+
using=SPARSE_VECTORS_NAME,
|
|
729
|
+
filter=qdrant_filters,
|
|
661
730
|
),
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
731
|
+
rest.Prefetch(
|
|
732
|
+
query=query_embedding,
|
|
733
|
+
using=DENSE_VECTORS_NAME,
|
|
734
|
+
filter=qdrant_filters,
|
|
735
|
+
),
|
|
736
|
+
],
|
|
737
|
+
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
738
|
+
limit=top_k,
|
|
739
|
+
group_by=group_by,
|
|
740
|
+
group_size=group_size,
|
|
741
|
+
score_threshold=score_threshold,
|
|
742
|
+
with_payload=True,
|
|
743
|
+
with_vectors=return_embedding,
|
|
744
|
+
).groups
|
|
745
|
+
else:
|
|
746
|
+
points = self.client.query_points(
|
|
747
|
+
collection_name=self.index,
|
|
748
|
+
prefetch=[
|
|
749
|
+
rest.Prefetch(
|
|
750
|
+
query=rest.SparseVector(
|
|
751
|
+
indices=query_sparse_embedding.indices,
|
|
752
|
+
values=query_sparse_embedding.values,
|
|
753
|
+
),
|
|
754
|
+
using=SPARSE_VECTORS_NAME,
|
|
755
|
+
filter=qdrant_filters,
|
|
756
|
+
),
|
|
757
|
+
rest.Prefetch(
|
|
758
|
+
query=query_embedding,
|
|
759
|
+
using=DENSE_VECTORS_NAME,
|
|
760
|
+
filter=qdrant_filters,
|
|
761
|
+
),
|
|
762
|
+
],
|
|
763
|
+
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
764
|
+
limit=top_k,
|
|
765
|
+
score_threshold=score_threshold,
|
|
766
|
+
with_payload=True,
|
|
767
|
+
with_vectors=return_embedding,
|
|
768
|
+
).points
|
|
769
|
+
|
|
677
770
|
except Exception as e:
|
|
678
771
|
msg = "Error during hybrid search"
|
|
679
772
|
raise QdrantStoreError(msg) from e
|
|
680
773
|
|
|
681
|
-
|
|
774
|
+
if group_by:
|
|
775
|
+
results = (
|
|
776
|
+
[
|
|
777
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
778
|
+
for group in groups
|
|
779
|
+
for point in group.hits
|
|
780
|
+
]
|
|
781
|
+
if groups
|
|
782
|
+
else []
|
|
783
|
+
)
|
|
784
|
+
else:
|
|
785
|
+
results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
|
|
682
786
|
|
|
683
787
|
return results
|
|
684
788
|
|
|
@@ -97,6 +97,39 @@ class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocu
|
|
|
97
97
|
assert document.sparse_embedding
|
|
98
98
|
assert document.embedding
|
|
99
99
|
|
|
100
|
+
def test_query_hybrid_with_group_by(self, generate_sparse_embedding):
|
|
101
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
102
|
+
|
|
103
|
+
docs = []
|
|
104
|
+
for i in range(20):
|
|
105
|
+
docs.append(
|
|
106
|
+
Document(
|
|
107
|
+
content=f"doc {i}",
|
|
108
|
+
sparse_embedding=generate_sparse_embedding(),
|
|
109
|
+
embedding=_random_embeddings(768),
|
|
110
|
+
meta={"group_field": i // 2},
|
|
111
|
+
)
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
document_store.write_documents(docs)
|
|
115
|
+
|
|
116
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
117
|
+
embedding = [0.1] * 768
|
|
118
|
+
|
|
119
|
+
results: List[Document] = document_store._query_hybrid(
|
|
120
|
+
query_sparse_embedding=sparse_embedding,
|
|
121
|
+
query_embedding=embedding,
|
|
122
|
+
top_k=3,
|
|
123
|
+
return_embedding=True,
|
|
124
|
+
group_by="meta.group_field",
|
|
125
|
+
group_size=2,
|
|
126
|
+
)
|
|
127
|
+
assert len(results) == 6
|
|
128
|
+
|
|
129
|
+
for document in results:
|
|
130
|
+
assert document.sparse_embedding
|
|
131
|
+
assert document.embedding
|
|
132
|
+
|
|
100
133
|
def test_query_hybrid_fail_without_sparse_embedding(self, document_store):
|
|
101
134
|
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
102
135
|
embedding = [0.1] * 768
|
|
@@ -27,6 +27,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
27
27
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
28
28
|
assert retriever._return_embedding is False
|
|
29
29
|
assert retriever._score_threshold is None
|
|
30
|
+
assert retriever._group_by is None
|
|
31
|
+
assert retriever._group_size is None
|
|
30
32
|
|
|
31
33
|
retriever = QdrantEmbeddingRetriever(document_store=document_store, filter_policy="replace")
|
|
32
34
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
@@ -87,6 +89,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
87
89
|
"scale_score": False,
|
|
88
90
|
"return_embedding": False,
|
|
89
91
|
"score_threshold": None,
|
|
92
|
+
"group_by": None,
|
|
93
|
+
"group_size": None,
|
|
90
94
|
},
|
|
91
95
|
}
|
|
92
96
|
|
|
@@ -104,6 +108,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
104
108
|
"scale_score": False,
|
|
105
109
|
"return_embedding": True,
|
|
106
110
|
"score_threshold": None,
|
|
111
|
+
"group_by": None,
|
|
112
|
+
"group_size": None,
|
|
107
113
|
},
|
|
108
114
|
}
|
|
109
115
|
retriever = QdrantEmbeddingRetriever.from_dict(data)
|
|
@@ -115,6 +121,8 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
115
121
|
assert retriever._scale_score is False
|
|
116
122
|
assert retriever._return_embedding is True
|
|
117
123
|
assert retriever._score_threshold is None
|
|
124
|
+
assert retriever._group_by is None
|
|
125
|
+
assert retriever._group_size is None
|
|
118
126
|
|
|
119
127
|
def test_run(self, filterable_docs: List[Document]):
|
|
120
128
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=False)
|
|
@@ -200,6 +208,26 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
200
208
|
for document in results:
|
|
201
209
|
assert document.embedding is None
|
|
202
210
|
|
|
211
|
+
def test_run_with_group_by(self, filterable_docs: List[Document]):
|
|
212
|
+
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
213
|
+
# Add group_field metadata to documents
|
|
214
|
+
for index, doc in enumerate(filterable_docs):
|
|
215
|
+
doc.meta = {"group_field": index // 2} # So at least two docs have same group each time
|
|
216
|
+
document_store.write_documents(filterable_docs)
|
|
217
|
+
|
|
218
|
+
retriever = QdrantEmbeddingRetriever(document_store=document_store)
|
|
219
|
+
results = retriever.run(
|
|
220
|
+
query_embedding=_random_embeddings(768),
|
|
221
|
+
top_k=3,
|
|
222
|
+
return_embedding=False,
|
|
223
|
+
group_by="meta.group_field",
|
|
224
|
+
group_size=2,
|
|
225
|
+
)["documents"]
|
|
226
|
+
assert len(results) >= 3 # This test is Flaky
|
|
227
|
+
assert len(results) <= 6 # This test is Flaky
|
|
228
|
+
for document in results:
|
|
229
|
+
assert document.embedding is None
|
|
230
|
+
|
|
203
231
|
|
|
204
232
|
class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
205
233
|
def test_init_default(self):
|
|
@@ -211,6 +239,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
211
239
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
212
240
|
assert retriever._return_embedding is False
|
|
213
241
|
assert retriever._score_threshold is None
|
|
242
|
+
assert retriever._group_by is None
|
|
243
|
+
assert retriever._group_size is None
|
|
214
244
|
|
|
215
245
|
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store, filter_policy="replace")
|
|
216
246
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
@@ -271,6 +301,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
271
301
|
"return_embedding": False,
|
|
272
302
|
"filter_policy": "replace",
|
|
273
303
|
"score_threshold": None,
|
|
304
|
+
"group_by": None,
|
|
305
|
+
"group_size": None,
|
|
274
306
|
},
|
|
275
307
|
}
|
|
276
308
|
|
|
@@ -288,6 +320,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
288
320
|
"return_embedding": True,
|
|
289
321
|
"filter_policy": "replace",
|
|
290
322
|
"score_threshold": None,
|
|
323
|
+
"group_by": None,
|
|
324
|
+
"group_size": None,
|
|
291
325
|
},
|
|
292
326
|
}
|
|
293
327
|
retriever = QdrantSparseEmbeddingRetriever.from_dict(data)
|
|
@@ -299,6 +333,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
299
333
|
assert retriever._scale_score is False
|
|
300
334
|
assert retriever._return_embedding is True
|
|
301
335
|
assert retriever._score_threshold is None
|
|
336
|
+
assert retriever._group_by is None
|
|
337
|
+
assert retriever._group_size is None
|
|
302
338
|
|
|
303
339
|
def test_from_dict_no_filter_policy(self):
|
|
304
340
|
data = {
|
|
@@ -313,6 +349,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
313
349
|
"scale_score": False,
|
|
314
350
|
"return_embedding": True,
|
|
315
351
|
"score_threshold": None,
|
|
352
|
+
"group_by": None,
|
|
353
|
+
"group_size": None,
|
|
316
354
|
},
|
|
317
355
|
}
|
|
318
356
|
retriever = QdrantSparseEmbeddingRetriever.from_dict(data)
|
|
@@ -324,6 +362,8 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
324
362
|
assert retriever._scale_score is False
|
|
325
363
|
assert retriever._return_embedding is True
|
|
326
364
|
assert retriever._score_threshold is None
|
|
365
|
+
assert retriever._group_by is None
|
|
366
|
+
assert retriever._group_size is None
|
|
327
367
|
|
|
328
368
|
def test_run(self, filterable_docs: List[Document], generate_sparse_embedding):
|
|
329
369
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
@@ -345,6 +385,29 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
345
385
|
for document in results:
|
|
346
386
|
assert document.sparse_embedding
|
|
347
387
|
|
|
388
|
+
def test_run_with_group_by(self, filterable_docs: List[Document], generate_sparse_embedding):
|
|
389
|
+
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
390
|
+
|
|
391
|
+
# Add fake sparse embedding to documents
|
|
392
|
+
for index, doc in enumerate(filterable_docs):
|
|
393
|
+
doc.sparse_embedding = generate_sparse_embedding()
|
|
394
|
+
doc.meta = {"group_field": index // 2} # So at least two docs have same group each time
|
|
395
|
+
document_store.write_documents(filterable_docs)
|
|
396
|
+
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
397
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
398
|
+
results = retriever.run(
|
|
399
|
+
query_sparse_embedding=sparse_embedding,
|
|
400
|
+
top_k=3,
|
|
401
|
+
return_embedding=True,
|
|
402
|
+
group_by="meta.group_field",
|
|
403
|
+
group_size=2,
|
|
404
|
+
)["documents"]
|
|
405
|
+
assert len(results) >= 3 # This test is Flaky
|
|
406
|
+
assert len(results) <= 6 # This test is Flaky
|
|
407
|
+
|
|
408
|
+
for document in results:
|
|
409
|
+
assert document.sparse_embedding
|
|
410
|
+
|
|
348
411
|
|
|
349
412
|
class TestQdrantHybridRetriever:
|
|
350
413
|
def test_init_default(self):
|
|
@@ -357,6 +420,8 @@ class TestQdrantHybridRetriever:
|
|
|
357
420
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
358
421
|
assert retriever._return_embedding is False
|
|
359
422
|
assert retriever._score_threshold is None
|
|
423
|
+
assert retriever._group_by is None
|
|
424
|
+
assert retriever._group_size is None
|
|
360
425
|
|
|
361
426
|
retriever = QdrantHybridRetriever(document_store=document_store, filter_policy="replace")
|
|
362
427
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
@@ -416,6 +481,8 @@ class TestQdrantHybridRetriever:
|
|
|
416
481
|
"filter_policy": "replace",
|
|
417
482
|
"return_embedding": True,
|
|
418
483
|
"score_threshold": None,
|
|
484
|
+
"group_by": None,
|
|
485
|
+
"group_size": None,
|
|
419
486
|
},
|
|
420
487
|
}
|
|
421
488
|
|
|
@@ -432,6 +499,8 @@ class TestQdrantHybridRetriever:
|
|
|
432
499
|
"filter_policy": "replace",
|
|
433
500
|
"return_embedding": True,
|
|
434
501
|
"score_threshold": None,
|
|
502
|
+
"group_by": None,
|
|
503
|
+
"group_size": None,
|
|
435
504
|
},
|
|
436
505
|
}
|
|
437
506
|
retriever = QdrantHybridRetriever.from_dict(data)
|
|
@@ -442,6 +511,8 @@ class TestQdrantHybridRetriever:
|
|
|
442
511
|
assert retriever._filter_policy == FilterPolicy.REPLACE
|
|
443
512
|
assert retriever._return_embedding
|
|
444
513
|
assert retriever._score_threshold is None
|
|
514
|
+
assert retriever._group_by is None
|
|
515
|
+
assert retriever._group_size is None
|
|
445
516
|
|
|
446
517
|
def test_from_dict_no_filter_policy(self):
|
|
447
518
|
data = {
|
|
@@ -455,6 +526,8 @@ class TestQdrantHybridRetriever:
|
|
|
455
526
|
"top_k": 5,
|
|
456
527
|
"return_embedding": True,
|
|
457
528
|
"score_threshold": None,
|
|
529
|
+
"group_by": None,
|
|
530
|
+
"group_size": None,
|
|
458
531
|
},
|
|
459
532
|
}
|
|
460
533
|
retriever = QdrantHybridRetriever.from_dict(data)
|
|
@@ -465,6 +538,8 @@ class TestQdrantHybridRetriever:
|
|
|
465
538
|
assert retriever._filter_policy == FilterPolicy.REPLACE # defaults to REPLACE
|
|
466
539
|
assert retriever._return_embedding
|
|
467
540
|
assert retriever._score_threshold is None
|
|
541
|
+
assert retriever._group_by is None
|
|
542
|
+
assert retriever._group_size is None
|
|
468
543
|
|
|
469
544
|
def test_run(self):
|
|
470
545
|
mock_store = Mock(spec=QdrantDocumentStore)
|
|
@@ -488,3 +563,31 @@ class TestQdrantHybridRetriever:
|
|
|
488
563
|
assert res["documents"][0].content == "Test doc"
|
|
489
564
|
assert res["documents"][0].embedding == [0.1, 0.2]
|
|
490
565
|
assert res["documents"][0].sparse_embedding == sparse_embedding
|
|
566
|
+
|
|
567
|
+
def test_run_with_group_by(self):
|
|
568
|
+
mock_store = Mock(spec=QdrantDocumentStore)
|
|
569
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
570
|
+
mock_store._query_hybrid.return_value = [
|
|
571
|
+
Document(content="Test doc", embedding=[0.1, 0.2], sparse_embedding=sparse_embedding)
|
|
572
|
+
]
|
|
573
|
+
|
|
574
|
+
retriever = QdrantHybridRetriever(document_store=mock_store)
|
|
575
|
+
res = retriever.run(
|
|
576
|
+
query_embedding=[0.5, 0.7],
|
|
577
|
+
query_sparse_embedding=SparseEmbedding(indices=[0, 5], values=[0.1, 0.7]),
|
|
578
|
+
group_by="meta.group_field",
|
|
579
|
+
group_size=2,
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
call_args = mock_store._query_hybrid.call_args
|
|
583
|
+
assert call_args[1]["query_embedding"] == [0.5, 0.7]
|
|
584
|
+
assert call_args[1]["query_sparse_embedding"].indices == [0, 5]
|
|
585
|
+
assert call_args[1]["query_sparse_embedding"].values == [0.1, 0.7]
|
|
586
|
+
assert call_args[1]["top_k"] == 10
|
|
587
|
+
assert call_args[1]["return_embedding"] is False
|
|
588
|
+
assert call_args[1]["group_by"] == "meta.group_field"
|
|
589
|
+
assert call_args[1]["group_size"] == 2
|
|
590
|
+
|
|
591
|
+
assert res["documents"][0].content == "Test doc"
|
|
592
|
+
assert res["documents"][0].embedding == [0.1, 0.2]
|
|
593
|
+
assert res["documents"][0].sparse_embedding == sparse_embedding
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|