qdrant-haystack 4.0.0__tar.gz → 4.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/CHANGELOG.md +12 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/PKG-INFO +2 -2
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/pyproject.toml +1 -1
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/components/retrievers/qdrant/retriever.py +36 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/document_stores/qdrant/document_store.py +38 -3
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/test_dict_converters.py +3 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/test_document_store.py +23 -1
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/test_retriever.py +37 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/.gitignore +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/LICENSE.txt +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/README.md +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/examples/embedding_retrieval.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/pydoc/config.yml +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/components/retrievers/qdrant/__init__.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/document_stores/qdrant/converters.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/document_stores/qdrant/filters.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/__init__.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/conftest.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/test_converters.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/test_filters.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.0}/tests/test_legacy_filters.py +0 -0
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/qdrant-v4.0.0] - 2024-07-02
|
|
4
|
+
|
|
5
|
+
### 🚜 Refactor
|
|
6
|
+
|
|
7
|
+
- [**breaking**] Qdrant - remove unused init parameters: `content_field`, `name_field`, `embedding_field`, and `duplicate_documents` (#861)
|
|
8
|
+
- [**breaking**] Qdrant - set `scale_score` default value to `False` (#862)
|
|
9
|
+
|
|
10
|
+
### ⚙️ Miscellaneous Tasks
|
|
11
|
+
|
|
12
|
+
- Retry tests to reduce flakiness (#836)
|
|
13
|
+
- Update ruff invocation to include check parameter (#853)
|
|
14
|
+
|
|
3
15
|
## [integrations/qdrant-v3.8.1] - 2024-06-20
|
|
4
16
|
|
|
5
17
|
### 📚 Documentation
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 4.0.0
|
|
3
|
+
Version: 4.1.0
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
|
19
19
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
20
20
|
Requires-Python: >=3.8
|
|
21
21
|
Requires-Dist: haystack-ai>=2.0.1
|
|
22
|
-
Requires-Dist: qdrant-client
|
|
22
|
+
Requires-Dist: qdrant-client>=1.10.0
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
|
|
25
25
|
# qdrant-haystack
|
|
@@ -25,7 +25,7 @@ classifiers = [
|
|
|
25
25
|
"Programming Language :: Python :: Implementation :: CPython",
|
|
26
26
|
"Programming Language :: Python :: Implementation :: PyPy",
|
|
27
27
|
]
|
|
28
|
-
dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]
|
|
28
|
+
dependencies = ["haystack-ai>=2.0.1", "qdrant-client>=1.10.0"]
|
|
29
29
|
|
|
30
30
|
[project.urls]
|
|
31
31
|
Source = "https://github.com/deepset-ai/haystack-core-integrations"
|
|
@@ -39,6 +39,7 @@ class QdrantEmbeddingRetriever:
|
|
|
39
39
|
top_k: int = 10,
|
|
40
40
|
scale_score: bool = False,
|
|
41
41
|
return_embedding: bool = False,
|
|
42
|
+
score_threshold: Optional[float] = None,
|
|
42
43
|
):
|
|
43
44
|
"""
|
|
44
45
|
Create a QdrantEmbeddingRetriever component.
|
|
@@ -48,6 +49,10 @@ class QdrantEmbeddingRetriever:
|
|
|
48
49
|
:param top_k: The maximum number of documents to retrieve.
|
|
49
50
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
50
51
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
52
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
53
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
54
|
+
depending on the `similarity` function specified in the Document Store.
|
|
55
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
51
56
|
|
|
52
57
|
:raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
|
|
53
58
|
"""
|
|
@@ -61,6 +66,7 @@ class QdrantEmbeddingRetriever:
|
|
|
61
66
|
self._top_k = top_k
|
|
62
67
|
self._scale_score = scale_score
|
|
63
68
|
self._return_embedding = return_embedding
|
|
69
|
+
self._score_threshold = score_threshold
|
|
64
70
|
|
|
65
71
|
def to_dict(self) -> Dict[str, Any]:
|
|
66
72
|
"""
|
|
@@ -76,6 +82,7 @@ class QdrantEmbeddingRetriever:
|
|
|
76
82
|
top_k=self._top_k,
|
|
77
83
|
scale_score=self._scale_score,
|
|
78
84
|
return_embedding=self._return_embedding,
|
|
85
|
+
score_threshold=self._score_threshold,
|
|
79
86
|
)
|
|
80
87
|
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
81
88
|
|
|
@@ -103,6 +110,7 @@ class QdrantEmbeddingRetriever:
|
|
|
103
110
|
top_k: Optional[int] = None,
|
|
104
111
|
scale_score: Optional[bool] = None,
|
|
105
112
|
return_embedding: Optional[bool] = None,
|
|
113
|
+
score_threshold: Optional[float] = None,
|
|
106
114
|
):
|
|
107
115
|
"""
|
|
108
116
|
Run the Embedding Retriever on the given input data.
|
|
@@ -112,6 +120,7 @@ class QdrantEmbeddingRetriever:
|
|
|
112
120
|
:param top_k: The maximum number of documents to return.
|
|
113
121
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
114
122
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
123
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
115
124
|
:returns:
|
|
116
125
|
The retrieved documents.
|
|
117
126
|
|
|
@@ -122,6 +131,7 @@ class QdrantEmbeddingRetriever:
|
|
|
122
131
|
top_k=top_k or self._top_k,
|
|
123
132
|
scale_score=scale_score or self._scale_score,
|
|
124
133
|
return_embedding=return_embedding or self._return_embedding,
|
|
134
|
+
score_threshold=score_threshold or self._score_threshold,
|
|
125
135
|
)
|
|
126
136
|
|
|
127
137
|
return {"documents": docs}
|
|
@@ -161,6 +171,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
161
171
|
top_k: int = 10,
|
|
162
172
|
scale_score: bool = False,
|
|
163
173
|
return_embedding: bool = False,
|
|
174
|
+
score_threshold: Optional[float] = None,
|
|
164
175
|
):
|
|
165
176
|
"""
|
|
166
177
|
Create a QdrantSparseEmbeddingRetriever component.
|
|
@@ -170,6 +181,10 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
170
181
|
:param top_k: The maximum number of documents to retrieve.
|
|
171
182
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
172
183
|
:param return_embedding: Whether to return the sparse embedding of the retrieved Documents.
|
|
184
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
185
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
186
|
+
depending on the Distance function used.
|
|
187
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
173
188
|
|
|
174
189
|
:raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
|
|
175
190
|
"""
|
|
@@ -183,6 +198,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
183
198
|
self._top_k = top_k
|
|
184
199
|
self._scale_score = scale_score
|
|
185
200
|
self._return_embedding = return_embedding
|
|
201
|
+
self._score_threshold = score_threshold
|
|
186
202
|
|
|
187
203
|
def to_dict(self) -> Dict[str, Any]:
|
|
188
204
|
"""
|
|
@@ -198,6 +214,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
198
214
|
top_k=self._top_k,
|
|
199
215
|
scale_score=self._scale_score,
|
|
200
216
|
return_embedding=self._return_embedding,
|
|
217
|
+
score_threshold=self._score_threshold,
|
|
201
218
|
)
|
|
202
219
|
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
203
220
|
|
|
@@ -225,6 +242,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
225
242
|
top_k: Optional[int] = None,
|
|
226
243
|
scale_score: Optional[bool] = None,
|
|
227
244
|
return_embedding: Optional[bool] = None,
|
|
245
|
+
score_threshold: Optional[float] = None,
|
|
228
246
|
):
|
|
229
247
|
"""
|
|
230
248
|
Run the Sparse Embedding Retriever on the given input data.
|
|
@@ -234,6 +252,10 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
234
252
|
:param top_k: The maximum number of documents to return.
|
|
235
253
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
236
254
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
255
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
256
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
257
|
+
depending on the Distance function used.
|
|
258
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
237
259
|
:returns:
|
|
238
260
|
The retrieved documents.
|
|
239
261
|
|
|
@@ -244,6 +266,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
244
266
|
top_k=top_k or self._top_k,
|
|
245
267
|
scale_score=scale_score or self._scale_score,
|
|
246
268
|
return_embedding=return_embedding or self._return_embedding,
|
|
269
|
+
score_threshold=score_threshold or self._score_threshold,
|
|
247
270
|
)
|
|
248
271
|
|
|
249
272
|
return {"documents": docs}
|
|
@@ -288,6 +311,7 @@ class QdrantHybridRetriever:
|
|
|
288
311
|
filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
|
|
289
312
|
top_k: int = 10,
|
|
290
313
|
return_embedding: bool = False,
|
|
314
|
+
score_threshold: Optional[float] = None,
|
|
291
315
|
):
|
|
292
316
|
"""
|
|
293
317
|
Create a QdrantHybridRetriever component.
|
|
@@ -296,6 +320,10 @@ class QdrantHybridRetriever:
|
|
|
296
320
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
297
321
|
:param top_k: The maximum number of documents to retrieve.
|
|
298
322
|
:param return_embedding: Whether to return the embeddings of the retrieved Documents.
|
|
323
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
324
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
325
|
+
depending on the Distance function used.
|
|
326
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
299
327
|
|
|
300
328
|
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
|
|
301
329
|
"""
|
|
@@ -308,6 +336,7 @@ class QdrantHybridRetriever:
|
|
|
308
336
|
self._filters = filters
|
|
309
337
|
self._top_k = top_k
|
|
310
338
|
self._return_embedding = return_embedding
|
|
339
|
+
self._score_threshold = score_threshold
|
|
311
340
|
|
|
312
341
|
def to_dict(self) -> Dict[str, Any]:
|
|
313
342
|
"""
|
|
@@ -322,6 +351,7 @@ class QdrantHybridRetriever:
|
|
|
322
351
|
filters=self._filters,
|
|
323
352
|
top_k=self._top_k,
|
|
324
353
|
return_embedding=self._return_embedding,
|
|
354
|
+
score_threshold=self._score_threshold,
|
|
325
355
|
)
|
|
326
356
|
|
|
327
357
|
@classmethod
|
|
@@ -346,6 +376,7 @@ class QdrantHybridRetriever:
|
|
|
346
376
|
filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
|
|
347
377
|
top_k: Optional[int] = None,
|
|
348
378
|
return_embedding: Optional[bool] = None,
|
|
379
|
+
score_threshold: Optional[float] = None,
|
|
349
380
|
):
|
|
350
381
|
"""
|
|
351
382
|
Run the Sparse Embedding Retriever on the given input data.
|
|
@@ -355,6 +386,10 @@ class QdrantHybridRetriever:
|
|
|
355
386
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
356
387
|
:param top_k: The maximum number of documents to return.
|
|
357
388
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
389
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
390
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
391
|
+
depending on the Distance function used.
|
|
392
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
358
393
|
:returns:
|
|
359
394
|
The retrieved documents.
|
|
360
395
|
|
|
@@ -365,6 +400,7 @@ class QdrantHybridRetriever:
|
|
|
365
400
|
filters=filters or self._filters,
|
|
366
401
|
top_k=top_k or self._top_k,
|
|
367
402
|
return_embedding=return_embedding or self._return_embedding,
|
|
403
|
+
score_threshold=score_threshold or self._score_threshold,
|
|
368
404
|
)
|
|
369
405
|
|
|
370
406
|
return {"documents": docs}
|
|
@@ -111,6 +111,7 @@ class QdrantDocumentStore:
|
|
|
111
111
|
embedding_dim: int = 768,
|
|
112
112
|
on_disk: bool = False,
|
|
113
113
|
use_sparse_embeddings: bool = False,
|
|
114
|
+
sparse_idf: bool = False,
|
|
114
115
|
similarity: str = "cosine",
|
|
115
116
|
return_embedding: bool = False,
|
|
116
117
|
progress_bar: bool = True,
|
|
@@ -168,6 +169,9 @@ class QdrantDocumentStore:
|
|
|
168
169
|
Whether to store the collection on disk.
|
|
169
170
|
:param use_sparse_embedding:
|
|
170
171
|
If set to `True`, enables support for sparse embeddings.
|
|
172
|
+
:param sparse_idf:
|
|
173
|
+
If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
|
|
174
|
+
It is required to use techniques like BM42. It is ignored if `use_sparse_embeddings` is `False`.
|
|
171
175
|
:param similarity:
|
|
172
176
|
The similarity metric to use.
|
|
173
177
|
:param return_embedding:
|
|
@@ -246,6 +250,7 @@ class QdrantDocumentStore:
|
|
|
246
250
|
self.recreate_index = recreate_index
|
|
247
251
|
self.payload_fields_to_index = payload_fields_to_index
|
|
248
252
|
self.use_sparse_embeddings = use_sparse_embeddings
|
|
253
|
+
self.sparse_idf = use_sparse_embeddings and sparse_idf
|
|
249
254
|
self.embedding_dim = embedding_dim
|
|
250
255
|
self.on_disk = on_disk
|
|
251
256
|
self.similarity = similarity
|
|
@@ -280,6 +285,7 @@ class QdrantDocumentStore:
|
|
|
280
285
|
self.recreate_index,
|
|
281
286
|
self.similarity,
|
|
282
287
|
self.use_sparse_embeddings,
|
|
288
|
+
self.sparse_idf,
|
|
283
289
|
self.on_disk,
|
|
284
290
|
self.payload_fields_to_index,
|
|
285
291
|
)
|
|
@@ -347,7 +353,9 @@ class QdrantDocumentStore:
|
|
|
347
353
|
if not isinstance(doc, Document):
|
|
348
354
|
msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
|
|
349
355
|
raise ValueError(msg)
|
|
350
|
-
self._set_up_collection(self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings)
|
|
356
|
+
self._set_up_collection(
|
|
357
|
+
self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
|
|
358
|
+
)
|
|
351
359
|
|
|
352
360
|
if len(documents) == 0:
|
|
353
361
|
logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
|
|
@@ -498,6 +506,7 @@ class QdrantDocumentStore:
|
|
|
498
506
|
top_k: int = 10,
|
|
499
507
|
scale_score: bool = False,
|
|
500
508
|
return_embedding: bool = False,
|
|
509
|
+
score_threshold: Optional[float] = None,
|
|
501
510
|
) -> List[Document]:
|
|
502
511
|
"""
|
|
503
512
|
Queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
@@ -507,6 +516,10 @@ class QdrantDocumentStore:
|
|
|
507
516
|
:param top_k: Maximum number of documents to return.
|
|
508
517
|
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
509
518
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
519
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
520
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
521
|
+
depending on the Distance function used.
|
|
522
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
510
523
|
|
|
511
524
|
:returns: List of documents that are most similar to `query_sparse_embedding`.
|
|
512
525
|
|
|
@@ -536,6 +549,7 @@ class QdrantDocumentStore:
|
|
|
536
549
|
query_filter=qdrant_filters,
|
|
537
550
|
limit=top_k,
|
|
538
551
|
with_vectors=return_embedding,
|
|
552
|
+
score_threshold=score_threshold,
|
|
539
553
|
)
|
|
540
554
|
results = [
|
|
541
555
|
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
@@ -555,6 +569,7 @@ class QdrantDocumentStore:
|
|
|
555
569
|
top_k: int = 10,
|
|
556
570
|
scale_score: bool = False,
|
|
557
571
|
return_embedding: bool = False,
|
|
572
|
+
score_threshold: Optional[float] = None,
|
|
558
573
|
) -> List[Document]:
|
|
559
574
|
"""
|
|
560
575
|
Queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
@@ -564,6 +579,10 @@ class QdrantDocumentStore:
|
|
|
564
579
|
:param top_k: Maximum number of documents to return.
|
|
565
580
|
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
566
581
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
582
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
583
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
584
|
+
depending on the Distance function used.
|
|
585
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
567
586
|
|
|
568
587
|
:returns: List of documents that are most similar to `query_embedding`.
|
|
569
588
|
"""
|
|
@@ -578,6 +597,7 @@ class QdrantDocumentStore:
|
|
|
578
597
|
query_filter=qdrant_filters,
|
|
579
598
|
limit=top_k,
|
|
580
599
|
with_vectors=return_embedding,
|
|
600
|
+
score_threshold=score_threshold,
|
|
581
601
|
)
|
|
582
602
|
results = [
|
|
583
603
|
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
@@ -600,6 +620,7 @@ class QdrantDocumentStore:
|
|
|
600
620
|
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
601
621
|
top_k: int = 10,
|
|
602
622
|
return_embedding: bool = False,
|
|
623
|
+
score_threshold: Optional[float] = None,
|
|
603
624
|
) -> List[Document]:
|
|
604
625
|
"""
|
|
605
626
|
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
@@ -612,6 +633,10 @@ class QdrantDocumentStore:
|
|
|
612
633
|
:param filters: Filters applied to the retrieved documents.
|
|
613
634
|
:param top_k: Maximum number of documents to return.
|
|
614
635
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
636
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
637
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
638
|
+
depending on the Distance function used.
|
|
639
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
615
640
|
|
|
616
641
|
:returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
|
|
617
642
|
|
|
@@ -642,6 +667,7 @@ class QdrantDocumentStore:
|
|
|
642
667
|
limit=top_k,
|
|
643
668
|
with_payload=True,
|
|
644
669
|
with_vector=return_embedding,
|
|
670
|
+
score_threshold=score_threshold,
|
|
645
671
|
)
|
|
646
672
|
|
|
647
673
|
dense_request = rest.SearchRequest(
|
|
@@ -714,6 +740,7 @@ class QdrantDocumentStore:
|
|
|
714
740
|
recreate_collection: bool,
|
|
715
741
|
similarity: str,
|
|
716
742
|
use_sparse_embeddings: bool,
|
|
743
|
+
sparse_idf: bool,
|
|
717
744
|
on_disk: bool = False,
|
|
718
745
|
payload_fields_to_index: Optional[List[dict]] = None,
|
|
719
746
|
):
|
|
@@ -729,6 +756,8 @@ class QdrantDocumentStore:
|
|
|
729
756
|
The similarity measure to use.
|
|
730
757
|
:param use_sparse_embeddings:
|
|
731
758
|
Whether to use sparse embeddings.
|
|
759
|
+
:param sparse_idf:
|
|
760
|
+
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
732
761
|
:param on_disk:
|
|
733
762
|
Whether to store the collection on disk.
|
|
734
763
|
:param payload_fields_to_index:
|
|
@@ -745,7 +774,9 @@ class QdrantDocumentStore:
|
|
|
745
774
|
if recreate_collection or not self.client.collection_exists(collection_name):
|
|
746
775
|
# There is no need to verify the current configuration of that
|
|
747
776
|
# collection. It might be just recreated again or does not exist yet.
|
|
748
|
-
self.recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings)
|
|
777
|
+
self.recreate_collection(
|
|
778
|
+
collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
|
|
779
|
+
)
|
|
749
780
|
# Create Payload index if payload_fields_to_index is provided
|
|
750
781
|
self._create_payload_index(collection_name, payload_fields_to_index)
|
|
751
782
|
return
|
|
@@ -808,6 +839,7 @@ class QdrantDocumentStore:
|
|
|
808
839
|
embedding_dim: int,
|
|
809
840
|
on_disk: Optional[bool] = None,
|
|
810
841
|
use_sparse_embeddings: Optional[bool] = None,
|
|
842
|
+
sparse_idf: bool = False,
|
|
811
843
|
):
|
|
812
844
|
"""
|
|
813
845
|
Recreates the Qdrant collection with the specified parameters.
|
|
@@ -822,6 +854,8 @@ class QdrantDocumentStore:
|
|
|
822
854
|
Whether to store the collection on disk.
|
|
823
855
|
:param use_sparse_embeddings:
|
|
824
856
|
Whether to use sparse embeddings.
|
|
857
|
+
:param sparse_idf:
|
|
858
|
+
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
825
859
|
"""
|
|
826
860
|
if on_disk is None:
|
|
827
861
|
on_disk = self.on_disk
|
|
@@ -840,7 +874,8 @@ class QdrantDocumentStore:
|
|
|
840
874
|
SPARSE_VECTORS_NAME: rest.SparseVectorParams(
|
|
841
875
|
index=rest.SparseIndexParams(
|
|
842
876
|
on_disk=on_disk,
|
|
843
|
-
)
|
|
877
|
+
),
|
|
878
|
+
modifier=rest.Modifier.IDF if sparse_idf else None,
|
|
844
879
|
),
|
|
845
880
|
}
|
|
846
881
|
|
|
@@ -24,6 +24,7 @@ def test_to_dict():
|
|
|
24
24
|
"on_disk": False,
|
|
25
25
|
"force_disable_check_same_thread": False,
|
|
26
26
|
"use_sparse_embeddings": False,
|
|
27
|
+
"sparse_idf": False,
|
|
27
28
|
"similarity": "cosine",
|
|
28
29
|
"return_embedding": False,
|
|
29
30
|
"progress_bar": True,
|
|
@@ -60,6 +61,7 @@ def test_from_dict():
|
|
|
60
61
|
"on_disk": False,
|
|
61
62
|
"force_disable_check_same_thread": False,
|
|
62
63
|
"use_sparse_embeddings": True,
|
|
64
|
+
"sparse_idf": True,
|
|
63
65
|
"similarity": "cosine",
|
|
64
66
|
"return_embedding": False,
|
|
65
67
|
"progress_bar": True,
|
|
@@ -81,6 +83,7 @@ def test_from_dict():
|
|
|
81
83
|
document_store.index == "test",
|
|
82
84
|
document_store.force_disable_check_same_thread is False,
|
|
83
85
|
document_store.use_sparse_embeddings is True,
|
|
86
|
+
document_store.sparse_idf is True,
|
|
84
87
|
document_store.on_disk is False,
|
|
85
88
|
document_store.similarity == "cosine",
|
|
86
89
|
document_store.return_embedding is False,
|
|
@@ -12,7 +12,12 @@ from haystack.testing.document_store import (
|
|
|
12
12
|
WriteDocumentsTest,
|
|
13
13
|
_random_embeddings,
|
|
14
14
|
)
|
|
15
|
-
from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError
|
|
15
|
+
from haystack_integrations.document_stores.qdrant.document_store import (
|
|
16
|
+
SPARSE_VECTORS_NAME,
|
|
17
|
+
QdrantDocumentStore,
|
|
18
|
+
QdrantStoreError,
|
|
19
|
+
)
|
|
20
|
+
from qdrant_client.http import models as rest
|
|
16
21
|
|
|
17
22
|
|
|
18
23
|
class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
|
|
@@ -49,6 +54,23 @@ class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocu
|
|
|
49
54
|
with pytest.raises(DuplicateDocumentError):
|
|
50
55
|
document_store.write_documents(docs, DuplicatePolicy.FAIL)
|
|
51
56
|
|
|
57
|
+
def test_sparse_configuration(self):
|
|
58
|
+
document_store = QdrantDocumentStore(
|
|
59
|
+
":memory:",
|
|
60
|
+
recreate_index=True,
|
|
61
|
+
use_sparse_embeddings=True,
|
|
62
|
+
sparse_idf=True,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
client = document_store.client
|
|
66
|
+
sparse_config = client.get_collection("Document").config.params.sparse_vectors
|
|
67
|
+
|
|
68
|
+
assert SPARSE_VECTORS_NAME in sparse_config
|
|
69
|
+
|
|
70
|
+
# check that the `sparse_idf` parameter takes effect
|
|
71
|
+
assert hasattr(sparse_config[SPARSE_VECTORS_NAME], "modifier")
|
|
72
|
+
assert sparse_config[SPARSE_VECTORS_NAME].modifier == rest.Modifier.IDF
|
|
73
|
+
|
|
52
74
|
def test_query_hybrid(self, generate_sparse_embedding):
|
|
53
75
|
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
54
76
|
|
|
@@ -22,6 +22,7 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
22
22
|
assert retriever._filters is None
|
|
23
23
|
assert retriever._top_k == 10
|
|
24
24
|
assert retriever._return_embedding is False
|
|
25
|
+
assert retriever._score_threshold is None
|
|
25
26
|
|
|
26
27
|
def test_to_dict(self):
|
|
27
28
|
document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False)
|
|
@@ -49,6 +50,7 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
49
50
|
"on_disk": False,
|
|
50
51
|
"force_disable_check_same_thread": False,
|
|
51
52
|
"use_sparse_embeddings": False,
|
|
53
|
+
"sparse_idf": False,
|
|
52
54
|
"similarity": "cosine",
|
|
53
55
|
"return_embedding": False,
|
|
54
56
|
"progress_bar": True,
|
|
@@ -73,6 +75,7 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
73
75
|
"top_k": 10,
|
|
74
76
|
"scale_score": False,
|
|
75
77
|
"return_embedding": False,
|
|
78
|
+
"score_threshold": None,
|
|
76
79
|
},
|
|
77
80
|
}
|
|
78
81
|
|
|
@@ -88,6 +91,7 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
88
91
|
"top_k": 5,
|
|
89
92
|
"scale_score": False,
|
|
90
93
|
"return_embedding": True,
|
|
94
|
+
"score_threshold": None,
|
|
91
95
|
},
|
|
92
96
|
}
|
|
93
97
|
retriever = QdrantEmbeddingRetriever.from_dict(data)
|
|
@@ -97,6 +101,7 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
97
101
|
assert retriever._top_k == 5
|
|
98
102
|
assert retriever._scale_score is False
|
|
99
103
|
assert retriever._return_embedding is True
|
|
104
|
+
assert retriever._score_threshold is None
|
|
100
105
|
|
|
101
106
|
def test_run(self, filterable_docs: List[Document]):
|
|
102
107
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=False)
|
|
@@ -114,6 +119,28 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
114
119
|
for document in results:
|
|
115
120
|
assert document.embedding is None
|
|
116
121
|
|
|
122
|
+
def test_run_with_score_threshold(self):
|
|
123
|
+
document_store = QdrantDocumentStore(
|
|
124
|
+
embedding_dim=4, location=":memory:", similarity="cosine", index="Boi", use_sparse_embeddings=False
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
document_store.write_documents(
|
|
128
|
+
[
|
|
129
|
+
Document(
|
|
130
|
+
content="Yet another document",
|
|
131
|
+
embedding=[-0.1, -0.9, -10.0, -0.2],
|
|
132
|
+
),
|
|
133
|
+
Document(content="The document", embedding=[1.0, 1.0, 1.0, 1.0]),
|
|
134
|
+
Document(content="Another document", embedding=[0.8, 0.8, 0.5, 1.0]),
|
|
135
|
+
]
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
retriever = QdrantEmbeddingRetriever(document_store=document_store)
|
|
139
|
+
results = retriever.run(
|
|
140
|
+
query_embedding=[0.9, 0.9, 0.9, 0.9], top_k=5, return_embedding=False, score_threshold=0.5
|
|
141
|
+
)["documents"]
|
|
142
|
+
assert len(results) == 2
|
|
143
|
+
|
|
117
144
|
def test_run_with_sparse_activated(self, filterable_docs: List[Document]):
|
|
118
145
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
119
146
|
|
|
@@ -141,6 +168,7 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
141
168
|
assert retriever._filters is None
|
|
142
169
|
assert retriever._top_k == 10
|
|
143
170
|
assert retriever._return_embedding is False
|
|
171
|
+
assert retriever._score_threshold is None
|
|
144
172
|
|
|
145
173
|
def test_to_dict(self):
|
|
146
174
|
document_store = QdrantDocumentStore(location=":memory:", index="test")
|
|
@@ -168,6 +196,7 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
168
196
|
"on_disk": False,
|
|
169
197
|
"force_disable_check_same_thread": False,
|
|
170
198
|
"use_sparse_embeddings": False,
|
|
199
|
+
"sparse_idf": False,
|
|
171
200
|
"similarity": "cosine",
|
|
172
201
|
"return_embedding": False,
|
|
173
202
|
"progress_bar": True,
|
|
@@ -192,6 +221,7 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
192
221
|
"top_k": 10,
|
|
193
222
|
"scale_score": False,
|
|
194
223
|
"return_embedding": False,
|
|
224
|
+
"score_threshold": None,
|
|
195
225
|
},
|
|
196
226
|
}
|
|
197
227
|
|
|
@@ -207,6 +237,7 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
207
237
|
"top_k": 5,
|
|
208
238
|
"scale_score": False,
|
|
209
239
|
"return_embedding": True,
|
|
240
|
+
"score_threshold": None,
|
|
210
241
|
},
|
|
211
242
|
}
|
|
212
243
|
retriever = QdrantSparseEmbeddingRetriever.from_dict(data)
|
|
@@ -216,6 +247,7 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
216
247
|
assert retriever._top_k == 5
|
|
217
248
|
assert retriever._scale_score is False
|
|
218
249
|
assert retriever._return_embedding is True
|
|
250
|
+
assert retriever._score_threshold is None
|
|
219
251
|
|
|
220
252
|
def test_run(self, filterable_docs: List[Document], generate_sparse_embedding):
|
|
221
253
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
@@ -247,6 +279,7 @@ class TestQdrantHybridRetriever:
|
|
|
247
279
|
assert retriever._filters is None
|
|
248
280
|
assert retriever._top_k == 10
|
|
249
281
|
assert retriever._return_embedding is False
|
|
282
|
+
assert retriever._score_threshold is None
|
|
250
283
|
|
|
251
284
|
def test_to_dict(self):
|
|
252
285
|
document_store = QdrantDocumentStore(location=":memory:", index="test")
|
|
@@ -274,6 +307,7 @@ class TestQdrantHybridRetriever:
|
|
|
274
307
|
"on_disk": False,
|
|
275
308
|
"force_disable_check_same_thread": False,
|
|
276
309
|
"use_sparse_embeddings": False,
|
|
310
|
+
"sparse_idf": False,
|
|
277
311
|
"similarity": "cosine",
|
|
278
312
|
"return_embedding": False,
|
|
279
313
|
"progress_bar": True,
|
|
@@ -297,6 +331,7 @@ class TestQdrantHybridRetriever:
|
|
|
297
331
|
"filters": None,
|
|
298
332
|
"top_k": 5,
|
|
299
333
|
"return_embedding": True,
|
|
334
|
+
"score_threshold": None,
|
|
300
335
|
},
|
|
301
336
|
}
|
|
302
337
|
|
|
@@ -311,6 +346,7 @@ class TestQdrantHybridRetriever:
|
|
|
311
346
|
"filters": None,
|
|
312
347
|
"top_k": 5,
|
|
313
348
|
"return_embedding": True,
|
|
349
|
+
"score_threshold": None,
|
|
314
350
|
},
|
|
315
351
|
}
|
|
316
352
|
retriever = QdrantHybridRetriever.from_dict(data)
|
|
@@ -319,6 +355,7 @@ class TestQdrantHybridRetriever:
|
|
|
319
355
|
assert retriever._filters is None
|
|
320
356
|
assert retriever._top_k == 5
|
|
321
357
|
assert retriever._return_embedding
|
|
358
|
+
assert retriever._score_threshold is None
|
|
322
359
|
|
|
323
360
|
def test_run(self):
|
|
324
361
|
mock_store = Mock(spec=QdrantDocumentStore)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|