qdrant-haystack 4.0.0__tar.gz → 4.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/CHANGELOG.md +19 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/PKG-INFO +3 -3
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/pyproject.toml +1 -1
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/src/haystack_integrations/components/retrievers/qdrant/retriever.py +74 -5
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/src/haystack_integrations/document_stores/qdrant/document_store.py +38 -3
- qdrant_haystack-4.1.1/src/haystack_integrations/document_stores/qdrant/filters.py +316 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/test_dict_converters.py +3 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/test_document_store.py +23 -1
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/test_filters.py +106 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/test_retriever.py +94 -0
- qdrant_haystack-4.0.0/src/haystack_integrations/document_stores/qdrant/filters.py +0 -238
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/.gitignore +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/LICENSE.txt +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/README.md +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/examples/embedding_retrieval.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/pydoc/config.yml +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/src/haystack_integrations/components/retrievers/qdrant/__init__.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/src/haystack_integrations/document_stores/qdrant/converters.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/__init__.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/conftest.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/test_converters.py +0 -0
- {qdrant_haystack-4.0.0 → qdrant_haystack-4.1.1}/tests/test_legacy_filters.py +0 -0
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/qdrant-v4.1.0] - 2024-07-03
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- Add `score_threshold` to Qdrant Retrievers (#860)
|
|
8
|
+
- Qdrant - add support for BM42 (#864)
|
|
9
|
+
|
|
10
|
+
## [integrations/qdrant-v4.0.0] - 2024-07-02
|
|
11
|
+
|
|
12
|
+
### 🚜 Refactor
|
|
13
|
+
|
|
14
|
+
- [**breaking**] Qdrant - remove unused init parameters: `content_field`, `name_field`, `embedding_field`, and `duplicate_documents` (#861)
|
|
15
|
+
- [**breaking**] Qdrant - set `scale_score` default value to `False` (#862)
|
|
16
|
+
|
|
17
|
+
### ⚙️ Miscellaneous Tasks
|
|
18
|
+
|
|
19
|
+
- Retry tests to reduce flakiness (#836)
|
|
20
|
+
- Update ruff invocation to include check parameter (#853)
|
|
21
|
+
|
|
3
22
|
## [integrations/qdrant-v3.8.1] - 2024-06-20
|
|
4
23
|
|
|
5
24
|
### 📚 Documentation
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 4.0.0
|
|
3
|
+
Version: 4.1.1
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -18,8 +18,8 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
18
18
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
19
19
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
20
20
|
Requires-Python: >=3.8
|
|
21
|
-
Requires-Dist: haystack-ai
|
|
22
|
-
Requires-Dist: qdrant-client
|
|
21
|
+
Requires-Dist: haystack-ai
|
|
22
|
+
Requires-Dist: qdrant-client>=1.10.0
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
|
|
25
25
|
# qdrant-haystack
|
|
@@ -25,7 +25,7 @@ classifiers = [
|
|
|
25
25
|
"Programming Language :: Python :: Implementation :: CPython",
|
|
26
26
|
"Programming Language :: Python :: Implementation :: PyPy",
|
|
27
27
|
]
|
|
28
|
-
dependencies = ["haystack-ai
|
|
28
|
+
dependencies = ["haystack-ai", "qdrant-client>=1.10.0"]
|
|
29
29
|
|
|
30
30
|
[project.urls]
|
|
31
31
|
Source = "https://github.com/deepset-ai/haystack-core-integrations"
|
|
@@ -2,6 +2,8 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
2
2
|
|
|
3
3
|
from haystack import Document, component, default_from_dict, default_to_dict
|
|
4
4
|
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
5
|
+
from haystack.document_stores.types import FilterPolicy
|
|
6
|
+
from haystack.document_stores.types.filter_policy import apply_filter_policy
|
|
5
7
|
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
6
8
|
from qdrant_client.http import models
|
|
7
9
|
|
|
@@ -39,6 +41,8 @@ class QdrantEmbeddingRetriever:
|
|
|
39
41
|
top_k: int = 10,
|
|
40
42
|
scale_score: bool = False,
|
|
41
43
|
return_embedding: bool = False,
|
|
44
|
+
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
45
|
+
score_threshold: Optional[float] = None,
|
|
42
46
|
):
|
|
43
47
|
"""
|
|
44
48
|
Create a QdrantEmbeddingRetriever component.
|
|
@@ -48,6 +52,11 @@ class QdrantEmbeddingRetriever:
|
|
|
48
52
|
:param top_k: The maximum number of documents to retrieve.
|
|
49
53
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
50
54
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
55
|
+
:param filter_policy: Policy to determine how filters are applied.
|
|
56
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
57
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
58
|
+
depending on the `similarity` function specified in the Document Store.
|
|
59
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
51
60
|
|
|
52
61
|
:raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
|
|
53
62
|
"""
|
|
@@ -61,6 +70,10 @@ class QdrantEmbeddingRetriever:
|
|
|
61
70
|
self._top_k = top_k
|
|
62
71
|
self._scale_score = scale_score
|
|
63
72
|
self._return_embedding = return_embedding
|
|
73
|
+
self._filter_policy = (
|
|
74
|
+
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
|
|
75
|
+
)
|
|
76
|
+
self._score_threshold = score_threshold
|
|
64
77
|
|
|
65
78
|
def to_dict(self) -> Dict[str, Any]:
|
|
66
79
|
"""
|
|
@@ -74,8 +87,10 @@ class QdrantEmbeddingRetriever:
|
|
|
74
87
|
document_store=self._document_store,
|
|
75
88
|
filters=self._filters,
|
|
76
89
|
top_k=self._top_k,
|
|
90
|
+
filter_policy=self._filter_policy.value,
|
|
77
91
|
scale_score=self._scale_score,
|
|
78
92
|
return_embedding=self._return_embedding,
|
|
93
|
+
score_threshold=self._score_threshold,
|
|
79
94
|
)
|
|
80
95
|
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
81
96
|
|
|
@@ -93,6 +108,7 @@ class QdrantEmbeddingRetriever:
|
|
|
93
108
|
"""
|
|
94
109
|
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
|
|
95
110
|
data["init_parameters"]["document_store"] = document_store
|
|
111
|
+
data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
|
|
96
112
|
return default_from_dict(cls, data)
|
|
97
113
|
|
|
98
114
|
@component.output_types(documents=List[Document])
|
|
@@ -103,6 +119,7 @@ class QdrantEmbeddingRetriever:
|
|
|
103
119
|
top_k: Optional[int] = None,
|
|
104
120
|
scale_score: Optional[bool] = None,
|
|
105
121
|
return_embedding: Optional[bool] = None,
|
|
122
|
+
score_threshold: Optional[float] = None,
|
|
106
123
|
):
|
|
107
124
|
"""
|
|
108
125
|
Run the Embedding Retriever on the given input data.
|
|
@@ -112,16 +129,20 @@ class QdrantEmbeddingRetriever:
|
|
|
112
129
|
:param top_k: The maximum number of documents to return.
|
|
113
130
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
114
131
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
132
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
115
133
|
:returns:
|
|
116
134
|
The retrieved documents.
|
|
117
135
|
|
|
118
136
|
"""
|
|
137
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
138
|
+
|
|
119
139
|
docs = self._document_store._query_by_embedding(
|
|
120
140
|
query_embedding=query_embedding,
|
|
121
|
-
filters=filters
|
|
141
|
+
filters=filters,
|
|
122
142
|
top_k=top_k or self._top_k,
|
|
123
143
|
scale_score=scale_score or self._scale_score,
|
|
124
144
|
return_embedding=return_embedding or self._return_embedding,
|
|
145
|
+
score_threshold=score_threshold or self._score_threshold,
|
|
125
146
|
)
|
|
126
147
|
|
|
127
148
|
return {"documents": docs}
|
|
@@ -161,6 +182,8 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
161
182
|
top_k: int = 10,
|
|
162
183
|
scale_score: bool = False,
|
|
163
184
|
return_embedding: bool = False,
|
|
185
|
+
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
186
|
+
score_threshold: Optional[float] = None,
|
|
164
187
|
):
|
|
165
188
|
"""
|
|
166
189
|
Create a QdrantSparseEmbeddingRetriever component.
|
|
@@ -170,6 +193,11 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
170
193
|
:param top_k: The maximum number of documents to retrieve.
|
|
171
194
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
172
195
|
:param return_embedding: Whether to return the sparse embedding of the retrieved Documents.
|
|
196
|
+
:param filter_policy: Policy to determine how filters are applied. Defaults to "replace".
|
|
197
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
198
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
199
|
+
depending on the Distance function used.
|
|
200
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
173
201
|
|
|
174
202
|
:raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
|
|
175
203
|
"""
|
|
@@ -183,6 +211,10 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
183
211
|
self._top_k = top_k
|
|
184
212
|
self._scale_score = scale_score
|
|
185
213
|
self._return_embedding = return_embedding
|
|
214
|
+
self._filter_policy = (
|
|
215
|
+
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
|
|
216
|
+
)
|
|
217
|
+
self._score_threshold = score_threshold
|
|
186
218
|
|
|
187
219
|
def to_dict(self) -> Dict[str, Any]:
|
|
188
220
|
"""
|
|
@@ -197,7 +229,9 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
197
229
|
filters=self._filters,
|
|
198
230
|
top_k=self._top_k,
|
|
199
231
|
scale_score=self._scale_score,
|
|
232
|
+
filter_policy=self._filter_policy.value,
|
|
200
233
|
return_embedding=self._return_embedding,
|
|
234
|
+
score_threshold=self._score_threshold,
|
|
201
235
|
)
|
|
202
236
|
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
203
237
|
|
|
@@ -215,6 +249,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
215
249
|
"""
|
|
216
250
|
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
|
|
217
251
|
data["init_parameters"]["document_store"] = document_store
|
|
252
|
+
data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
|
|
218
253
|
return default_from_dict(cls, data)
|
|
219
254
|
|
|
220
255
|
@component.output_types(documents=List[Document])
|
|
@@ -225,25 +260,35 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
225
260
|
top_k: Optional[int] = None,
|
|
226
261
|
scale_score: Optional[bool] = None,
|
|
227
262
|
return_embedding: Optional[bool] = None,
|
|
263
|
+
score_threshold: Optional[float] = None,
|
|
228
264
|
):
|
|
229
265
|
"""
|
|
230
266
|
Run the Sparse Embedding Retriever on the given input data.
|
|
231
267
|
|
|
232
268
|
:param query_sparse_embedding: Sparse Embedding of the query.
|
|
233
|
-
:param filters:
|
|
269
|
+
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
270
|
+
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
271
|
+
details.
|
|
234
272
|
:param top_k: The maximum number of documents to return.
|
|
235
273
|
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
236
274
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
275
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
276
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
277
|
+
depending on the Distance function used.
|
|
278
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
237
279
|
:returns:
|
|
238
280
|
The retrieved documents.
|
|
239
281
|
|
|
240
282
|
"""
|
|
283
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
284
|
+
|
|
241
285
|
docs = self._document_store._query_by_sparse(
|
|
242
286
|
query_sparse_embedding=query_sparse_embedding,
|
|
243
|
-
filters=filters
|
|
287
|
+
filters=filters,
|
|
244
288
|
top_k=top_k or self._top_k,
|
|
245
289
|
scale_score=scale_score or self._scale_score,
|
|
246
290
|
return_embedding=return_embedding or self._return_embedding,
|
|
291
|
+
score_threshold=score_threshold or self._score_threshold,
|
|
247
292
|
)
|
|
248
293
|
|
|
249
294
|
return {"documents": docs}
|
|
@@ -288,6 +333,8 @@ class QdrantHybridRetriever:
|
|
|
288
333
|
filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
|
|
289
334
|
top_k: int = 10,
|
|
290
335
|
return_embedding: bool = False,
|
|
336
|
+
filter_policy: Union[str, FilterPolicy] = FilterPolicy.REPLACE,
|
|
337
|
+
score_threshold: Optional[float] = None,
|
|
291
338
|
):
|
|
292
339
|
"""
|
|
293
340
|
Create a QdrantHybridRetriever component.
|
|
@@ -296,6 +343,11 @@ class QdrantHybridRetriever:
|
|
|
296
343
|
:param filters: A dictionary with filters to narrow down the search space.
|
|
297
344
|
:param top_k: The maximum number of documents to retrieve.
|
|
298
345
|
:param return_embedding: Whether to return the embeddings of the retrieved Documents.
|
|
346
|
+
:param filter_policy: Policy to determine how filters are applied.
|
|
347
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
348
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
349
|
+
depending on the Distance function used.
|
|
350
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
299
351
|
|
|
300
352
|
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
|
|
301
353
|
"""
|
|
@@ -308,6 +360,10 @@ class QdrantHybridRetriever:
|
|
|
308
360
|
self._filters = filters
|
|
309
361
|
self._top_k = top_k
|
|
310
362
|
self._return_embedding = return_embedding
|
|
363
|
+
self._filter_policy = (
|
|
364
|
+
filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
|
|
365
|
+
)
|
|
366
|
+
self._score_threshold = score_threshold
|
|
311
367
|
|
|
312
368
|
def to_dict(self) -> Dict[str, Any]:
|
|
313
369
|
"""
|
|
@@ -321,7 +377,9 @@ class QdrantHybridRetriever:
|
|
|
321
377
|
document_store=self._document_store.to_dict(),
|
|
322
378
|
filters=self._filters,
|
|
323
379
|
top_k=self._top_k,
|
|
380
|
+
filter_policy=self._filter_policy.value,
|
|
324
381
|
return_embedding=self._return_embedding,
|
|
382
|
+
score_threshold=self._score_threshold,
|
|
325
383
|
)
|
|
326
384
|
|
|
327
385
|
@classmethod
|
|
@@ -336,6 +394,7 @@ class QdrantHybridRetriever:
|
|
|
336
394
|
"""
|
|
337
395
|
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
|
|
338
396
|
data["init_parameters"]["document_store"] = document_store
|
|
397
|
+
data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(data["init_parameters"]["filter_policy"])
|
|
339
398
|
return default_from_dict(cls, data)
|
|
340
399
|
|
|
341
400
|
@component.output_types(documents=List[Document])
|
|
@@ -346,25 +405,35 @@ class QdrantHybridRetriever:
|
|
|
346
405
|
filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
|
|
347
406
|
top_k: Optional[int] = None,
|
|
348
407
|
return_embedding: Optional[bool] = None,
|
|
408
|
+
score_threshold: Optional[float] = None,
|
|
349
409
|
):
|
|
350
410
|
"""
|
|
351
411
|
Run the Hybrid Retriever on the given input data.
|
|
352
412
|
|
|
353
413
|
:param query_embedding: Dense embedding of the query.
|
|
354
414
|
:param query_sparse_embedding: Sparse embedding of the query.
|
|
355
|
-
:param filters:
|
|
415
|
+
:param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
|
|
416
|
+
the `filter_policy` chosen at retriever initialization. See init method docstring for more
|
|
417
|
+
details.
|
|
356
418
|
:param top_k: The maximum number of documents to return.
|
|
357
419
|
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
420
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
421
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
422
|
+
depending on the Distance function used.
|
|
423
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
358
424
|
:returns:
|
|
359
425
|
The retrieved documents.
|
|
360
426
|
|
|
361
427
|
"""
|
|
428
|
+
filters = apply_filter_policy(self._filter_policy, self._filters, filters)
|
|
429
|
+
|
|
362
430
|
docs = self._document_store._query_hybrid(
|
|
363
431
|
query_embedding=query_embedding,
|
|
364
432
|
query_sparse_embedding=query_sparse_embedding,
|
|
365
|
-
filters=filters
|
|
433
|
+
filters=filters,
|
|
366
434
|
top_k=top_k or self._top_k,
|
|
367
435
|
return_embedding=return_embedding or self._return_embedding,
|
|
436
|
+
score_threshold=score_threshold or self._score_threshold,
|
|
368
437
|
)
|
|
369
438
|
|
|
370
439
|
return {"documents": docs}
|
|
@@ -111,6 +111,7 @@ class QdrantDocumentStore:
|
|
|
111
111
|
embedding_dim: int = 768,
|
|
112
112
|
on_disk: bool = False,
|
|
113
113
|
use_sparse_embeddings: bool = False,
|
|
114
|
+
sparse_idf: bool = False,
|
|
114
115
|
similarity: str = "cosine",
|
|
115
116
|
return_embedding: bool = False,
|
|
116
117
|
progress_bar: bool = True,
|
|
@@ -168,6 +169,9 @@ class QdrantDocumentStore:
|
|
|
168
169
|
Whether to store the collection on disk.
|
|
169
170
|
:param use_sparse_embeddings:
|
|
170
171
|
If set to `True`, enables support for sparse embeddings.
|
|
172
|
+
:param sparse_idf:
|
|
173
|
+
If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
|
|
174
|
+
It is required to use techniques like BM42. It is ignored if `use_sparse_embeddings` is `False`.
|
|
171
175
|
:param similarity:
|
|
172
176
|
The similarity metric to use.
|
|
173
177
|
:param return_embedding:
|
|
@@ -246,6 +250,7 @@ class QdrantDocumentStore:
|
|
|
246
250
|
self.recreate_index = recreate_index
|
|
247
251
|
self.payload_fields_to_index = payload_fields_to_index
|
|
248
252
|
self.use_sparse_embeddings = use_sparse_embeddings
|
|
253
|
+
self.sparse_idf = use_sparse_embeddings and sparse_idf
|
|
249
254
|
self.embedding_dim = embedding_dim
|
|
250
255
|
self.on_disk = on_disk
|
|
251
256
|
self.similarity = similarity
|
|
@@ -280,6 +285,7 @@ class QdrantDocumentStore:
|
|
|
280
285
|
self.recreate_index,
|
|
281
286
|
self.similarity,
|
|
282
287
|
self.use_sparse_embeddings,
|
|
288
|
+
self.sparse_idf,
|
|
283
289
|
self.on_disk,
|
|
284
290
|
self.payload_fields_to_index,
|
|
285
291
|
)
|
|
@@ -347,7 +353,9 @@ class QdrantDocumentStore:
|
|
|
347
353
|
if not isinstance(doc, Document):
|
|
348
354
|
msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
|
|
349
355
|
raise ValueError(msg)
|
|
350
|
-
self._set_up_collection(
|
|
356
|
+
self._set_up_collection(
|
|
357
|
+
self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
|
|
358
|
+
)
|
|
351
359
|
|
|
352
360
|
if len(documents) == 0:
|
|
353
361
|
logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
|
|
@@ -498,6 +506,7 @@ class QdrantDocumentStore:
|
|
|
498
506
|
top_k: int = 10,
|
|
499
507
|
scale_score: bool = False,
|
|
500
508
|
return_embedding: bool = False,
|
|
509
|
+
score_threshold: Optional[float] = None,
|
|
501
510
|
) -> List[Document]:
|
|
502
511
|
"""
|
|
503
512
|
Queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
@@ -507,6 +516,10 @@ class QdrantDocumentStore:
|
|
|
507
516
|
:param top_k: Maximum number of documents to return.
|
|
508
517
|
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
509
518
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
519
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
520
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
521
|
+
depending on the Distance function used.
|
|
522
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
510
523
|
|
|
511
524
|
:returns: List of documents that are most similar to `query_sparse_embedding`.
|
|
512
525
|
|
|
@@ -536,6 +549,7 @@ class QdrantDocumentStore:
|
|
|
536
549
|
query_filter=qdrant_filters,
|
|
537
550
|
limit=top_k,
|
|
538
551
|
with_vectors=return_embedding,
|
|
552
|
+
score_threshold=score_threshold,
|
|
539
553
|
)
|
|
540
554
|
results = [
|
|
541
555
|
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
@@ -555,6 +569,7 @@ class QdrantDocumentStore:
|
|
|
555
569
|
top_k: int = 10,
|
|
556
570
|
scale_score: bool = False,
|
|
557
571
|
return_embedding: bool = False,
|
|
572
|
+
score_threshold: Optional[float] = None,
|
|
558
573
|
) -> List[Document]:
|
|
559
574
|
"""
|
|
560
575
|
Queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
@@ -564,6 +579,10 @@ class QdrantDocumentStore:
|
|
|
564
579
|
:param top_k: Maximum number of documents to return.
|
|
565
580
|
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
566
581
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
582
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
583
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
584
|
+
depending on the Distance function used.
|
|
585
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
567
586
|
|
|
568
587
|
:returns: List of documents that are most similar to `query_embedding`.
|
|
569
588
|
"""
|
|
@@ -578,6 +597,7 @@ class QdrantDocumentStore:
|
|
|
578
597
|
query_filter=qdrant_filters,
|
|
579
598
|
limit=top_k,
|
|
580
599
|
with_vectors=return_embedding,
|
|
600
|
+
score_threshold=score_threshold,
|
|
581
601
|
)
|
|
582
602
|
results = [
|
|
583
603
|
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
@@ -600,6 +620,7 @@ class QdrantDocumentStore:
|
|
|
600
620
|
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
601
621
|
top_k: int = 10,
|
|
602
622
|
return_embedding: bool = False,
|
|
623
|
+
score_threshold: Optional[float] = None,
|
|
603
624
|
) -> List[Document]:
|
|
604
625
|
"""
|
|
605
626
|
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
@@ -612,6 +633,10 @@ class QdrantDocumentStore:
|
|
|
612
633
|
:param filters: Filters applied to the retrieved documents.
|
|
613
634
|
:param top_k: Maximum number of documents to return.
|
|
614
635
|
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
636
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
637
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
638
|
+
depending on the Distance function used.
|
|
639
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
615
640
|
|
|
616
641
|
:returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
|
|
617
642
|
|
|
@@ -642,6 +667,7 @@ class QdrantDocumentStore:
|
|
|
642
667
|
limit=top_k,
|
|
643
668
|
with_payload=True,
|
|
644
669
|
with_vector=return_embedding,
|
|
670
|
+
score_threshold=score_threshold,
|
|
645
671
|
)
|
|
646
672
|
|
|
647
673
|
dense_request = rest.SearchRequest(
|
|
@@ -714,6 +740,7 @@ class QdrantDocumentStore:
|
|
|
714
740
|
recreate_collection: bool,
|
|
715
741
|
similarity: str,
|
|
716
742
|
use_sparse_embeddings: bool,
|
|
743
|
+
sparse_idf: bool,
|
|
717
744
|
on_disk: bool = False,
|
|
718
745
|
payload_fields_to_index: Optional[List[dict]] = None,
|
|
719
746
|
):
|
|
@@ -729,6 +756,8 @@ class QdrantDocumentStore:
|
|
|
729
756
|
The similarity measure to use.
|
|
730
757
|
:param use_sparse_embeddings:
|
|
731
758
|
Whether to use sparse embeddings.
|
|
759
|
+
:param sparse_idf:
|
|
760
|
+
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
732
761
|
:param on_disk:
|
|
733
762
|
Whether to store the collection on disk.
|
|
734
763
|
:param payload_fields_to_index:
|
|
@@ -745,7 +774,9 @@ class QdrantDocumentStore:
|
|
|
745
774
|
if recreate_collection or not self.client.collection_exists(collection_name):
|
|
746
775
|
# There is no need to verify the current configuration of that
|
|
747
776
|
# collection. It might be just recreated again or does not exist yet.
|
|
748
|
-
self.recreate_collection(
|
|
777
|
+
self.recreate_collection(
|
|
778
|
+
collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
|
|
779
|
+
)
|
|
749
780
|
# Create Payload index if payload_fields_to_index is provided
|
|
750
781
|
self._create_payload_index(collection_name, payload_fields_to_index)
|
|
751
782
|
return
|
|
@@ -808,6 +839,7 @@ class QdrantDocumentStore:
|
|
|
808
839
|
embedding_dim: int,
|
|
809
840
|
on_disk: Optional[bool] = None,
|
|
810
841
|
use_sparse_embeddings: Optional[bool] = None,
|
|
842
|
+
sparse_idf: bool = False,
|
|
811
843
|
):
|
|
812
844
|
"""
|
|
813
845
|
Recreates the Qdrant collection with the specified parameters.
|
|
@@ -822,6 +854,8 @@ class QdrantDocumentStore:
|
|
|
822
854
|
Whether to store the collection on disk.
|
|
823
855
|
:param use_sparse_embeddings:
|
|
824
856
|
Whether to use sparse embeddings.
|
|
857
|
+
:param sparse_idf:
|
|
858
|
+
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
825
859
|
"""
|
|
826
860
|
if on_disk is None:
|
|
827
861
|
on_disk = self.on_disk
|
|
@@ -840,7 +874,8 @@ class QdrantDocumentStore:
|
|
|
840
874
|
SPARSE_VECTORS_NAME: rest.SparseVectorParams(
|
|
841
875
|
index=rest.SparseIndexParams(
|
|
842
876
|
on_disk=on_disk,
|
|
843
|
-
)
|
|
877
|
+
),
|
|
878
|
+
modifier=rest.Modifier.IDF if sparse_idf else None,
|
|
844
879
|
),
|
|
845
880
|
}
|
|
846
881
|
|