qdrant-haystack 3.4.0__tar.gz → 3.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/PKG-INFO +2 -1
  2. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/pydoc/config.yml +1 -1
  3. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/pyproject.toml +1 -0
  4. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/components/retrievers/qdrant/__init__.py +2 -2
  5. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/components/retrievers/qdrant/retriever.py +150 -21
  6. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/document_stores/qdrant/document_store.py +122 -34
  7. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/document_stores/qdrant/filters.py +3 -2
  8. qdrant_haystack-3.6.0/tests/conftest.py +18 -0
  9. qdrant_haystack-3.6.0/tests/test_document_store.py +109 -0
  10. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/tests/test_filters.py +16 -0
  11. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/tests/test_retriever.py +113 -14
  12. qdrant_haystack-3.4.0/tests/test_document_store.py +0 -41
  13. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/.gitignore +0 -0
  14. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/LICENSE.txt +0 -0
  15. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/README.md +0 -0
  16. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/examples/embedding_retrieval.py +0 -0
  17. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
  18. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/document_stores/qdrant/converters.py +0 -0
  19. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +0 -0
  20. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/tests/__init__.py +0 -0
  21. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/tests/test_converters.py +0 -0
  22. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/tests/test_dict_converters.py +0 -0
  23. {qdrant_haystack-3.4.0 → qdrant_haystack-3.6.0}/tests/test_legacy_filters.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: qdrant-haystack
3
- Version: 3.4.0
3
+ Version: 3.6.0
4
4
  Summary: An integration of Qdrant ANN vector database backend with Haystack
5
5
  Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
6
6
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
@@ -9,6 +9,7 @@ Author-email: Kacper Łukawski <kacper.lukawski@qdrant.com>, Anush Shetty <anush
9
9
  License-Expression: Apache-2.0
10
10
  License-File: LICENSE.txt
11
11
  Classifier: Development Status :: 4 - Beta
12
+ Classifier: License :: OSI Approved :: Apache Software License
12
13
  Classifier: Programming Language :: Python
13
14
  Classifier: Programming Language :: Python :: 3.8
14
15
  Classifier: Programming Language :: Python :: 3.9
@@ -17,7 +17,7 @@ processors:
17
17
  - type: smart
18
18
  - type: crossref
19
19
  renderer:
20
- type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
20
+ type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
21
21
  excerpt: Qdrant integration for Haystack
22
22
  category_slug: integrations-api
23
23
  title: Qdrant
@@ -15,6 +15,7 @@ authors = [
15
15
  { name = "Anush Shetty", email = "anush.shetty@qdrant.com" },
16
16
  ]
17
17
  classifiers = [
18
+ "License :: OSI Approved :: Apache Software License",
18
19
  "Development Status :: 4 - Beta",
19
20
  "Programming Language :: Python",
20
21
  "Programming Language :: Python :: 3.8",
@@ -2,6 +2,6 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
- from .retriever import QdrantEmbeddingRetriever, QdrantSparseEmbeddingRetriever
5
+ from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseEmbeddingRetriever
6
6
 
7
- __all__ = ("QdrantEmbeddingRetriever", "QdrantSparseEmbeddingRetriever")
7
+ __all__ = ("QdrantEmbeddingRetriever", "QdrantSparseEmbeddingRetriever", "QdrantHybridRetriever")
@@ -1,8 +1,9 @@
1
- from typing import Any, Dict, List, Optional
1
+ from typing import Any, Dict, List, Optional, Union
2
2
 
3
3
  from haystack import Document, component, default_from_dict, default_to_dict
4
4
  from haystack.dataclasses.sparse_embedding import SparseEmbedding
5
5
  from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
6
+ from qdrant_client.http import models
6
7
 
7
8
 
8
9
  @component
@@ -12,6 +13,7 @@ class QdrantEmbeddingRetriever:
12
13
 
13
14
  Usage example:
14
15
  ```python
16
+ from haystack.dataclasses import Document
15
17
  from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
16
18
  from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
17
19
 
@@ -19,8 +21,10 @@ class QdrantEmbeddingRetriever:
19
21
  ":memory:",
20
22
  recreate_index=True,
21
23
  return_embedding=True,
22
- wait_result_from_api=True,
23
24
  )
25
+
26
+ document_store.write_documents([Document(content="test", embedding=[0.5]*768)])
27
+
24
28
  retriever = QdrantEmbeddingRetriever(document_store=document_store)
25
29
 
26
30
  # using a fake vector to keep the example simple
@@ -31,7 +35,7 @@ class QdrantEmbeddingRetriever:
31
35
  def __init__(
32
36
  self,
33
37
  document_store: QdrantDocumentStore,
34
- filters: Optional[Dict[str, Any]] = None,
38
+ filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
35
39
  top_k: int = 10,
36
40
  scale_score: bool = True,
37
41
  return_embedding: bool = False,
@@ -40,12 +44,12 @@ class QdrantEmbeddingRetriever:
40
44
  Create a QdrantEmbeddingRetriever component.
41
45
 
42
46
  :param document_store: An instance of QdrantDocumentStore.
43
- :param filters: A dictionary with filters to narrow down the search space. Default is None.
44
- :param top_k: The maximum number of documents to retrieve. Default is 10.
45
- :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
46
- :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is False.
47
+ :param filters: A dictionary with filters to narrow down the search space.
48
+ :param top_k: The maximum number of documents to retrieve.
49
+ :param scale_score: Whether to scale the scores of the retrieved documents or not.
50
+ :param return_embedding: Whether to return the embedding of the retrieved Documents.
47
51
 
48
- :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
52
+ :raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
49
53
  """
50
54
 
51
55
  if not isinstance(document_store, QdrantDocumentStore):
@@ -95,7 +99,7 @@ class QdrantEmbeddingRetriever:
95
99
  def run(
96
100
  self,
97
101
  query_embedding: List[float],
98
- filters: Optional[Dict[str, Any]] = None,
102
+ filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
99
103
  top_k: Optional[int] = None,
100
104
  scale_score: Optional[bool] = None,
101
105
  return_embedding: Optional[bool] = None,
@@ -112,7 +116,7 @@ class QdrantEmbeddingRetriever:
112
116
  The retrieved documents.
113
117
 
114
118
  """
115
- docs = self._document_store.query_by_embedding(
119
+ docs = self._document_store._query_by_embedding(
116
120
  query_embedding=query_embedding,
117
121
  filters=filters or self._filters,
118
122
  top_k=top_k or self._top_k,
@@ -132,14 +136,18 @@ class QdrantSparseEmbeddingRetriever:
132
136
  ```python
133
137
  from haystack_integrations.components.retrievers.qdrant import QdrantSparseEmbeddingRetriever
134
138
  from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
135
- from haystack.dataclasses.sparse_embedding import SparseEmbedding
139
+ from haystack.dataclasses import Document, SparseEmbedding
136
140
 
137
141
  document_store = QdrantDocumentStore(
138
142
  ":memory:",
143
+ use_sparse_embeddings=True,
139
144
  recreate_index=True,
140
145
  return_embedding=True,
141
- wait_result_from_api=True,
142
146
  )
147
+
148
+ doc = Document(content="test", sparse_embedding=SparseEmbedding(indices=[0, 3, 5], values=[0.1, 0.5, 0.12]))
149
+ document_store.write_documents([doc])
150
+
143
151
  retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
144
152
  sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
145
153
  retriever.run(query_sparse_embedding=sparse_embedding)
@@ -149,7 +157,7 @@ class QdrantSparseEmbeddingRetriever:
149
157
  def __init__(
150
158
  self,
151
159
  document_store: QdrantDocumentStore,
152
- filters: Optional[Dict[str, Any]] = None,
160
+ filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
153
161
  top_k: int = 10,
154
162
  scale_score: bool = True,
155
163
  return_embedding: bool = False,
@@ -158,12 +166,12 @@ class QdrantSparseEmbeddingRetriever:
158
166
  Create a QdrantSparseEmbeddingRetriever component.
159
167
 
160
168
  :param document_store: An instance of QdrantDocumentStore.
161
- :param filters: A dictionary with filters to narrow down the search space. Default is None.
162
- :param top_k: The maximum number of documents to retrieve. Default is 10.
163
- :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
164
- :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.
169
+ :param filters: A dictionary with filters to narrow down the search space.
170
+ :param top_k: The maximum number of documents to retrieve.
171
+ :param scale_score: Whether to scale the scores of the retrieved documents or not.
172
+ :param return_embedding: Whether to return the sparse embedding of the retrieved Documents.
165
173
 
166
- :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
174
+ :raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
167
175
  """
168
176
 
169
177
  if not isinstance(document_store, QdrantDocumentStore):
@@ -196,7 +204,7 @@ class QdrantSparseEmbeddingRetriever:
196
204
  return d
197
205
 
198
206
  @classmethod
199
- def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever":
207
+ def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseEmbeddingRetriever":
200
208
  """
201
209
  Deserializes the component from a dictionary.
202
210
 
@@ -213,7 +221,7 @@ class QdrantSparseEmbeddingRetriever:
213
221
  def run(
214
222
  self,
215
223
  query_sparse_embedding: SparseEmbedding,
216
- filters: Optional[Dict[str, Any]] = None,
224
+ filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
217
225
  top_k: Optional[int] = None,
218
226
  scale_score: Optional[bool] = None,
219
227
  return_embedding: Optional[bool] = None,
@@ -230,7 +238,7 @@ class QdrantSparseEmbeddingRetriever:
230
238
  The retrieved documents.
231
239
 
232
240
  """
233
- docs = self._document_store.query_by_sparse(
241
+ docs = self._document_store._query_by_sparse(
234
242
  query_sparse_embedding=query_sparse_embedding,
235
243
  filters=filters or self._filters,
236
244
  top_k=top_k or self._top_k,
@@ -239,3 +247,124 @@ class QdrantSparseEmbeddingRetriever:
239
247
  )
240
248
 
241
249
  return {"documents": docs}
250
+
251
+
252
+ @component
253
+ class QdrantHybridRetriever:
254
+ """
255
+ A component for retrieving documents from a QdrantDocumentStore using both dense and sparse vectors
256
+ and fusing the results using Reciprocal Rank Fusion.
257
+
258
+ Usage example:
259
+ ```python
260
+ from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
261
+ from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
262
+ from haystack.dataclasses import Document, SparseEmbedding
263
+
264
+ document_store = QdrantDocumentStore(
265
+ ":memory:",
266
+ use_sparse_embeddings=True,
267
+ recreate_index=True,
268
+ return_embedding=True,
269
+ wait_result_from_api=True,
270
+ )
271
+
272
+ doc = Document(content="test",
273
+ embedding=[0.5]*768,
274
+ sparse_embedding=SparseEmbedding(indices=[0, 3, 5], values=[0.1, 0.5, 0.12]))
275
+
276
+ document_store.write_documents([doc])
277
+
278
+ retriever = QdrantHybridRetriever(document_store=document_store)
279
+ embedding = [0.1]*768
280
+ sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
281
+ retriever.run(query_embedding=embedding, query_sparse_embedding=sparse_embedding)
282
+ ```
283
+ """
284
+
285
+ def __init__(
286
+ self,
287
+ document_store: QdrantDocumentStore,
288
+ filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
289
+ top_k: int = 10,
290
+ return_embedding: bool = False,
291
+ ):
292
+ """
293
+ Create a QdrantHybridRetriever component.
294
+
295
+ :param document_store: An instance of QdrantDocumentStore.
296
+ :param filters: A dictionary with filters to narrow down the search space.
297
+ :param top_k: The maximum number of documents to retrieve.
298
+ :param return_embedding: Whether to return the embeddings of the retrieved Documents.
299
+
300
+ :raises ValueError: If `document_store` is not an instance of `QdrantDocumentStore`.
301
+ """
302
+
303
+ if not isinstance(document_store, QdrantDocumentStore):
304
+ msg = "document_store must be an instance of QdrantDocumentStore"
305
+ raise ValueError(msg)
306
+
307
+ self._document_store = document_store
308
+ self._filters = filters
309
+ self._top_k = top_k
310
+ self._return_embedding = return_embedding
311
+
312
+ def to_dict(self) -> Dict[str, Any]:
313
+ """
314
+ Serializes the component to a dictionary.
315
+
316
+ :returns:
317
+ Dictionary with serialized data.
318
+ """
319
+ return default_to_dict(
320
+ self,
321
+ document_store=self._document_store.to_dict(),
322
+ filters=self._filters,
323
+ top_k=self._top_k,
324
+ return_embedding=self._return_embedding,
325
+ )
326
+
327
+ @classmethod
328
+ def from_dict(cls, data: Dict[str, Any]) -> "QdrantHybridRetriever":
329
+ """
330
+ Deserializes the component from a dictionary.
331
+
332
+ :param data:
333
+ Dictionary to deserialize from.
334
+ :returns:
335
+ Deserialized component.
336
+ """
337
+ document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
338
+ data["init_parameters"]["document_store"] = document_store
339
+ return default_from_dict(cls, data)
340
+
341
+ @component.output_types(documents=List[Document])
342
+ def run(
343
+ self,
344
+ query_embedding: List[float],
345
+ query_sparse_embedding: SparseEmbedding,
346
+ filters: Optional[Union[Dict[str, Any], models.Filter]] = None,
347
+ top_k: Optional[int] = None,
348
+ return_embedding: Optional[bool] = None,
349
+ ):
350
+ """
351
+ Run the Hybrid Retriever on the given input data.
352
+
353
+ :param query_embedding: Dense embedding of the query.
354
+ :param query_sparse_embedding: Sparse embedding of the query.
355
+ :param filters: A dictionary with filters to narrow down the search space.
356
+ :param top_k: The maximum number of documents to return.
357
+ :param return_embedding: Whether to return the embedding of the retrieved Documents.
358
+ :returns:
359
+ The retrieved documents.
360
+
361
+ """
362
+ docs = self._document_store._query_hybrid(
363
+ query_embedding=query_embedding,
364
+ query_sparse_embedding=query_sparse_embedding,
365
+ filters=filters or self._filters,
366
+ top_k=top_k or self._top_k,
367
+ return_embedding=return_embedding or self._return_embedding,
368
+ )
369
+
370
+ return {"documents": docs}
@@ -16,6 +16,7 @@ from haystack.utils.filters import convert as convert_legacy_filters
16
16
  from qdrant_client import grpc
17
17
  from qdrant_client.http import models as rest
18
18
  from qdrant_client.http.exceptions import UnexpectedResponse
19
+ from qdrant_client.hybrid.fusion import reciprocal_rank_fusion
19
20
  from tqdm import tqdm
20
21
 
21
22
  from .converters import (
@@ -65,7 +66,7 @@ class QdrantDocumentStore:
65
66
  https: Optional[bool] = None,
66
67
  api_key: Optional[Secret] = None,
67
68
  prefix: Optional[str] = None,
68
- timeout: Optional[float] = None,
69
+ timeout: Optional[int] = None,
69
70
  host: Optional[str] = None,
70
71
  path: Optional[str] = None,
71
72
  index: str = "Document",
@@ -95,23 +96,7 @@ class QdrantDocumentStore:
95
96
  scroll_size: int = 10_000,
96
97
  payload_fields_to_index: Optional[List[dict]] = None,
97
98
  ):
98
- super().__init__()
99
-
100
- metadata = metadata or {}
101
- self.client = qdrant_client.QdrantClient(
102
- location=location,
103
- url=url,
104
- port=port,
105
- grpc_port=grpc_port,
106
- prefer_grpc=prefer_grpc,
107
- https=https,
108
- api_key=api_key.resolve_value() if api_key else None,
109
- prefix=prefix,
110
- timeout=timeout,
111
- host=host,
112
- path=path,
113
- metadata=metadata,
114
- )
99
+ self._client = None
115
100
 
116
101
  # Store the Qdrant client specific attributes
117
102
  self.location = location
@@ -125,7 +110,7 @@ class QdrantDocumentStore:
125
110
  self.timeout = timeout
126
111
  self.host = host
127
112
  self.path = path
128
- self.metadata = metadata
113
+ self.metadata = metadata or {}
129
114
  self.api_key = api_key
130
115
 
131
116
  # Store the Qdrant collection specific attributes
@@ -142,12 +127,6 @@ class QdrantDocumentStore:
142
127
  self.recreate_index = recreate_index
143
128
  self.payload_fields_to_index = payload_fields_to_index
144
129
  self.use_sparse_embeddings = use_sparse_embeddings
145
-
146
- # Make sure the collection is properly set up
147
- self._set_up_collection(
148
- index, embedding_dim, recreate_index, similarity, use_sparse_embeddings, on_disk, payload_fields_to_index
149
- )
150
-
151
130
  self.embedding_dim = embedding_dim
152
131
  self.on_disk = on_disk
153
132
  self.content_field = content_field
@@ -161,6 +140,35 @@ class QdrantDocumentStore:
161
140
  self.write_batch_size = write_batch_size
162
141
  self.scroll_size = scroll_size
163
142
 
143
+ @property
144
+ def client(self):
145
+ if not self._client:
146
+ self._client = qdrant_client.QdrantClient(
147
+ location=self.location,
148
+ url=self.url,
149
+ port=self.port,
150
+ grpc_port=self.grpc_port,
151
+ prefer_grpc=self.prefer_grpc,
152
+ https=self.https,
153
+ api_key=self.api_key.resolve_value() if self.api_key else None,
154
+ prefix=self.prefix,
155
+ timeout=self.timeout,
156
+ host=self.host,
157
+ path=self.path,
158
+ metadata=self.metadata,
159
+ )
160
+ # Make sure the collection is properly set up
161
+ self._set_up_collection(
162
+ self.index,
163
+ self.embedding_dim,
164
+ self.recreate_index,
165
+ self.similarity,
166
+ self.use_sparse_embeddings,
167
+ self.on_disk,
168
+ self.payload_fields_to_index,
169
+ )
170
+ return self._client
171
+
164
172
  def count_documents(self) -> int:
165
173
  try:
166
174
  response = self.client.count(
@@ -175,13 +183,13 @@ class QdrantDocumentStore:
175
183
 
176
184
  def filter_documents(
177
185
  self,
178
- filters: Optional[Dict[str, Any]] = None,
186
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
179
187
  ) -> List[Document]:
180
- if filters and not isinstance(filters, dict):
181
- msg = "Filter must be a dictionary"
188
+ if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
189
+ msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
182
190
  raise ValueError(msg)
183
191
 
184
- if filters and "operator" not in filters:
192
+ if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
185
193
  filters = convert_legacy_filters(filters)
186
194
  return list(
187
195
  self.get_documents_generator(
@@ -259,7 +267,7 @@ class QdrantDocumentStore:
259
267
 
260
268
  def get_documents_generator(
261
269
  self,
262
- filters: Optional[Dict[str, Any]] = None,
270
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
263
271
  ) -> Generator[Document, None, None]:
264
272
  index = self.index
265
273
  qdrant_filters = convert_filters_to_qdrant(filters)
@@ -307,10 +315,10 @@ class QdrantDocumentStore:
307
315
  )
308
316
  return documents
309
317
 
310
- def query_by_sparse(
318
+ def _query_by_sparse(
311
319
  self,
312
320
  query_sparse_embedding: SparseEmbedding,
313
- filters: Optional[Dict[str, Any]] = None,
321
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
314
322
  top_k: int = 10,
315
323
  scale_score: bool = True,
316
324
  return_embedding: bool = False,
@@ -349,10 +357,10 @@ class QdrantDocumentStore:
349
357
  document.score = score
350
358
  return results
351
359
 
352
- def query_by_embedding(
360
+ def _query_by_embedding(
353
361
  self,
354
362
  query_embedding: List[float],
355
- filters: Optional[Dict[str, Any]] = None,
363
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
356
364
  top_k: int = 10,
357
365
  scale_score: bool = True,
358
366
  return_embedding: bool = False,
@@ -383,6 +391,86 @@ class QdrantDocumentStore:
383
391
  document.score = score
384
392
  return results
385
393
 
394
+ def _query_hybrid(
395
+ self,
396
+ query_embedding: List[float],
397
+ query_sparse_embedding: SparseEmbedding,
398
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
399
+ top_k: int = 10,
400
+ return_embedding: bool = False,
401
+ ) -> List[Document]:
402
+ """
403
+ Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
404
+
405
+ This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
406
+ Use the `QdrantHybridRetriever` instead.
407
+
408
+ :param query_embedding: Dense embedding of the query.
409
+ :param query_sparse_embedding: Sparse embedding of the query.
410
+ :param filters: Filters applied to the retrieved Documents.
411
+ :param top_k: Maximum number of Documents to return.
412
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
413
+
414
+ :returns: List of Documents that are most similar to `query_embedding` and `query_sparse_embedding`.
415
+
416
+ :raises QdrantStoreError:
417
+ If the Document Store was initialized with `use_sparse_embeddings=False`.
418
+ """
419
+
420
+ # This implementation is based on the code from the Python Qdrant client:
421
+ # https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
422
+ if not self.use_sparse_embeddings:
423
+ message = (
424
+ "You are trying to query using sparse embeddings, but the Document Store "
425
+ "was initialized with `use_sparse_embeddings=False`. "
426
+ )
427
+ raise QdrantStoreError(message)
428
+
429
+ qdrant_filters = convert_filters_to_qdrant(filters)
430
+
431
+ sparse_request = rest.SearchRequest(
432
+ vector=rest.NamedSparseVector(
433
+ name=SPARSE_VECTORS_NAME,
434
+ vector=rest.SparseVector(
435
+ indices=query_sparse_embedding.indices,
436
+ values=query_sparse_embedding.values,
437
+ ),
438
+ ),
439
+ filter=qdrant_filters,
440
+ limit=top_k,
441
+ with_payload=True,
442
+ with_vector=return_embedding,
443
+ )
444
+
445
+ dense_request = rest.SearchRequest(
446
+ vector=rest.NamedVector(
447
+ name=DENSE_VECTORS_NAME,
448
+ vector=query_embedding,
449
+ ),
450
+ filter=qdrant_filters,
451
+ limit=top_k,
452
+ with_payload=True,
453
+ with_vector=return_embedding,
454
+ )
455
+
456
+ try:
457
+ dense_request_response, sparse_request_response = self.client.search_batch(
458
+ collection_name=self.index, requests=[dense_request, sparse_request]
459
+ )
460
+ except Exception as e:
461
+ msg = "Error during hybrid search"
462
+ raise QdrantStoreError(msg) from e
463
+
464
+ try:
465
+ points = reciprocal_rank_fusion(responses=[dense_request_response, sparse_request_response], limit=top_k)
466
+ except Exception as e:
467
+ msg = "Error while applying Reciprocal Rank Fusion"
468
+ raise QdrantStoreError(msg) from e
469
+
470
+ results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
471
+
472
+ return results
473
+
386
474
  def _get_distance(self, similarity: str) -> rest.Distance:
387
475
  try:
388
476
  return self.SIMILARITY[similarity]
@@ -11,10 +11,11 @@ LOGICAL_OPERATORS = LOGICAL_OPERATORS.keys()
11
11
 
12
12
 
13
13
  def convert_filters_to_qdrant(
14
- filter_term: Optional[Union[List[dict], dict]] = None,
14
+ filter_term: Optional[Union[List[dict], dict, models.Filter]] = None,
15
15
  ) -> Optional[models.Filter]:
16
16
  """Converts Haystack filters to the format used by Qdrant."""
17
-
17
+ if isinstance(filter_term, models.Filter):
18
+ return filter_term
18
19
  if not filter_term:
19
20
  return None
20
21
 
@@ -0,0 +1,18 @@
1
+ import numpy as np
2
+ import pytest
3
+ from haystack.dataclasses import SparseEmbedding
4
+
5
+
6
+ @pytest.fixture(scope="session")
7
+ def generate_sparse_embedding():
8
+ """
9
+ This fixture returns a function that generates a random SparseEmbedding each time it is called.
10
+ """
11
+
12
+ def _generate_random_sparse_embedding():
13
+ random_indice_length = np.random.randint(3, 15)
14
+ indices = list(range(random_indice_length))
15
+ values = [np.random.random_sample() for _ in range(random_indice_length)]
16
+ return SparseEmbedding(indices=indices, values=values)
17
+
18
+ return _generate_random_sparse_embedding
@@ -0,0 +1,109 @@
1
+ from typing import List
2
+ from unittest.mock import patch
3
+
4
+ import pytest
5
+ from haystack import Document
6
+ from haystack.dataclasses import SparseEmbedding
7
+ from haystack.document_stores.errors import DuplicateDocumentError
8
+ from haystack.document_stores.types import DuplicatePolicy
9
+ from haystack.testing.document_store import (
10
+ CountDocumentsTest,
11
+ DeleteDocumentsTest,
12
+ WriteDocumentsTest,
13
+ _random_embeddings,
14
+ )
15
+ from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError
16
+
17
+
18
+ class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
19
+ @pytest.fixture
20
+ def document_store(self) -> QdrantDocumentStore:
21
+ return QdrantDocumentStore(
22
+ ":memory:",
23
+ recreate_index=True,
24
+ return_embedding=True,
25
+ wait_result_from_api=True,
26
+ use_sparse_embeddings=False,
27
+ )
28
+
29
+ def test_init_is_lazy(self):
30
+ with patch("haystack_integrations.document_stores.qdrant.document_store.qdrant_client") as mocked_qdrant:
31
+ QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
32
+ mocked_qdrant.assert_not_called()
33
+
34
+ def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
35
+ """
36
+ Assert that two lists of Documents are equal.
37
+ This is used in every test.
38
+ """
39
+
40
+ # Check that the lengths of the lists are the same
41
+ assert len(received) == len(expected)
42
+
43
+ # Check that the sets of document IDs are equal, regardless of order
44
+ assert {doc.id for doc in received} == {doc.id for doc in expected}
45
+
46
+ def test_write_documents(self, document_store: QdrantDocumentStore):
47
+ docs = [Document(id="1")]
48
+ assert document_store.write_documents(docs) == 1
49
+ with pytest.raises(DuplicateDocumentError):
50
+ document_store.write_documents(docs, DuplicatePolicy.FAIL)
51
+
52
+ def test_query_hybrid(self, generate_sparse_embedding):
53
+ document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
54
+
55
+ docs = []
56
+ for i in range(20):
57
+ docs.append(
58
+ Document(
59
+ content=f"doc {i}", sparse_embedding=generate_sparse_embedding(), embedding=_random_embeddings(768)
60
+ )
61
+ )
62
+
63
+ document_store.write_documents(docs)
64
+
65
+ sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
66
+ embedding = [0.1] * 768
67
+
68
+ results: List[Document] = document_store._query_hybrid(
69
+ query_sparse_embedding=sparse_embedding, query_embedding=embedding, top_k=10, return_embedding=True
70
+ )
71
+ assert len(results) == 10
72
+
73
+ for document in results:
74
+ assert document.sparse_embedding
75
+ assert document.embedding
76
+
77
+ def test_query_hybrid_fail_without_sparse_embedding(self, document_store):
78
+ sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
79
+ embedding = [0.1] * 768
80
+
81
+ with pytest.raises(QdrantStoreError):
82
+
83
+ document_store._query_hybrid(
84
+ query_sparse_embedding=sparse_embedding,
85
+ query_embedding=embedding,
86
+ )
87
+
88
+ def test_query_hybrid_search_batch_failure(self):
89
+ document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
90
+
91
+ sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
92
+ embedding = [0.1] * 768
93
+
94
+ with patch.object(document_store.client, "search_batch", side_effect=Exception("search_batch error")):
95
+
96
+ with pytest.raises(QdrantStoreError):
97
+ document_store._query_hybrid(query_sparse_embedding=sparse_embedding, query_embedding=embedding)
98
+
99
+ @patch("haystack_integrations.document_stores.qdrant.document_store.reciprocal_rank_fusion")
100
+ def test_query_hybrid_reciprocal_rank_fusion_failure(self, mocked_fusion):
101
+ document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
102
+
103
+ sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
104
+ embedding = [0.1] * 768
105
+
106
+ mocked_fusion.side_effect = Exception("reciprocal_rank_fusion error")
107
+
108
+ with pytest.raises(QdrantStoreError):
109
+ document_store._query_hybrid(query_sparse_embedding=sparse_embedding, query_embedding=embedding)
@@ -5,6 +5,7 @@ from haystack import Document
5
5
  from haystack.testing.document_store import FilterDocumentsTest
6
6
  from haystack.utils.filters import FilterError
7
7
  from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
8
+ from qdrant_client.http import models
8
9
 
9
10
 
10
11
  class TestQdrantStoreBaseTests(FilterDocumentsTest):
@@ -17,6 +18,21 @@ class TestQdrantStoreBaseTests(FilterDocumentsTest):
17
18
  wait_result_from_api=True,
18
19
  )
19
20
 
21
+ def test_filter_documents_with_qdrant_filters(self, document_store, filterable_docs):
22
+ document_store.write_documents(filterable_docs)
23
+ result = document_store.filter_documents(
24
+ filters=models.Filter(
25
+ must_not=[
26
+ models.FieldCondition(key="meta.number", match=models.MatchValue(value=100)),
27
+ models.FieldCondition(key="meta.name", match=models.MatchValue(value="name_0")),
28
+ ]
29
+ )
30
+ )
31
+ self.assert_documents_are_equal(
32
+ result,
33
+ [d for d in filterable_docs if (d.meta.get("number") != 100 and d.meta.get("name") != "name_0")],
34
+ )
35
+
20
36
  def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
21
37
  """
22
38
  Assert that two lists of Documents are equal.
@@ -1,6 +1,6 @@
1
1
  from typing import List
2
+ from unittest.mock import Mock
2
3
 
3
- import numpy as np
4
4
  from haystack.dataclasses import Document, SparseEmbedding
5
5
  from haystack.testing.document_store import (
6
6
  FilterableDocsFixtureMixin,
@@ -8,6 +8,7 @@ from haystack.testing.document_store import (
8
8
  )
9
9
  from haystack_integrations.components.retrievers.qdrant import (
10
10
  QdrantEmbeddingRetriever,
11
+ QdrantHybridRetriever,
11
12
  QdrantSparseEmbeddingRetriever,
12
13
  )
13
14
  from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
@@ -222,23 +223,12 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
222
223
  assert retriever._scale_score is False
223
224
  assert retriever._return_embedding is True
224
225
 
225
- def _generate_mocked_sparse_embedding(self, n):
226
- list_of_sparse_vectors = []
227
- for _ in range(n):
228
- random_indice_length = np.random.randint(3, 15)
229
- data = {
230
- "indices": list(range(random_indice_length)),
231
- "values": [np.random.random_sample() for _ in range(random_indice_length)],
232
- }
233
- list_of_sparse_vectors.append(data)
234
- return list_of_sparse_vectors
235
-
236
- def test_run(self, filterable_docs: List[Document]):
226
+ def test_run(self, filterable_docs: List[Document], generate_sparse_embedding):
237
227
  document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
238
228
 
239
229
  # Add fake sparse embedding to documents
240
230
  for doc in filterable_docs:
241
- doc.sparse_embedding = SparseEmbedding.from_dict(self._generate_mocked_sparse_embedding(1)[0])
231
+ doc.sparse_embedding = generate_sparse_embedding()
242
232
 
243
233
  document_store.write_documents(filterable_docs)
244
234
  retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
@@ -252,3 +242,112 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
252
242
 
253
243
  for document in results:
254
244
  assert document.sparse_embedding
245
+
246
+
247
+ class TestQdrantHybridRetriever:
248
+ def test_init_default(self):
249
+ document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True)
250
+ retriever = QdrantHybridRetriever(document_store=document_store)
251
+
252
+ assert retriever._document_store == document_store
253
+ assert retriever._filters is None
254
+ assert retriever._top_k == 10
255
+ assert retriever._return_embedding is False
256
+
257
+ def test_to_dict(self):
258
+ document_store = QdrantDocumentStore(location=":memory:", index="test")
259
+ retriever = QdrantHybridRetriever(document_store=document_store, top_k=5, return_embedding=True)
260
+ res = retriever.to_dict()
261
+ assert res == {
262
+ "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever",
263
+ "init_parameters": {
264
+ "document_store": {
265
+ "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
266
+ "init_parameters": {
267
+ "location": ":memory:",
268
+ "url": None,
269
+ "port": 6333,
270
+ "grpc_port": 6334,
271
+ "prefer_grpc": False,
272
+ "https": None,
273
+ "api_key": None,
274
+ "prefix": None,
275
+ "timeout": None,
276
+ "host": None,
277
+ "path": None,
278
+ "index": "test",
279
+ "embedding_dim": 768,
280
+ "on_disk": False,
281
+ "content_field": "content",
282
+ "name_field": "name",
283
+ "embedding_field": "embedding",
284
+ "use_sparse_embeddings": False,
285
+ "similarity": "cosine",
286
+ "return_embedding": False,
287
+ "progress_bar": True,
288
+ "duplicate_documents": "overwrite",
289
+ "recreate_index": False,
290
+ "shard_number": None,
291
+ "replication_factor": None,
292
+ "write_consistency_factor": None,
293
+ "on_disk_payload": None,
294
+ "hnsw_config": None,
295
+ "optimizers_config": None,
296
+ "wal_config": None,
297
+ "quantization_config": None,
298
+ "init_from": None,
299
+ "wait_result_from_api": True,
300
+ "metadata": {},
301
+ "write_batch_size": 100,
302
+ "scroll_size": 10000,
303
+ "payload_fields_to_index": None,
304
+ },
305
+ },
306
+ "filters": None,
307
+ "top_k": 5,
308
+ "return_embedding": True,
309
+ },
310
+ }
311
+
312
+ def test_from_dict(self):
313
+ data = {
314
+ "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever",
315
+ "init_parameters": {
316
+ "document_store": {
317
+ "init_parameters": {"location": ":memory:", "index": "test"},
318
+ "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
319
+ },
320
+ "filters": None,
321
+ "top_k": 5,
322
+ "return_embedding": True,
323
+ },
324
+ }
325
+ retriever = QdrantHybridRetriever.from_dict(data)
326
+ assert isinstance(retriever._document_store, QdrantDocumentStore)
327
+ assert retriever._document_store.index == "test"
328
+ assert retriever._filters is None
329
+ assert retriever._top_k == 5
330
+ assert retriever._return_embedding
331
+
332
+ def test_run(self):
333
+ mock_store = Mock(spec=QdrantDocumentStore)
334
+ sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
335
+ mock_store._query_hybrid.return_value = [
336
+ Document(content="Test doc", embedding=[0.1, 0.2], sparse_embedding=sparse_embedding)
337
+ ]
338
+
339
+ retriever = QdrantHybridRetriever(document_store=mock_store)
340
+ res = retriever.run(
341
+ query_embedding=[0.5, 0.7], query_sparse_embedding=SparseEmbedding(indices=[0, 5], values=[0.1, 0.7])
342
+ )
343
+
344
+ call_args = mock_store._query_hybrid.call_args
345
+ assert call_args[1]["query_embedding"] == [0.5, 0.7]
346
+ assert call_args[1]["query_sparse_embedding"].indices == [0, 5]
347
+ assert call_args[1]["query_sparse_embedding"].values == [0.1, 0.7]
348
+ assert call_args[1]["top_k"] == 10
349
+ assert call_args[1]["return_embedding"] is False
350
+
351
+ assert res["documents"][0].content == "Test doc"
352
+ assert res["documents"][0].embedding == [0.1, 0.2]
353
+ assert res["documents"][0].sparse_embedding == sparse_embedding
@@ -1,41 +0,0 @@
1
- from typing import List
2
-
3
- import pytest
4
- from haystack import Document
5
- from haystack.document_stores.errors import DuplicateDocumentError
6
- from haystack.document_stores.types import DuplicatePolicy
7
- from haystack.testing.document_store import (
8
- CountDocumentsTest,
9
- DeleteDocumentsTest,
10
- WriteDocumentsTest,
11
- )
12
- from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
13
-
14
-
15
- class TestQdrantStoreBaseTests(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
16
- @pytest.fixture
17
- def document_store(self) -> QdrantDocumentStore:
18
- return QdrantDocumentStore(
19
- ":memory:",
20
- recreate_index=True,
21
- return_embedding=True,
22
- wait_result_from_api=True,
23
- )
24
-
25
- def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
26
- """
27
- Assert that two lists of Documents are equal.
28
- This is used in every test.
29
- """
30
-
31
- # Check that the lengths of the lists are the same
32
- assert len(received) == len(expected)
33
-
34
- # Check that the sets are equal, meaning the content and IDs match regardless of order
35
- assert {doc.id for doc in received} == {doc.id for doc in expected}
36
-
37
- def test_write_documents(self, document_store: QdrantDocumentStore):
38
- docs = [Document(id="1")]
39
- assert document_store.write_documents(docs) == 1
40
- with pytest.raises(DuplicateDocumentError):
41
- document_store.write_documents(docs, DuplicatePolicy.FAIL)