qdrant-haystack 6.0.0__py3-none-any.whl → 10.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
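Editor's note: the headline change across this version range is an asynchronous API surface. Most public methods gain `*_async` counterparts backed by a lazily created `qdrant_client.AsyncQdrantClient`. A minimal usage sketch; the import path `haystack_integrations.document_stores.qdrant` and the in-memory setup are assumptions, not shown in this diff:

```python
import asyncio

from haystack.dataclasses import Document
# assumed import path; not part of this diff
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


async def main() -> None:
    store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
    # new in this range: async counterparts of the core operations
    await store.write_documents_async([Document(content="hello", embedding=[0.1] * 5)])
    print(await store.count_documents_async())


asyncio.run(main())
```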
@@ -1,17 +1,16 @@
  import inspect
- import logging
+ from collections.abc import AsyncGenerator, Generator
  from itertools import islice
- from typing import Any, ClassVar, Dict, Generator, List, Optional, Set, Union
+ from typing import Any, ClassVar, cast

- import numpy as np
  import qdrant_client
- from haystack import default_from_dict, default_to_dict
+ from haystack import default_from_dict, default_to_dict, logging
  from haystack.dataclasses import Document
  from haystack.dataclasses.sparse_embedding import SparseEmbedding
  from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
  from haystack.document_stores.types import DuplicatePolicy
  from haystack.utils import Secret, deserialize_secrets_inplace
- from qdrant_client import grpc
+ from numpy import exp
  from qdrant_client.http import models as rest
  from qdrant_client.http.exceptions import UnexpectedResponse
  from tqdm import tqdm
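Two import swaps above are worth flagging: `logging` now comes from `haystack` rather than the standard library, and `import numpy as np` shrinks to `from numpy import exp`. The Haystack logger accepts `str.format`-style placeholders filled from keyword arguments, which is exactly how the new code logs later in this diff (see `update_by_filter`). A small sketch of that pattern:

```python
from haystack import logging

logger = logging.getLogger(__name__)

# keyword arguments fill the {placeholders} and travel as structured fields
logger.info("Updated {n_docs} documents in collection '{name}'.", n_docs=3, name="Document")
```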
@@ -27,15 +26,21 @@ from .filters import convert_filters_to_qdrant

  logger = logging.getLogger(__name__)

+ # Default group size to apply when using group_by
+ # - Our methods use None as the default for optional group_size parameter.
+ # - Qdrant expects an integer and internally defaults to 3 when performing grouped queries.
+ # - When group_by is specified but group_size is None, we use this value instead of passing None.
+ DEFAULT_GROUP_SIZE = 3
+

  class QdrantStoreError(DocumentStoreError):
      pass


- FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
+ FilterType = dict[str, dict[str, Any] | list[Any] | str | int | float | bool]


- def get_batches_from_generator(iterable, n):
+ def get_batches_from_generator(iterable: list, n: int) -> Generator:
      """
      Batch elements of an iterable into fixed-length chunks or blocks.
      """
@@ -48,9 +53,8 @@ def get_batches_from_generator(iterable, n):

  class QdrantDocumentStore:
      """
-     QdrantDocumentStore is a Document Store for Qdrant.
-     It can be used with any Qdrant instance: in-memory, disk-persisted, Docker-based,
-     and Qdrant Cloud Cluster deployments.
+     A QdrantDocumentStore implementation that you can use with any Qdrant instance: in-memory, disk-persisted,
+     Docker-based, and Qdrant Cloud Cluster deployments.

      Usage example by creating an in-memory instance:

@@ -60,7 +64,8 @@ class QdrantDocumentStore:

      document_store = QdrantDocumentStore(
          ":memory:",
-         recreate_index=True
+         recreate_index=True,
+         embedding_dim=5
      )
      document_store.write_documents([
          Document(content="This is first", embedding=[0.0]*5),
@@ -85,7 +90,7 @@ class QdrantDocumentStore:
      ```
      """

-     SIMILARITY: ClassVar[Dict[str, str]] = {
+     SIMILARITY: ClassVar[dict[str, rest.Distance]] = {
          "cosine": rest.Distance.COSINE,
          "dot_product": rest.Distance.DOT,
          "l2": rest.Distance.EUCLID,
@@ -93,17 +98,17 @@ class QdrantDocumentStore:

      def __init__(
          self,
-         location: Optional[str] = None,
-         url: Optional[str] = None,
+         location: str | None = None,
+         url: str | None = None,
          port: int = 6333,
          grpc_port: int = 6334,
          prefer_grpc: bool = False,
-         https: Optional[bool] = None,
-         api_key: Optional[Secret] = None,
-         prefix: Optional[str] = None,
-         timeout: Optional[int] = None,
-         host: Optional[str] = None,
-         path: Optional[str] = None,
+         https: bool | None = None,
+         api_key: Secret | None = None,
+         prefix: str | None = None,
+         timeout: int | None = None,
+         host: str | None = None,
+         path: str | None = None,
          force_disable_check_same_thread: bool = False,
          index: str = "Document",
          embedding_dim: int = 768,
@@ -114,24 +119,25 @@ class QdrantDocumentStore:
          return_embedding: bool = False,
          progress_bar: bool = True,
          recreate_index: bool = False,
-         shard_number: Optional[int] = None,
-         replication_factor: Optional[int] = None,
-         write_consistency_factor: Optional[int] = None,
-         on_disk_payload: Optional[bool] = None,
-         hnsw_config: Optional[dict] = None,
-         optimizers_config: Optional[dict] = None,
-         wal_config: Optional[dict] = None,
-         quantization_config: Optional[dict] = None,
-         init_from: Optional[dict] = None,
+         shard_number: int | None = None,
+         replication_factor: int | None = None,
+         write_consistency_factor: int | None = None,
+         on_disk_payload: bool | None = None,
+         hnsw_config: dict | None = None,
+         optimizers_config: dict | None = None,
+         wal_config: dict | None = None,
+         quantization_config: dict | None = None,
          wait_result_from_api: bool = True,
-         metadata: Optional[dict] = None,
+         metadata: dict | None = None,
          write_batch_size: int = 100,
          scroll_size: int = 10_000,
-         payload_fields_to_index: Optional[List[dict]] = None,
-     ):
+         payload_fields_to_index: list[dict] | None = None,
+     ) -> None:
          """
+         Initializes a QdrantDocumentStore.
+
          :param location:
-             If `memory` - use in-memory Qdrant instance.
+             If `":memory:"` - use in-memory Qdrant instance.
              If `str` - use it as a URL parameter.
              If `None` - use default values for host and port.
          :param url:
@@ -165,7 +171,7 @@ class QdrantDocumentStore:
              Dimension of the embeddings.
          :param on_disk:
              Whether to store the collection on disk.
-         :param use_sparse_embedding:
+         :param use_sparse_embeddings:
              If set to `True`, enables support for sparse embeddings.
          :param sparse_idf:
              If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
@@ -202,8 +208,6 @@ class QdrantDocumentStore:
              Params for Write-Ahead-Log.
          :param quantization_config:
              Params for quantization. If `None`, quantization will be disabled.
-         :param init_from:
-             Use data stored in another collection to initialize this collection.
          :param wait_result_from_api:
              Whether to wait for the result from the API after each request.
          :param metadata:
@@ -216,7 +220,8 @@ class QdrantDocumentStore:
              List of payload fields to index.
          """

-         self._client = None
+         self._client: qdrant_client.QdrantClient | None = None
+         self._async_client: qdrant_client.AsyncQdrantClient | None = None

          # Store the Qdrant client specific attributes
          self.location = location
@@ -232,7 +237,6 @@ class QdrantDocumentStore:
          self.path = path
          self.force_disable_check_same_thread = force_disable_check_same_thread
          self.metadata = metadata or {}
-         self.api_key = api_key

          # Store the Qdrant collection specific attributes
          self.shard_number = shard_number
@@ -243,7 +247,6 @@ class QdrantDocumentStore:
          self.optimizers_config = optimizers_config
          self.wal_config = wal_config
          self.quantization_config = quantization_config
-         self.init_from = init_from
          self.wait_result_from_api = wait_result_from_api
          self.recreate_index = recreate_index
          self.payload_fields_to_index = payload_fields_to_index
@@ -258,24 +261,11 @@ class QdrantDocumentStore:
          self.write_batch_size = write_batch_size
          self.scroll_size = scroll_size

-     @property
-     def client(self):
-         if not self._client:
-             self._client = qdrant_client.QdrantClient(
-                 location=self.location,
-                 url=self.url,
-                 port=self.port,
-                 grpc_port=self.grpc_port,
-                 prefer_grpc=self.prefer_grpc,
-                 https=self.https,
-                 api_key=self.api_key.resolve_value() if self.api_key else None,
-                 prefix=self.prefix,
-                 timeout=self.timeout,
-                 host=self.host,
-                 path=self.path,
-                 metadata=self.metadata,
-                 force_disable_check_same_thread=self.force_disable_check_same_thread,
-             )
+     def _initialize_client(self) -> None:
+         if self._client is None:
+             client_params = self._prepare_client_params()
+             # This step adds the api-key and User-Agent to metadata
+             self._client = qdrant_client.QdrantClient(**client_params)
              # Make sure the collection is properly set up
              self._set_up_collection(
                  self.index,
@@ -287,14 +277,52 @@ class QdrantDocumentStore:
                  self.on_disk,
                  self.payload_fields_to_index,
              )
-         return self._client
+
+     async def _initialize_async_client(self) -> None:
+         """
+         Initializes the asynchronous Qdrant client, if it is not already set up.
+         """
+         if self._async_client is None:
+             client_params = self._prepare_client_params()
+             self._async_client = qdrant_client.AsyncQdrantClient(
+                 **client_params,
+             )
+             await self._set_up_collection_async(
+                 self.index,
+                 self.embedding_dim,
+                 self.recreate_index,
+                 self.similarity,
+                 self.use_sparse_embeddings,
+                 self.sparse_idf,
+                 self.on_disk,
+                 self.payload_fields_to_index,
+             )

      def count_documents(self) -> int:
          """
          Returns the number of documents present in the Document Store.
          """
+         self._initialize_client()
+         assert self._client is not None
+         try:
+             response = self._client.count(
+                 collection_name=self.index,
+             )
+             return response.count
+         except (UnexpectedResponse, ValueError):
+             # Qdrant local raises ValueError if the collection is not found, but
+             # with the remote server UnexpectedResponse is raised. Until that's unified,
+             # we need to catch both.
+             return 0
+
+     async def count_documents_async(self) -> int:
+         """
+         Asynchronously returns the number of documents present in the document store.
+         """
+         await self._initialize_async_client()
+         assert self._async_client is not None
          try:
-             response = self.client.count(
+             response = await self._async_client.count(
                  collection_name=self.index,
              )
              return response.count
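The `self._initialize_client()` / `assert self._client is not None` pair that now opens every public method is a lazy-initialization pattern: the assert exists so static type checkers can narrow the `Optional` attribute, not to guard at runtime. A self-contained sketch of the same pattern with a dummy client; all names below are illustrative, not from the package:

```python
class DummyClient:
    def count(self) -> int:
        return 0


class LazyStore:
    def __init__(self) -> None:
        self._client: DummyClient | None = None

    def _initialize_client(self) -> None:
        if self._client is None:
            self._client = DummyClient()  # created on first use, not in __init__

    def count_documents(self) -> int:
        self._initialize_client()
        assert self._client is not None  # narrows DummyClient | None to DummyClient
        return self._client.count()
```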
@@ -306,8 +334,8 @@ class QdrantDocumentStore:

      def filter_documents(
          self,
-         filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
-     ) -> List[Document]:
+         filters: dict[str, Any] | rest.Filter | None = None,
+     ) -> list[Document]:
          """
          Returns the documents that match the provided filters.

@@ -317,22 +345,32 @@ class QdrantDocumentStore:
          :param filters: The filters to apply to the document list.
          :returns: A list of documents that match the given filters.
          """
-         if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
-             msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
-             raise ValueError(msg)
+         # No need to initialize client here as _get_documents_generator
+         # will handle client initialization internally

-         if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
-             msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
-             raise ValueError(msg)
+         QdrantDocumentStore._validate_filters(filters)
          return list(
-             self.get_documents_generator(
+             self._get_documents_generator(
                  filters,
              )
          )

+     async def filter_documents_async(
+         self,
+         filters: dict[str, Any] | rest.Filter | None = None,
+     ) -> list[Document]:
+         """
+         Asynchronously returns the documents that match the provided filters.
+         """
+         # No need to initialize client here as _get_documents_generator_async
+         # will handle client initialization internally
+
+         QdrantDocumentStore._validate_filters(filters)
+         return [doc async for doc in self._get_documents_generator_async(filters)]
+
      def write_documents(
          self,
-         documents: List[Document],
+         documents: list[Document],
          policy: DuplicatePolicy = DuplicatePolicy.FAIL,
      ) -> int:
          """
@@ -348,13 +386,14 @@ class QdrantDocumentStore:

          :returns: The number of documents written to the document store.
          """
+
+         self._initialize_client()
+         assert self._client is not None
+
          for doc in documents:
              if not isinstance(doc, Document):
                  msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
                  raise ValueError(msg)
-         self._set_up_collection(
-             self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
-         )

          if len(documents) == 0:
              logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
@@ -362,7 +401,6 @@ class QdrantDocumentStore:

          document_objects = self._handle_duplicate_documents(
              documents=documents,
-             index=self.index,
              policy=policy,
          )

@@ -374,7 +412,61 @@ class QdrantDocumentStore:
                      use_sparse_embeddings=self.use_sparse_embeddings,
                  )

-                 self.client.upsert(
+                 self._client.upsert(
+                     collection_name=self.index,
+                     points=batch,
+                     wait=self.wait_result_from_api,
+                 )
+
+                 progress_bar.update(self.write_batch_size)
+         return len(document_objects)
+
+     async def write_documents_async(
+         self,
+         documents: list[Document],
+         policy: DuplicatePolicy = DuplicatePolicy.FAIL,
+     ) -> int:
+         """
+         Asynchronously writes documents to Qdrant using the specified policy.
+         The QdrantDocumentStore can handle duplicate documents based on the given policy.
+         The available policies are:
+         - `FAIL`: The operation will raise an error if any document already exists.
+         - `OVERWRITE`: Existing documents will be overwritten with the new ones.
+         - `SKIP`: Existing documents will be skipped, and only new documents will be added.
+
+         :param documents: A list of Document objects to write to Qdrant.
+         :param policy: The policy for handling duplicate documents.
+
+         :returns: The number of documents written to the document store.
+         """
+
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         for doc in documents:
+             if not isinstance(doc, Document):
+                 msg = f"""DocumentStore.write_documents_async() expects a list of
+                     Documents but got an element of {type(doc)}."""
+                 raise ValueError(msg)
+
+         if len(documents) == 0:
+             logger.warning("Calling QdrantDocumentStore.write_documents_async() with empty list")
+             return 0
+
+         document_objects = await self._handle_duplicate_documents_async(
+             documents=documents,
+             policy=policy,
+         )
+
+         batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
+         with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
+             for document_batch in batched_documents:
+                 batch = convert_haystack_documents_to_qdrant_points(
+                     document_batch,
+                     use_sparse_embeddings=self.use_sparse_embeddings,
+                 )
+
+                 await self._async_client.upsert(
                      collection_name=self.index,
                      points=batch,
                      wait=self.wait_result_from_api,
@@ -383,17 +475,20 @@ class QdrantDocumentStore:
                  progress_bar.update(self.write_batch_size)
          return len(document_objects)

-     def delete_documents(self, document_ids: List[str]) -> None:
+     def delete_documents(self, document_ids: list[str]) -> None:
          """
          Deletes documents that match the provided `document_ids` from the document store.

          :param document_ids: the document ids to delete
          """
-         ids = [convert_id(_id) for _id in document_ids]
+
+         self._initialize_client()
+         assert self._client is not None
+
          try:
-             self.client.delete(
+             self._client.delete(
                  collection_name=self.index,
-                 points_selector=ids,
+                 points_selector=rest.PointIdsList(points=[convert_id(_id) for _id in document_ids]),
                  wait=self.wait_result_from_api,
              )
          except KeyError:
@@ -401,149 +496,987 @@ class QdrantDocumentStore:
                  "Called QdrantDocumentStore.delete_documents() on a non-existing ID",
              )

-     @classmethod
-     def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore":
+     async def delete_documents_async(self, document_ids: list[str]) -> None:
          """
-         Deserializes the component from a dictionary.
+         Asynchronously deletes documents that match the provided `document_ids` from the document store.

-         :param data:
-             The dictionary to deserialize from.
-         :returns:
-             The deserialized component.
+         :param document_ids: the document ids to delete
          """
-         deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
-         return default_from_dict(cls, data)

-     def to_dict(self) -> Dict[str, Any]:
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         try:
+             await self._async_client.delete(
+                 collection_name=self.index,
+                 points_selector=rest.PointIdsList(points=[convert_id(_id) for _id in document_ids]),
+                 wait=self.wait_result_from_api,
+             )
+         except KeyError:
+             logger.warning(
+                 "Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
+             )
+
+     def delete_by_filter(self, filters: dict[str, Any]) -> int:
          """
-         Serializes the component to a dictionary.
+         Deletes all documents that match the provided filters.
+
+         :param filters: The filters to apply to select documents for deletion.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)

          :returns:
-             Dictionary with serialized data.
+             The number of documents deleted.
          """
-         params = inspect.signature(self.__init__).parameters # type: ignore
-         # All the __init__ params must be set as attributes
-         # Set as init_parms without default values
-         init_params = {k: getattr(self, k) for k in params}
-         init_params["api_key"] = self.api_key.to_dict() if self.api_key else None
-         return default_to_dict(
-             self,
-             **init_params,
-         )
+         self._initialize_client()
+         assert self._client is not None

-     def get_documents_generator(
-         self,
-         filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
-     ) -> Generator[Document, None, None]:
+         try:
+             qdrant_filter = convert_filters_to_qdrant(filters)
+             if qdrant_filter is None:
+                 return 0
+
+             count_response = self._client.count(
+                 collection_name=self.index,
+                 count_filter=qdrant_filter,
+             )
+             deleted_count = count_response.count
+
+             self._client.delete(
+                 collection_name=self.index,
+                 points_selector=rest.FilterSelector(filter=qdrant_filter),
+                 wait=self.wait_result_from_api,
+             )
+             return deleted_count
+
+         except Exception as e:
+             msg = f"Failed to delete documents by filter from Qdrant: {e!s}"
+             raise QdrantStoreError(msg) from e
+
+
555
+ async def delete_by_filter_async(self, filters: dict[str, Any]) -> int:
438
556
  """
439
- Returns a generator that yields documents from Qdrant based on the provided filters.
557
+ Asynchronously deletes all documents that match the provided filters.
440
558
 
441
- :param filters: Filters applied to the retrieved documents.
442
- :returns: A generator that yields documents retrieved from Qdrant.
559
+ :param filters: The filters to apply to select documents for deletion.
560
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
561
+
562
+ :returns:
563
+ The number of documents deleted.
443
564
  """
565
+ await self._initialize_async_client()
566
+ assert self._async_client is not None
444
567
 
445
- index = self.index
446
- qdrant_filters = convert_filters_to_qdrant(filters)
568
+ try:
569
+ qdrant_filter = convert_filters_to_qdrant(filters)
570
+ if qdrant_filter is None:
571
+ return 0
447
572
 
448
- next_offset = None
449
- stop_scrolling = False
450
- while not stop_scrolling:
451
- records, next_offset = self.client.scroll(
452
- collection_name=index,
453
- scroll_filter=qdrant_filters,
454
- limit=self.scroll_size,
455
- offset=next_offset,
456
- with_payload=True,
457
- with_vectors=True,
573
+ count_response = await self._async_client.count(
574
+ collection_name=self.index,
575
+ count_filter=qdrant_filter,
458
576
  )
459
- stop_scrolling = next_offset is None or (
460
- isinstance(next_offset, grpc.PointId) and next_offset.num == 0 and next_offset.uuid == ""
577
+ deleted_count = count_response.count
578
+
579
+ await self._async_client.delete(
580
+ collection_name=self.index,
581
+ points_selector=rest.FilterSelector(filter=qdrant_filter),
582
+ wait=self.wait_result_from_api,
461
583
  )
584
+ return deleted_count
462
585
 
463
- for record in records:
464
- yield convert_qdrant_point_to_haystack_document(
465
- record, use_sparse_embeddings=self.use_sparse_embeddings
466
- )
586
+ except Exception as e:
587
+ msg = f"Failed to delete documents by filter from Qdrant: {e!s}"
588
+ raise QdrantStoreError(msg) from e
467
589
 
468
- def get_documents_by_id(
469
- self,
470
- ids: List[str],
471
- index: Optional[str] = None,
472
- ) -> List[Document]:
590
+ @staticmethod
591
+ def _check_stop_scrolling(next_offset: Any) -> bool:
473
592
  """
474
- Retrieves documents from Qdrant by their IDs.
593
+ Checks if scrolling should stop based on the next_offset value.
475
594
 
476
- :param ids:
477
- A list of document IDs to retrieve.
478
- :param index:
479
- The name of the index to retrieve documents from.
480
- :returns:
481
- A list of documents.
595
+ :param next_offset: The offset returned from the scroll operation.
596
+ :returns: True if scrolling should stop, False otherwise.
482
597
  """
483
- index = index or self.index
598
+ return next_offset is None or (
599
+ hasattr(next_offset, "num")
600
+ and hasattr(next_offset, "uuid")
601
+ and next_offset.num == 0
602
+ and next_offset.uuid == ""
603
+ )
484
604
 
485
- documents: List[Document] = []
605
+ @staticmethod
606
+ def _metadata_fields_info_from_schema(payload_schema: dict[str, Any]) -> dict[str, str]:
607
+ """Build field name -> type dict from Qdrant payload_schema. Used by get_metadata_fields_info (sync/async)."""
608
+ fields_info: dict[str, str] = {}
609
+ for field_name, field_config in payload_schema.items():
610
+ if hasattr(field_config, "data_type"):
611
+ fields_info[field_name] = str(field_config.data_type)
612
+ else:
613
+ fields_info[field_name] = "unknown"
614
+ return fields_info
615
+
616
+ @staticmethod
617
+ def _process_records_min_max(
618
+ records: list[Any], metadata_field: str, min_value: Any, max_value: Any
619
+ ) -> tuple[Any, Any]:
620
+ """Update min/max from a batch of Qdrant records. Used by get_metadata_field_min_max (sync/async)."""
621
+ for record in records:
622
+ if record.payload and "meta" in record.payload:
623
+ meta = record.payload["meta"]
624
+ if metadata_field in meta:
625
+ value = meta[metadata_field]
626
+ if value is not None:
627
+ if min_value is None or value < min_value:
628
+ min_value = value
629
+ if max_value is None or value > max_value:
630
+ max_value = value
631
+ return min_value, max_value
632
+
633
+ @staticmethod
634
+ def _process_records_count_unique(
635
+ records: list[Any], metadata_fields: list[str], unique_values_by_field: dict[str, set[Any]]
636
+ ) -> None:
637
+ """
638
+ Update unique_values_by_field from a batch of Qdrant records.
486
639
 
487
- ids = [convert_id(_id) for _id in ids]
488
- records = self.client.retrieve(
489
- collection_name=index,
490
- ids=ids,
491
- with_payload=True,
492
- with_vectors=True,
640
+ Used by count_unique_metadata_by_filter (sync/async).
641
+ """
642
+ for record in records:
643
+ if record.payload and "meta" in record.payload:
644
+ meta = record.payload["meta"]
645
+ for field in metadata_fields:
646
+ if field in meta:
647
+ value = meta[field]
648
+ if value is not None:
649
+ if isinstance(value, (list, dict)):
650
+ unique_values_by_field[field].add(str(value))
651
+ else:
652
+ unique_values_by_field[field].add(value)
653
+
654
+ @staticmethod
655
+ def _process_records_unique_values(
656
+ records: list[Any],
657
+ metadata_field: str,
658
+ unique_values: list[Any],
659
+ unique_values_set: set[Any],
660
+ offset: int,
661
+ limit: int,
662
+ ) -> bool:
663
+ """Collect unique values from a batch of records. Returns True when len(unique_values) >= offset + limit."""
664
+ for record in records:
665
+ if record.payload and "meta" in record.payload:
666
+ meta = record.payload["meta"]
667
+ if metadata_field in meta:
668
+ value = meta[metadata_field]
669
+ if value is not None:
670
+ hashable_value = str(value) if isinstance(value, (list, dict)) else value
671
+ if hashable_value not in unique_values_set:
672
+ unique_values_set.add(hashable_value)
673
+ unique_values.append(value)
674
+ if len(unique_values) >= offset + limit:
675
+ return True
676
+ return False
677
+
678
+ @staticmethod
679
+ def _create_updated_point_from_record(record: Any, meta: dict[str, Any]) -> rest.PointStruct:
680
+ """
681
+ Creates an updated PointStruct from a Qdrant record with merged metadata.
682
+
683
+ :param record: The Qdrant record to update.
684
+ :param meta: The metadata fields to merge with existing metadata.
685
+ :returns: A PointStruct with updated metadata and preserved vectors.
686
+ """
687
+ # merge existing payload with new metadata
688
+ # Metadata is stored under the "meta" key in the payload
689
+ updated_payload = dict(record.payload or {})
690
+ if "meta" not in updated_payload:
691
+ updated_payload["meta"] = {}
692
+ updated_payload["meta"].update(meta)
693
+
694
+ # create updated point preserving vectors
695
+ # Type cast needed because record.vector type doesn't include all PointStruct vector types
696
+ vector_value = record.vector if record.vector is not None else {}
697
+ return rest.PointStruct(
698
+ id=record.id,
699
+ vector=cast(Any, vector_value),
700
+ payload=updated_payload,
493
701
  )
494
702
 
495
- for record in records:
496
- documents.append(
497
- convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
703
+ def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
704
+ """
705
+ Updates the metadata of all documents that match the provided filters.
706
+
707
+ **Note**: This operation is not atomic. Documents matching the filter are fetched first,
708
+ then updated. If documents are modified between the fetch and update operations,
709
+ those changes may be lost.
710
+
711
+ :param filters: The filters to apply to select documents for updating.
712
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
713
+ :param meta: The metadata fields to update. This will be merged with existing metadata.
714
+
715
+ :returns:
716
+ The number of documents updated.
717
+ """
718
+ self._initialize_client()
719
+ assert self._client is not None
720
+
721
+ try:
722
+ qdrant_filter = convert_filters_to_qdrant(filters)
723
+ if qdrant_filter is None:
724
+ return 0
725
+
726
+ # get all matching documents using scroll
727
+ updated_points = []
728
+ next_offset = None
729
+
730
+ while True:
731
+ records, next_offset = self._client.scroll(
732
+ collection_name=self.index,
733
+ scroll_filter=qdrant_filter,
734
+ limit=self.scroll_size,
735
+ offset=next_offset,
736
+ with_payload=True,
737
+ with_vectors=True,
738
+ )
739
+
740
+ # update payload for each record
741
+ for record in records:
742
+ updated_points.append(self._create_updated_point_from_record(record, meta))
743
+
744
+ if self._check_stop_scrolling(next_offset):
745
+ break
746
+
747
+ if not updated_points:
748
+ return 0
749
+
750
+ # upsert updated points back in batches
751
+ for batch in get_batches_from_generator(updated_points, self.write_batch_size):
752
+ self._client.upsert(
753
+ collection_name=self.index,
754
+ points=list(batch),
755
+ wait=self.wait_result_from_api,
756
+ )
757
+
758
+ logger.info(
759
+ "Updated {n_docs} documents in collection '{name}' using filters.",
760
+ n_docs=len(updated_points),
761
+ name=self.index,
498
762
  )
499
- return documents
763
+ return len(updated_points)
764
+ except Exception as e:
765
+ msg = f"Failed to update documents by filter in Qdrant: {e!s}"
766
+ raise QdrantStoreError(msg) from e
500
767
 
501
- def _query_by_sparse(
502
- self,
503
- query_sparse_embedding: SparseEmbedding,
504
- filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
505
- top_k: int = 10,
506
- scale_score: bool = False,
507
- return_embedding: bool = False,
508
- score_threshold: Optional[float] = None,
509
- group_by: Optional[str] = None,
510
- group_size: Optional[int] = None,
511
- ) -> List[Document]:
768
+ async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
512
769
  """
513
- Queries Qdrant using a sparse embedding and returns the most relevant documents.
770
+ Asynchronously updates the metadata of all documents that match the provided filters.
514
771
 
515
- :param query_sparse_embedding: Sparse embedding of the query.
516
- :param filters: Filters applied to the retrieved documents.
517
- :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
518
- groups to return.
519
- :param scale_score: Whether to scale the scores of the retrieved documents.
520
- :param return_embedding: Whether to return the embeddings of the retrieved documents.
521
- :param score_threshold: A minimal score threshold for the result.
522
- Score of the returned result might be higher or smaller than the threshold
523
- depending on the Distance function used.
524
- E.g. for cosine similarity only higher scores will be returned.
525
- :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
526
- value, all values will be used for grouping. One point can be in multiple groups.
527
- :param group_size: Maximum amount of points to return per group. Default is 3.
772
+ **Note**: This operation is not atomic. Documents matching the filter are fetched first,
773
+ then updated. If documents are modified between the fetch and update operations,
774
+ those changes may be lost.
528
775
 
529
- :returns: List of documents that are most similar to `query_sparse_embedding`.
776
+ :param filters: The filters to apply to select documents for updating.
777
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
778
+ :param meta: The metadata fields to update. This will be merged with existing metadata.
530
779
 
531
- :raises QdrantStoreError:
532
- If the Document Store was initialized with `use_sparse_embeddings=False`.
780
+ :returns:
781
+ The number of documents updated.
533
782
  """
783
+ await self._initialize_async_client()
784
+ assert self._async_client is not None
534
785
 
535
- if not self.use_sparse_embeddings:
536
- message = (
537
- "You are trying to query using sparse embeddings, but the Document Store "
538
- "was initialized with `use_sparse_embeddings=False`. "
786
+ try:
787
+ qdrant_filter = convert_filters_to_qdrant(filters)
788
+ if qdrant_filter is None:
789
+ return 0
790
+
791
+ updated_points = []
792
+ next_offset = None
793
+
794
+ while True:
795
+ records, next_offset = await self._async_client.scroll(
796
+ collection_name=self.index,
797
+ scroll_filter=qdrant_filter,
798
+ limit=self.scroll_size,
799
+ offset=next_offset,
800
+ with_payload=True,
801
+ with_vectors=True,
802
+ )
803
+
804
+ # update payload for each record
805
+ for record in records:
806
+ updated_points.append(self._create_updated_point_from_record(record, meta))
807
+
808
+ if self._check_stop_scrolling(next_offset):
809
+ break
810
+
811
+ if not updated_points:
812
+ return 0
813
+
814
+ # upsert updated points back in batches
815
+ for batch in get_batches_from_generator(updated_points, self.write_batch_size):
816
+ await self._async_client.upsert(
817
+ collection_name=self.index,
818
+ points=list(batch),
819
+ wait=self.wait_result_from_api,
820
+ )
821
+
822
+ logger.info(
823
+ "Updated {n_docs} documents in collection '{name}' using filters.",
824
+ n_docs=len(updated_points),
825
+ name=self.index,
539
826
  )
540
- raise QdrantStoreError(message)
827
+ return len(updated_points)
828
+ except Exception as e:
829
+ msg = f"Failed to update documents by filter in Qdrant: {e!s}"
830
+ raise QdrantStoreError(msg) from e
831
+
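Editor's note: both `update_by_filter` variants implement a scroll, merge, upsert loop: pages of matching points are fetched with their vectors, the `meta` payload is merged via `_create_updated_point_from_record`, and the rebuilt points are written back in `write_batch_size` chunks. A usage sketch; field names are hypothetical:

```python
updated = store.update_by_filter(
    filters={"field": "meta.category", "operator": "==", "value": "draft"},
    meta={"reviewed": True},
)
```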
+     def delete_all_documents(self, recreate_index: bool = False) -> None:
+         """
+         Deletes all documents from the document store.
+
+         :param recreate_index: Whether to recreate the index after deleting all documents.
+         """
+
+         self._initialize_client()
+         assert self._client is not None
+
+         if recreate_index:
+             # get current collection config as json
+             collection_info = self._client.get_collection(collection_name=self.index)
+             info_json = collection_info.model_dump()
+
+             # deal with the Optional use_sparse_embeddings
+             sparse_vectors = info_json["config"]["params"]["sparse_vectors"]
+             use_sparse_embeddings = True if sparse_vectors else False
+
+             # deal with the Optional sparse_idf
+             hnsw_config = info_json["config"]["params"]["vectors"].get("config", {}).get("hnsw_config", None)
+             sparse_idf = True if use_sparse_embeddings and hnsw_config else False
+
+             # recreate collection
+             self._set_up_collection(
+                 collection_name=self.index,
+                 embedding_dim=info_json["config"]["params"]["vectors"]["size"],
+                 recreate_collection=True,
+                 similarity=info_json["config"]["params"]["vectors"]["distance"].lower(),
+                 use_sparse_embeddings=use_sparse_embeddings,
+                 sparse_idf=sparse_idf,
+                 on_disk=info_json["config"]["hnsw_config"]["on_disk"],
+                 payload_fields_to_index=info_json["payload_schema"],
+             )
+
+         else:
+             try:
+                 self._client.delete(
+                     collection_name=self.index,
+                     points_selector=rest.FilterSelector(
+                         filter=rest.Filter(
+                             must=[],
+                         )
+                     ),
+                     wait=self.wait_result_from_api,
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Error {e} when calling QdrantDocumentStore.delete_all_documents()",
+                 )
+
+     async def delete_all_documents_async(self, recreate_index: bool = False) -> None:
+         """
+         Asynchronously deletes all documents from the document store.
+
+         :param recreate_index: Whether to recreate the index after deleting all documents.
+         """
+
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         if recreate_index:
+             # get current collection config as json
+             collection_info = await self._async_client.get_collection(collection_name=self.index)
+             info_json = collection_info.model_dump()
+
+             # deal with the Optional use_sparse_embeddings
+             sparse_vectors = info_json["config"]["params"]["sparse_vectors"]
+             use_sparse_embeddings = True if sparse_vectors else False
+
+             # deal with the Optional sparse_idf
+             hnsw_config = info_json["config"]["params"]["vectors"].get("config", {}).get("hnsw_config", None)
+             sparse_idf = True if use_sparse_embeddings and hnsw_config else False
+
+             # recreate collection
+             await self._set_up_collection_async(
+                 collection_name=self.index,
+                 embedding_dim=info_json["config"]["params"]["vectors"]["size"],
+                 recreate_collection=True,
+                 similarity=info_json["config"]["params"]["vectors"]["distance"].lower(),
+                 use_sparse_embeddings=use_sparse_embeddings,
+                 sparse_idf=sparse_idf,
+                 on_disk=info_json["config"]["hnsw_config"]["on_disk"],
+                 payload_fields_to_index=info_json["payload_schema"],
+             )
+
+         else:
+             try:
+                 await self._async_client.delete(
+                     collection_name=self.index,
+                     points_selector=rest.FilterSelector(
+                         filter=rest.Filter(
+                             must=[],
+                         )
+                     ),
+                     wait=self.wait_result_from_api,
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Error {e} when calling QdrantDocumentStore.delete_all_documents_async()",
+                 )
+
+     def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
+         """
+         Returns the number of documents that match the provided filters.
+
+         :param filters: The filters to apply to count documents.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+
+         :returns: The number of documents that match the filters.
+         """
+         self._initialize_client()
+         assert self._client is not None
+
+         qdrant_filter = convert_filters_to_qdrant(filters)
+         try:
+             response = self._client.count(
+                 collection_name=self.index,
+                 count_filter=qdrant_filter,
+             )
+             return response.count
+         except (UnexpectedResponse, ValueError) as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.count_documents_by_filter()")
+             return 0
+
+     async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int:
+         """
+         Asynchronously returns the number of documents that match the provided filters.
+
+         :param filters: The filters to apply to select documents for counting.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+
+         :returns:
+             The number of documents that match the filters.
+         """
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         qdrant_filter = convert_filters_to_qdrant(filters)
+         try:
+             response = await self._async_client.count(
+                 collection_name=self.index,
+                 count_filter=qdrant_filter,
+             )
+             return response.count
+         except (UnexpectedResponse, ValueError) as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.count_documents_by_filter_async()")
+             return 0
+
+     def get_metadata_fields_info(self) -> dict[str, str]:
+         """
+         Returns the information about the fields from the collection.
+
+         :returns:
+             A dictionary mapping field names to their types (e.g., {"field_name": "integer"}).
+         """
+         self._initialize_client()
+         assert self._client is not None
+
+         try:
+             collection_info = self._client.get_collection(self.index)
+             payload_schema = collection_info.payload_schema or {}
+             return self._metadata_fields_info_from_schema(payload_schema)
+         except (UnexpectedResponse, ValueError) as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_fields_info()")
+             return {}
+
+     async def get_metadata_fields_info_async(self) -> dict[str, str]:
+         """
+         Asynchronously returns the information about the fields from the collection.
+
+         :returns:
+             A dictionary mapping field names to their types (e.g., {"field_name": "integer"}).
+         """
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         try:
+             collection_info = await self._async_client.get_collection(self.index)
+             payload_schema = collection_info.payload_schema or {}
+             return self._metadata_fields_info_from_schema(payload_schema)
+         except (UnexpectedResponse, ValueError) as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_fields_info_async()")
+             return {}
+
+     def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
+         """
+         Returns the minimum and maximum values for the given metadata field.
+
+         :param metadata_field: The metadata field key (inside ``meta``) to get the minimum and maximum values for.
+
+         :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
+             metadata field across all documents. Returns an empty dict if no documents have the field.
+         """
+         self._initialize_client()
+         assert self._client is not None
+
+         try:
+             min_value: Any = None
+             max_value: Any = None
+             next_offset = None
+
+             while True:
+                 records, next_offset = self._client.scroll(
+                     collection_name=self.index,
+                     scroll_filter=None,
+                     limit=self.scroll_size,
+                     offset=next_offset,
+                     with_payload=True,
+                     with_vectors=False,
+                 )
+                 min_value, max_value = self._process_records_min_max(records, metadata_field, min_value, max_value)
+                 if self._check_stop_scrolling(next_offset):
+                     break
+
+             if min_value is not None and max_value is not None:
+                 return {"min": min_value, "max": max_value}
+             return {}
+         except Exception as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_min_max()")
+             return {}
+
+     async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, Any]:
+         """
+         Asynchronously returns the minimum and maximum values for the given metadata field.
+
+         :param metadata_field: The metadata field key (inside ``meta``) to get the minimum and maximum values for.
+
+         :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
+             metadata field across all documents. Returns an empty dict if no documents have the field.
+         """
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         try:
+             min_value: Any = None
+             max_value: Any = None
+             next_offset = None
+
+             while True:
+                 records, next_offset = await self._async_client.scroll(
+                     collection_name=self.index,
+                     scroll_filter=None,
+                     limit=self.scroll_size,
+                     offset=next_offset,
+                     with_payload=True,
+                     with_vectors=False,
+                 )
+                 min_value, max_value = self._process_records_min_max(records, metadata_field, min_value, max_value)
+                 if self._check_stop_scrolling(next_offset):
+                     break
+
+             if min_value is not None and max_value is not None:
+                 return {"min": min_value, "max": max_value}
+             return {}
+         except Exception as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_min_max_async()")
+             return {}
+
+     def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
+         """
+         Returns the number of unique values for each specified metadata field among documents that match the filters.
+
+         :param filters: The filters to restrict the documents considered.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+         :param metadata_fields: List of metadata field keys (inside ``meta``) to count unique values for.
+
+         :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
+             documents.
+         """
+         self._initialize_client()
+         assert self._client is not None
+
+         qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+         unique_values_by_field: dict[str, set[Any]] = {field: set() for field in metadata_fields}
+
+         try:
+             next_offset = None
+             while True:
+                 records, next_offset = self._client.scroll(
+                     collection_name=self.index,
+                     scroll_filter=qdrant_filter,
+                     limit=self.scroll_size,
+                     offset=next_offset,
+                     with_payload=True,
+                     with_vectors=False,
+                 )
+                 self._process_records_count_unique(records, metadata_fields, unique_values_by_field)
+                 if self._check_stop_scrolling(next_offset):
+                     break
+
+             return {field: len(unique_values_by_field[field]) for field in metadata_fields}
+         except Exception as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.count_unique_metadata_by_filter()")
+             return dict.fromkeys(metadata_fields, 0)
+
+     async def count_unique_metadata_by_filter_async(
+         self, filters: dict[str, Any], metadata_fields: list[str]
+     ) -> dict[str, int]:
+         """
+         Asynchronously returns the number of unique values for each specified metadata field among documents that
+         match the filters.
+
+         :param filters: The filters to restrict the documents considered.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+         :param metadata_fields: List of metadata field keys (inside ``meta``) to count unique values for.
+
+         :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
+             documents.
+         """
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+         unique_values_by_field: dict[str, set[Any]] = {field: set() for field in metadata_fields}
+
+         try:
+             next_offset = None
+             while True:
+                 records, next_offset = await self._async_client.scroll(
+                     collection_name=self.index,
+                     scroll_filter=qdrant_filter,
+                     limit=self.scroll_size,
+                     offset=next_offset,
+                     with_payload=True,
+                     with_vectors=False,
+                 )
+                 self._process_records_count_unique(records, metadata_fields, unique_values_by_field)
+                 if self._check_stop_scrolling(next_offset):
+                     break
+
+             return {field: len(unique_values_by_field[field]) for field in metadata_fields}
+         except Exception as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.count_unique_metadata_by_filter_async()")
+             return dict.fromkeys(metadata_fields, 0)
+
+     def get_metadata_field_unique_values(
+         self, metadata_field: str, filters: dict[str, Any] | None = None, limit: int = 100, offset: int = 0
+     ) -> list[Any]:
+         """
+         Returns unique values for a metadata field, with optional filters and offset/limit pagination.
+
+         Unique values are ordered by first occurrence during scroll. Pagination is offset-based over that order.
+
+         :param metadata_field: The metadata field key (inside ``meta``) to get unique values for.
+         :param filters: Optional filters to restrict the documents considered.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+         :param limit: Maximum number of unique values to return per page. Defaults to 100.
+         :param offset: Number of unique values to skip (for pagination). Defaults to 0.
+
+         :returns: A list of unique values for the field (at most ``limit`` items, starting at ``offset``).
+         """
+         self._initialize_client()
+         assert self._client is not None
+
+         qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+         unique_values: list[Any] = []
+         unique_values_set: set[Any] = set()
+
+         try:
+             next_offset = None
+             while len(unique_values) < offset + limit:
+                 records, next_offset = self._client.scroll(
+                     collection_name=self.index,
+                     scroll_filter=qdrant_filter,
+                     limit=self.scroll_size,
+                     offset=next_offset,
+                     with_payload=True,
+                     with_vectors=False,
+                 )
+                 if self._process_records_unique_values(
+                     records, metadata_field, unique_values, unique_values_set, offset, limit
+                 ):
+                     break
+                 if self._check_stop_scrolling(next_offset):
+                     break
+
+             return unique_values[offset : offset + limit]
+         except Exception as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_unique_values()")
+             return []
+
+     async def get_metadata_field_unique_values_async(
+         self, metadata_field: str, filters: dict[str, Any] | None = None, limit: int = 100, offset: int = 0
+     ) -> list[Any]:
+         """
+         Asynchronously returns unique values for a metadata field, with optional filters and offset/limit pagination.
+
+         Unique values are ordered by first occurrence during scroll. Pagination is offset-based over that order.
+
+         :param metadata_field: The metadata field key (inside ``meta``) to get unique values for.
+         :param filters: Optional filters to restrict the documents considered.
+             For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+         :param limit: Maximum number of unique values to return per page. Defaults to 100.
+         :param offset: Number of unique values to skip (for pagination). Defaults to 0.
+
+         :returns: A list of unique values for the field (at most ``limit`` items, starting at ``offset``).
+         """
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+         unique_values: list[Any] = []
+         unique_values_set: set[Any] = set()
+
+         try:
+             next_offset = None
+             while len(unique_values) < offset + limit:
+                 records, next_offset = await self._async_client.scroll(
+                     collection_name=self.index,
+                     scroll_filter=qdrant_filter,
+                     limit=self.scroll_size,
+                     offset=next_offset,
+                     with_payload=True,
+                     with_vectors=False,
+                 )
+                 if self._process_records_unique_values(
+                     records, metadata_field, unique_values, unique_values_set, offset, limit
+                 ):
+                     break
+                 if self._check_stop_scrolling(next_offset):
+                     break
+
+             return unique_values[offset : offset + limit]
+         except Exception as e:
+             logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_unique_values_async()")
+             return []
+
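Editor's note: `get_metadata_field_unique_values` paginates by re-scrolling from the start and slicing, so each page costs a scan over enough documents to collect `offset + limit` unique values; fine for moderate cardinality, costly for deep pages. Usage sketch with a hypothetical field name:

```python
page1 = store.get_metadata_field_unique_values("category", limit=50, offset=0)
page2 = store.get_metadata_field_unique_values("category", limit=50, offset=50)
```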
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "QdrantDocumentStore":
+         """
+         Deserializes the component from a dictionary.
+
+         :param data:
+             The dictionary to deserialize from.
+         :returns:
+             The deserialized component.
+         """
+         deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+         return default_from_dict(cls, data)
+
+     def to_dict(self) -> dict[str, Any]:
+         """
+         Serializes the component to a dictionary.
+
+         :returns:
+             Dictionary with serialized data.
+         """
+         params = inspect.signature(self.__init__).parameters  # type: ignore
+         # All the __init__ params must be set as attributes
+         # Set as init_params without default values
+         init_params = {k: getattr(self, k) for k in params}
+         init_params["api_key"] = self.api_key.to_dict() if self.api_key else None
+         return default_to_dict(
+             self,
+             **init_params,
+         )
+
+     def _get_documents_generator(
+         self,
+         filters: dict[str, Any] | rest.Filter | None = None,
+     ) -> Generator[Document, None, None]:
+         """
+         Returns a generator that yields documents from Qdrant based on the provided filters.
+
+         :param filters: Filters applied to the retrieved documents.
+         :returns: A generator that yields documents retrieved from Qdrant.
+         """
+
+         self._initialize_client()
+         assert self._client is not None
+
+         index = self.index
+         qdrant_filters = convert_filters_to_qdrant(filters)
+
+         next_offset = None
+         stop_scrolling = False
+         while not stop_scrolling:
+             records, next_offset = self._client.scroll(
+                 collection_name=index,
+                 scroll_filter=qdrant_filters,
+                 limit=self.scroll_size,
+                 offset=next_offset,
+                 with_payload=True,
+                 with_vectors=True,
+             )
+             stop_scrolling = next_offset is None or (
+                 hasattr(next_offset, "num")
+                 and hasattr(next_offset, "uuid")
+                 and next_offset.num == 0
+                 and next_offset.uuid == ""
+             )  # PointId always has num and uuid
+
+             for record in records:
+                 yield convert_qdrant_point_to_haystack_document(
+                     record, use_sparse_embeddings=self.use_sparse_embeddings
+                 )
+
+     async def _get_documents_generator_async(
+         self,
+         filters: dict[str, Any] | rest.Filter | None = None,
+     ) -> AsyncGenerator[Document, None]:
+         """
+         Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
+
+         :param filters: Filters applied to the retrieved documents.
+         :returns: An asynchronous generator that yields documents retrieved from Qdrant.
+         """
+
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         index = self.index
+         qdrant_filters = convert_filters_to_qdrant(filters)
+
+         next_offset = None
+         stop_scrolling = False
+         while not stop_scrolling:
+             records, next_offset = await self._async_client.scroll(
+                 collection_name=index,
+                 scroll_filter=qdrant_filters,
+                 limit=self.scroll_size,
+                 offset=next_offset,
+                 with_payload=True,
+                 with_vectors=True,
+             )
+             stop_scrolling = next_offset is None or (
+                 hasattr(next_offset, "num")
+                 and hasattr(next_offset, "uuid")
+                 and next_offset.num == 0
+                 and next_offset.uuid == ""
+             )  # PointId always has num and uuid
+
+             for record in records:
+                 yield convert_qdrant_point_to_haystack_document(
+                     record, use_sparse_embeddings=self.use_sparse_embeddings
+                 )
+
+     def get_documents_by_id(
+         self,
+         ids: list[str],
+     ) -> list[Document]:
+         """
+         Retrieves documents from Qdrant by their IDs.
+
+         :param ids:
+             A list of document IDs to retrieve.
+         :returns:
+             A list of documents.
+         """
+         documents: list[Document] = []
+
+         self._initialize_client()
+         assert self._client is not None
+
+         ids = [convert_id(_id) for _id in ids]
+         records = self._client.retrieve(
+             collection_name=self.index,
+             ids=ids,
+             with_payload=True,
+             with_vectors=True,
+         )
+
+         for record in records:
+             documents.append(
+                 convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
+             )
+         return documents
+
+     async def get_documents_by_id_async(
+         self,
+         ids: list[str],
+     ) -> list[Document]:
+         """
+         Retrieves documents from Qdrant by their IDs.
+
+         :param ids:
+             A list of document IDs to retrieve.
+         :returns:
+             A list of documents.
+         """
+         documents: list[Document] = []
+
+         await self._initialize_async_client()
+         assert self._async_client is not None
+
+         ids = [convert_id(_id) for _id in ids]
+         records = await self._async_client.retrieve(
+             collection_name=self.index,
+             ids=ids,
+             with_payload=True,
+             with_vectors=True,
+         )
+
+         for record in records:
+             documents.append(
+                 convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
+             )
+         return documents
+
1432
+ def _query_by_sparse(
1433
+ self,
1434
+ query_sparse_embedding: SparseEmbedding,
1435
+ filters: dict[str, Any] | rest.Filter | None = None,
1436
+ top_k: int = 10,
1437
+ scale_score: bool = False,
1438
+ return_embedding: bool = False,
1439
+ score_threshold: float | None = None,
1440
+ group_by: str | None = None,
1441
+ group_size: int | None = None,
1442
+ ) -> list[Document]:
1443
+ """
1444
+ Queries Qdrant using a sparse embedding and returns the most relevant documents.
1445
+
1446
+ :param query_sparse_embedding: Sparse embedding of the query.
1447
+ :param filters: Filters applied to the retrieved documents.
1448
+ :param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
1449
+ groups to return.
1450
+ :param scale_score: Whether to scale the scores of the retrieved documents.
1451
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
1452
+ :param score_threshold: A minimal score threshold for the result.
1453
+ Scores of the returned results may be higher or lower than the threshold,
1454
+ depending on the Distance function used.
1455
+ For example, with cosine similarity only scores above the threshold are returned.
1456
+ :param group_by: Payload field to group by; must be a string or number field. If the field contains more than one
1457
+ value, all values are used for grouping. One point can belong to multiple groups.
1458
+ :param group_size: Maximum number of points to return per group. Defaults to 3.
1459
+
1460
+ :returns: List of documents that are most similar to `query_sparse_embedding`.
1461
+
1462
+ :raises QdrantStoreError:
1463
+ If the Document Store was initialized with `use_sparse_embeddings=False`.
1464
+ """
1465
+ self._initialize_client()
1466
+ assert self._client is not None
1467
+
1468
+ if not self.use_sparse_embeddings:
1469
+ message = (
1470
+ "You are trying to query using sparse embeddings, but the Document Store "
1471
+ "was initialized with `use_sparse_embeddings=False`. "
1472
+ )
1473
+ raise QdrantStoreError(message)
541
1474
 
542
1475
  qdrant_filters = convert_filters_to_qdrant(filters)
543
1476
  query_indices = query_sparse_embedding.indices
544
1477
  query_values = query_sparse_embedding.values
545
1478
  if group_by:
546
- groups = self.client.query_points_groups(
1479
+ groups = self._client.query_points_groups(
547
1480
  collection_name=self.index,
548
1481
  query=rest.SparseVector(
549
1482
  indices=query_indices,
@@ -553,21 +1486,13 @@ class QdrantDocumentStore:
553
1486
  query_filter=qdrant_filters,
554
1487
  limit=top_k,
555
1488
  group_by=group_by,
556
- group_size=group_size,
1489
+ group_size=group_size or DEFAULT_GROUP_SIZE,
557
1490
  with_vectors=return_embedding,
558
1491
  score_threshold=score_threshold,
559
1492
  ).groups
560
- results = (
561
- [
562
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
563
- for group in groups
564
- for point in group.hits
565
- ]
566
- if groups
567
- else []
568
- )
1493
+ return self._process_group_results(groups)
569
1494
  else:
570
- points = self.client.query_points(
1495
+ points = self._client.query_points(
571
1496
  collection_name=self.index,
572
1497
  query=rest.SparseVector(
573
1498
  indices=query_indices,
@@ -579,28 +1504,19 @@ class QdrantDocumentStore:
579
1504
  with_vectors=return_embedding,
580
1505
  score_threshold=score_threshold,
581
1506
  ).points
582
- results = [
583
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
584
- for point in points
585
- ]
586
- if scale_score:
587
- for document in results:
588
- score = document.score
589
- score = float(1 / (1 + np.exp(-score / 100)))
590
- document.score = score
591
- return results
1507
+ return self._process_query_point_results(points, scale_score=scale_score)
592
1508
 
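A sketch of invoking the sparse path above. `SparseEmbedding` is the Haystack dataclass this module already imports; the indices and values are illustrative, and the store must have been created with `use_sparse_embeddings=True`, otherwise the `QdrantStoreError` above is raised:

```python
from haystack.dataclasses.sparse_embedding import SparseEmbedding

sparse_query = SparseEmbedding(indices=[0, 7, 42], values=[0.5, 0.3, 0.2])
docs = document_store._query_by_sparse(
    query_sparse_embedding=sparse_query,
    top_k=5,
    group_by="meta.source",  # optional: return up to top_k groups instead of points
)
```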
593
1509
  def _query_by_embedding(
594
1510
  self,
595
- query_embedding: List[float],
596
- filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
1511
+ query_embedding: list[float],
1512
+ filters: dict[str, Any] | rest.Filter | None = None,
597
1513
  top_k: int = 10,
598
1514
  scale_score: bool = False,
599
1515
  return_embedding: bool = False,
600
- score_threshold: Optional[float] = None,
601
- group_by: Optional[str] = None,
602
- group_size: Optional[int] = None,
603
- ) -> List[Document]:
1516
+ score_threshold: float | None = None,
1517
+ group_by: str | None = None,
1518
+ group_size: int | None = None,
1519
+ ) -> list[Document]:
604
1520
  """
605
1521
  Queries Qdrant using a dense embedding and returns the most relevant documents.
606
1522
 
@@ -620,30 +1536,26 @@ class QdrantDocumentStore:
620
1536
 
621
1537
  :returns: List of documents that are most similar to `query_embedding`.
622
1538
  """
1539
+ self._initialize_client()
1540
+ assert self._client is not None
1541
+
623
1542
  qdrant_filters = convert_filters_to_qdrant(filters)
624
1543
  if group_by:
625
- groups = self.client.query_points_groups(
1544
+ groups = self._client.query_points_groups(
626
1545
  collection_name=self.index,
627
1546
  query=query_embedding,
628
1547
  using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
629
1548
  query_filter=qdrant_filters,
630
1549
  limit=top_k,
631
1550
  group_by=group_by,
632
- group_size=group_size,
1551
+ group_size=group_size or DEFAULT_GROUP_SIZE,
633
1552
  with_vectors=return_embedding,
634
1553
  score_threshold=score_threshold,
635
1554
  ).groups
636
- results = (
637
- [
638
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
639
- for group in groups
640
- for point in group.hits
641
- ]
642
- if groups
643
- else []
644
- )
1555
+ return self._process_group_results(groups)
1556
+
645
1557
  else:
646
- points = self.client.query_points(
1558
+ points = self._client.query_points(
647
1559
  collection_name=self.index,
648
1560
  query=query_embedding,
649
1561
  using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -652,32 +1564,19 @@ class QdrantDocumentStore:
652
1564
  with_vectors=return_embedding,
653
1565
  score_threshold=score_threshold,
654
1566
  ).points
655
- results = [
656
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
657
- for point in points
658
- ]
659
-
660
- if scale_score:
661
- for document in results:
662
- score = document.score
663
- if self.similarity == "cosine":
664
- score = (score + 1) / 2
665
- else:
666
- score = float(1 / (1 + np.exp(-score / 100)))
667
- document.score = score
668
- return results
1567
+ return self._process_query_point_results(points, scale_score=scale_score)
669
1568
 
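The inline scaling removed above is centralized in `_process_query_point_results`, shown later in this diff. In isolation, the transformation is the following (a sketch mirroring that logic, using the `exp` import this module switched to):

```python
from numpy import exp

def scale(score: float, similarity: str) -> float:
    # cosine scores lie in [-1, 1]; map them linearly onto [0, 1]
    if similarity == "cosine":
        return (score + 1) / 2
    # other metrics are squashed with a flat sigmoid
    return float(1 / (1 + exp(-score / 100)))

assert scale(0.5, "cosine") == 0.75
```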
670
1569
  def _query_hybrid(
671
1570
  self,
672
- query_embedding: List[float],
1571
+ query_embedding: list[float],
673
1572
  query_sparse_embedding: SparseEmbedding,
674
- filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
1573
+ filters: dict[str, Any] | rest.Filter | None = None,
675
1574
  top_k: int = 10,
676
1575
  return_embedding: bool = False,
677
- score_threshold: Optional[float] = None,
678
- group_by: Optional[str] = None,
679
- group_size: Optional[int] = None,
680
- ) -> List[Document]:
1576
+ score_threshold: float | None = None,
1577
+ group_by: str | None = None,
1578
+ group_size: int | None = None,
1579
+ ) -> list[Document]:
681
1580
  """
682
1581
  Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
683
1582
 
@@ -706,6 +1605,10 @@ class QdrantDocumentStore:
706
1605
 
707
1606
  # This implementation is based on the code from the Python Qdrant client:
708
1607
  # https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
1608
+
1609
+ self._initialize_client()
1610
+ assert self._client is not None
1611
+
709
1612
  if not self.use_sparse_embeddings:
710
1613
  message = (
711
1614
  "You are trying to query using sparse embeddings, but the Document Store "
@@ -717,7 +1620,7 @@ class QdrantDocumentStore:
717
1620
 
718
1621
  try:
719
1622
  if group_by:
720
- groups = self.client.query_points_groups(
1623
+ groups = self._client.query_points_groups(
721
1624
  collection_name=self.index,
722
1625
  prefetch=[
723
1626
  rest.Prefetch(
@@ -737,13 +1640,13 @@ class QdrantDocumentStore:
737
1640
  query=rest.FusionQuery(fusion=rest.Fusion.RRF),
738
1641
  limit=top_k,
739
1642
  group_by=group_by,
740
- group_size=group_size,
1643
+ group_size=group_size or DEFAULT_GROUP_SIZE,
741
1644
  score_threshold=score_threshold,
742
1645
  with_payload=True,
743
1646
  with_vectors=return_embedding,
744
1647
  ).groups
745
1648
  else:
746
- points = self.client.query_points(
1649
+ points = self._client.query_points(
747
1650
  collection_name=self.index,
748
1651
  prefetch=[
749
1652
  rest.Prefetch(
@@ -772,19 +1675,263 @@ class QdrantDocumentStore:
772
1675
  raise QdrantStoreError(msg) from e
773
1676
 
774
1677
  if group_by:
775
- results = (
776
- [
777
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
778
- for group in groups
779
- for point in group.hits
780
- ]
781
- if groups
782
- else []
1678
+ return self._process_group_results(groups)
1679
+ else:
1680
+ return self._process_query_point_results(points)
1681
+
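Since `_query_hybrid` is private, the supported entry point is `QdrantHybridRetriever`, named in the docstring above. A sketch, assuming the retriever's import path and `run` signature from this package's companion retriever module, and a store created with `use_sparse_embeddings=True`:

```python
from haystack.dataclasses.sparse_embedding import SparseEmbedding
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever

retriever = QdrantHybridRetriever(document_store=document_store, top_k=5)
result = retriever.run(
    query_embedding=[0.1] * 5,
    query_sparse_embedding=SparseEmbedding(indices=[0, 7], values=[0.6, 0.4]),
)
docs = result["documents"]  # dense and sparse hits fused with RRF
```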
1682
+ async def _query_by_sparse_async(
1683
+ self,
1684
+ query_sparse_embedding: SparseEmbedding,
1685
+ filters: dict[str, Any] | rest.Filter | None = None,
1686
+ top_k: int = 10,
1687
+ scale_score: bool = False,
1688
+ return_embedding: bool = False,
1689
+ score_threshold: float | None = None,
1690
+ group_by: str | None = None,
1691
+ group_size: int | None = None,
1692
+ ) -> list[Document]:
1693
+ """
1694
+ Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
1695
+
1696
+ :param query_sparse_embedding: Sparse embedding of the query.
1697
+ :param filters: Filters applied to the retrieved documents.
1698
+ :param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
1699
+ groups to return.
1700
+ :param scale_score: Whether to scale the scores of the retrieved documents.
1701
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
1702
+ :param score_threshold: A minimal score threshold for the result.
1703
+ Scores of the returned results may be higher or lower than the threshold,
1704
+ depending on the Distance function used.
1705
+ For example, with cosine similarity only scores above the threshold are returned.
1706
+ :param group_by: Payload field to group by; must be a string or number field. If the field contains more than one
1707
+ value, all values are used for grouping. One point can belong to multiple groups.
1708
+ :param group_size: Maximum number of points to return per group. Defaults to 3.
1709
+
1710
+ :returns: List of documents that are most similar to `query_sparse_embedding`.
1711
+
1712
+ :raises QdrantStoreError:
1713
+ If the Document Store was initialized with `use_sparse_embeddings=False`.
1714
+ """
1715
+
1716
+ await self._initialize_async_client()
1717
+ assert self._async_client is not None
1718
+
1719
+ if not self.use_sparse_embeddings:
1720
+ message = (
1721
+ "You are trying to query using sparse embeddings, but the Document Store "
1722
+ "was initialized with `use_sparse_embeddings=False`. "
1723
+ )
1724
+ raise QdrantStoreError(message)
1725
+
1726
+ qdrant_filters = convert_filters_to_qdrant(filters)
1727
+ query_indices = query_sparse_embedding.indices
1728
+ query_values = query_sparse_embedding.values
1729
+ if group_by:
1730
+ response = await self._async_client.query_points_groups(
1731
+ collection_name=self.index,
1732
+ query=rest.SparseVector(
1733
+ indices=query_indices,
1734
+ values=query_values,
1735
+ ),
1736
+ using=SPARSE_VECTORS_NAME,
1737
+ query_filter=qdrant_filters,
1738
+ limit=top_k,
1739
+ group_by=group_by,
1740
+ group_size=group_size or DEFAULT_GROUP_SIZE,
1741
+ with_vectors=return_embedding,
1742
+ score_threshold=score_threshold,
1743
+ )
1744
+ groups = response.groups
1745
+ return self._process_group_results(groups)
1746
+ else:
1747
+ query_response = await self._async_client.query_points(
1748
+ collection_name=self.index,
1749
+ query=rest.SparseVector(
1750
+ indices=query_indices,
1751
+ values=query_values,
1752
+ ),
1753
+ using=SPARSE_VECTORS_NAME,
1754
+ query_filter=qdrant_filters,
1755
+ limit=top_k,
1756
+ with_vectors=return_embedding,
1757
+ score_threshold=score_threshold,
1758
+ )
1759
+ points = query_response.points
1760
+ return self._process_query_point_results(points, scale_score=scale_score)
1761
+
1762
+ async def _query_by_embedding_async(
1763
+ self,
1764
+ query_embedding: list[float],
1765
+ filters: dict[str, Any] | rest.Filter | None = None,
1766
+ top_k: int = 10,
1767
+ scale_score: bool = False,
1768
+ return_embedding: bool = False,
1769
+ score_threshold: float | None = None,
1770
+ group_by: str | None = None,
1771
+ group_size: int | None = None,
1772
+ ) -> list[Document]:
1773
+ """
1774
+ Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
1775
+
1776
+ :param query_embedding: Dense embedding of the query.
1777
+ :param filters: Filters applied to the retrieved documents.
1778
+ :param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
1779
+ groups to return.
1780
+ :param scale_score: Whether to scale the scores of the retrieved documents.
1781
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
1782
+ :param score_threshold: A minimal score threshold for the result.
1783
+ Scores of the returned results may be higher or lower than the threshold,
1784
+ depending on the Distance function used.
1785
+ For example, with cosine similarity only scores above the threshold are returned.
1786
+ :param group_by: Payload field to group by; must be a string or number field. If the field contains more than one
1787
+ value, all values are used for grouping. One point can belong to multiple groups.
1788
+ :param group_size: Maximum number of points to return per group. Defaults to 3.
1789
+
1790
+ :returns: List of documents that are most similar to `query_embedding`.
1791
+ """
1792
+ await self._initialize_async_client()
1793
+ assert self._async_client is not None
1794
+
1795
+ qdrant_filters = convert_filters_to_qdrant(filters)
1796
+ if group_by:
1797
+ response = await self._async_client.query_points_groups(
1798
+ collection_name=self.index,
1799
+ query=query_embedding,
1800
+ using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
1801
+ query_filter=qdrant_filters,
1802
+ limit=top_k,
1803
+ group_by=group_by,
1804
+ group_size=group_size or DEFAULT_GROUP_SIZE,
1805
+ with_vectors=return_embedding,
1806
+ score_threshold=score_threshold,
783
1807
  )
1808
+ groups = response.groups
1809
+ return self._process_group_results(groups)
784
1810
  else:
785
- results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
1811
+ query_response = await self._async_client.query_points(
1812
+ collection_name=self.index,
1813
+ query=query_embedding,
1814
+ using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
1815
+ query_filter=qdrant_filters,
1816
+ limit=top_k,
1817
+ with_vectors=return_embedding,
1818
+ score_threshold=score_threshold,
1819
+ )
1820
+ points = query_response.points
1821
+ return self._process_query_point_results(points, scale_score=scale_score)
1822
+
1823
+ async def _query_hybrid_async(
1824
+ self,
1825
+ query_embedding: list[float],
1826
+ query_sparse_embedding: SparseEmbedding,
1827
+ filters: dict[str, Any] | rest.Filter | None = None,
1828
+ top_k: int = 10,
1829
+ return_embedding: bool = False,
1830
+ score_threshold: float | None = None,
1831
+ group_by: str | None = None,
1832
+ group_size: int | None = None,
1833
+ ) -> list[Document]:
1834
+ """
1835
+ Asynchronously retrieves documents based on dense and sparse embeddings and fuses
1836
+ the results using Reciprocal Rank Fusion.
1837
+
1838
+ This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
1839
+ Use the `QdrantHybridRetriever` instead.
1840
+
1841
+ :param query_embedding: Dense embedding of the query.
1842
+ :param query_sparse_embedding: Sparse embedding of the query.
1843
+ :param filters: Filters applied to the retrieved documents.
1844
+ :param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
1845
+ groups to return.
1846
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
1847
+ :param score_threshold: A minimal score threshold for the result.
1848
+ Scores of the returned results may be higher or lower than the threshold,
1849
+ depending on the Distance function used.
1850
+ For example, with cosine similarity only scores above the threshold are returned.
1851
+ :param group_by: Payload field to group by; must be a string or number field. If the field contains more than one
1852
+ value, all values are used for grouping. One point can belong to multiple groups.
1853
+ :param group_size: Maximum number of points to return per group. Defaults to 3.
1854
+
1855
+ :returns: List of documents that are most similar to `query_embedding` and `query_sparse_embedding`.
1856
+
1857
+ :raises QdrantStoreError:
1858
+ If the Document Store was initialized with `use_sparse_embeddings=False`.
1859
+ """
1860
+
1861
+ await self._initialize_async_client()
1862
+ assert self._async_client is not None
1863
+
1864
+ if not self.use_sparse_embeddings:
1865
+ message = (
1866
+ "You are trying to query using sparse embeddings, but the Document Store "
1867
+ "was initialized with `use_sparse_embeddings=False`. "
1868
+ )
1869
+ raise QdrantStoreError(message)
1870
+
1871
+ qdrant_filters = convert_filters_to_qdrant(filters)
786
1872
 
787
- return results
1873
+ try:
1874
+ if group_by:
1875
+ response = await self._async_client.query_points_groups(
1876
+ collection_name=self.index,
1877
+ prefetch=[
1878
+ rest.Prefetch(
1879
+ query=rest.SparseVector(
1880
+ indices=query_sparse_embedding.indices,
1881
+ values=query_sparse_embedding.values,
1882
+ ),
1883
+ using=SPARSE_VECTORS_NAME,
1884
+ filter=qdrant_filters,
1885
+ ),
1886
+ rest.Prefetch(
1887
+ query=query_embedding,
1888
+ using=DENSE_VECTORS_NAME,
1889
+ filter=qdrant_filters,
1890
+ ),
1891
+ ],
1892
+ query=rest.FusionQuery(fusion=rest.Fusion.RRF),
1893
+ limit=top_k,
1894
+ group_by=group_by,
1895
+ group_size=group_size or DEFAULT_GROUP_SIZE,
1896
+ score_threshold=score_threshold,
1897
+ with_payload=True,
1898
+ with_vectors=return_embedding,
1899
+ )
1900
+ groups = response.groups
1901
+ else:
1902
+ query_response = await self._async_client.query_points(
1903
+ collection_name=self.index,
1904
+ prefetch=[
1905
+ rest.Prefetch(
1906
+ query=rest.SparseVector(
1907
+ indices=query_sparse_embedding.indices,
1908
+ values=query_sparse_embedding.values,
1909
+ ),
1910
+ using=SPARSE_VECTORS_NAME,
1911
+ filter=qdrant_filters,
1912
+ ),
1913
+ rest.Prefetch(
1914
+ query=query_embedding,
1915
+ using=DENSE_VECTORS_NAME,
1916
+ filter=qdrant_filters,
1917
+ ),
1918
+ ],
1919
+ query=rest.FusionQuery(fusion=rest.Fusion.RRF),
1920
+ limit=top_k,
1921
+ score_threshold=score_threshold,
1922
+ with_payload=True,
1923
+ with_vectors=return_embedding,
1924
+ )
1925
+ points = query_response.points
1926
+
1927
+ except Exception as e:
1928
+ msg = "Error during hybrid search"
1929
+ raise QdrantStoreError(msg) from e
1930
+
1931
+ if group_by:
1932
+ return self._process_group_results(groups)
1933
+ else:
1934
+ return self._process_query_point_results(points)
788
1935
 
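All grouped query paths above substitute `DEFAULT_GROUP_SIZE` when `group_size` is None, and the grouped results are flattened into one list, so a grouped query can return up to `top_k * group_size` documents. A sketch; the payload field and embedding are illustrative:

```python
docs = document_store._query_by_embedding(
    query_embedding=[0.1] * 5,
    top_k=2,                 # at most 2 groups
    group_by="meta.source",  # payload field to group on
    group_size=None,         # falls back to DEFAULT_GROUP_SIZE == 3
)
assert len(docs) <= 2 * 3  # every group's hits are flattened into a single list
```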
789
1936
  def get_distance(self, similarity: str) -> rest.Distance:
790
1937
  """
@@ -807,14 +1954,39 @@ class QdrantDocumentStore:
807
1954
  )
808
1955
  raise QdrantStoreError(msg) from ke
809
1956
 
810
- def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
1957
+ def _create_payload_index(self, collection_name: str, payload_fields_to_index: list[dict] | None = None) -> None:
1958
+ """
1959
+ Create payload indexes for the collection if `payload_fields_to_index` is provided.
1960
+
1961
+ See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
1962
+ """
1963
+ if payload_fields_to_index is not None:
1964
+ for payload_index in payload_fields_to_index:
1965
+ # self._client is initialized at this point
1966
+ # since _initialize_client() is called before this method is executed
1967
+
1968
+ assert self._client is not None
1969
+ self._client.create_payload_index(
1970
+ collection_name=collection_name,
1971
+ field_name=payload_index["field_name"],
1972
+ field_schema=payload_index["field_schema"],
1973
+ )
1974
+
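For context, each entry of `payload_fields_to_index` must carry the two keys read above, `field_name` and `field_schema`. A sketch with an illustrative keyword field, passed at store construction, from where it reaches `_set_up_collection`:

```python
document_store = QdrantDocumentStore(
    ":memory:",
    embedding_dim=5,
    recreate_index=True,
    # one payload index is created per entry
    payload_fields_to_index=[
        {"field_name": "meta.category", "field_schema": "keyword"},
    ],
)
```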
1975
+ async def _create_payload_index_async(
1976
+ self, collection_name: str, payload_fields_to_index: list[dict] | None = None
1977
+ ) -> None:
811
1978
  """
812
- Create payload index for the collection if payload_fields_to_index is provided
1979
+ Asynchronously create payload indexes for the collection if `payload_fields_to_index` is provided.
1980
+
813
1981
  See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
814
1982
  """
815
1983
  if payload_fields_to_index is not None:
816
1984
  for payload_index in payload_fields_to_index:
817
- self.client.create_payload_index(
1985
+ # self._async_client is initialized at this point
1986
+ # since _initialize_async_client() is called before this method is executed
1987
+ assert self._async_client is not None
1988
+
1989
+ await self._async_client.create_payload_index(
818
1990
  collection_name=collection_name,
819
1991
  field_name=payload_index["field_name"],
820
1992
  field_schema=payload_index["field_schema"],
@@ -829,10 +2001,11 @@ class QdrantDocumentStore:
829
2001
  use_sparse_embeddings: bool,
830
2002
  sparse_idf: bool,
831
2003
  on_disk: bool = False,
832
- payload_fields_to_index: Optional[List[dict]] = None,
833
- ):
2004
+ payload_fields_to_index: list[dict] | None = None,
2005
+ ) -> None:
834
2006
  """
835
2007
  Sets up the Qdrant collection with the specified parameters.
2008
+
836
2009
  :param collection_name:
837
2010
  The name of the collection to set up.
838
2011
  :param embedding_dim:
@@ -856,9 +2029,13 @@ class QdrantDocumentStore:
856
2029
  If the collection exists with a different similarity measure or embedding dimension.
857
2030
 
858
2031
  """
2032
+
2033
+ self._initialize_client()
2034
+ assert self._client is not None
2035
+
859
2036
  distance = self.get_distance(similarity)
860
2037
 
861
- if recreate_collection or not self.client.collection_exists(collection_name):
2038
+ if recreate_collection or not self._client.collection_exists(collection_name):
862
2039
  # There is no need to verify the current configuration of that
863
2040
  # collection. It might be just recreated again or does not exist yet.
864
2041
  self.recreate_collection(
@@ -868,66 +2045,76 @@ class QdrantDocumentStore:
868
2045
  self._create_payload_index(collection_name, payload_fields_to_index)
869
2046
  return
870
2047
 
871
- collection_info = self.client.get_collection(collection_name)
2048
+ collection_info = self._client.get_collection(collection_name)
872
2049
 
873
- has_named_vectors = (
874
- isinstance(collection_info.config.params.vectors, dict)
875
- and DENSE_VECTORS_NAME in collection_info.config.params.vectors
876
- )
2050
+ self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
877
2051
 
878
- if self.use_sparse_embeddings and not has_named_vectors:
879
- msg = (
880
- f"Collection '{collection_name}' already exists in Qdrant, "
881
- f"but it has been originally created without sparse embedding vectors. "
882
- f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
883
- f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
884
- f"See `migrate_to_sparse_embeddings_support` function in "
885
- f"`haystack_integrations.document_stores.qdrant`."
886
- )
887
- raise QdrantStoreError(msg)
2052
+ async def _set_up_collection_async(
2053
+ self,
2054
+ collection_name: str,
2055
+ embedding_dim: int,
2056
+ recreate_collection: bool,
2057
+ similarity: str,
2058
+ use_sparse_embeddings: bool,
2059
+ sparse_idf: bool,
2060
+ on_disk: bool = False,
2061
+ payload_fields_to_index: list[dict] | None = None,
2062
+ ) -> None:
2063
+ """
2064
+ Asynchronously sets up the Qdrant collection with the specified parameters.
888
2065
 
889
- elif not self.use_sparse_embeddings and has_named_vectors:
890
- msg = (
891
- f"Collection '{collection_name}' already exists in Qdrant, "
892
- f"but it has been originally created with sparse embedding vectors."
893
- f"If you want to use that collection, please set `use_sparse_embeddings=True`."
894
- )
895
- raise QdrantStoreError(msg)
2066
+ :param collection_name:
2067
+ The name of the collection to set up.
2068
+ :param embedding_dim:
2069
+ The dimension of the embeddings.
2070
+ :param recreate_collection:
2071
+ Whether to recreate the collection if it already exists.
2072
+ :param similarity:
2073
+ The similarity measure to use.
2074
+ :param use_sparse_embeddings:
2075
+ Whether to use sparse embeddings.
2076
+ :param sparse_idf:
2077
+ Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
2078
+ :param on_disk:
2079
+ Whether to store the collection on disk.
2080
+ :param payload_fields_to_index:
2081
+ List of payload fields to index.
896
2082
 
897
- if self.use_sparse_embeddings:
898
- current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance
899
- current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
900
- else:
901
- current_distance = collection_info.config.params.vectors.distance
902
- current_vector_size = collection_info.config.params.vectors.size
2083
+ :raises QdrantStoreError:
2084
+ If the collection exists with incompatible settings.
2085
+ :raises ValueError:
2086
+ If the collection exists with a different similarity measure or embedding dimension.
903
2087
 
904
- if current_distance != distance:
905
- msg = (
906
- f"Collection '{collection_name}' already exists in Qdrant, "
907
- f"but it is configured with a similarity '{current_distance.name}'. "
908
- f"If you want to use that collection, but with a different "
909
- f"similarity, please set `recreate_collection=True` argument."
910
- )
911
- raise ValueError(msg)
2088
+ """
912
2089
 
913
- if current_vector_size != embedding_dim:
914
- msg = (
915
- f"Collection '{collection_name}' already exists in Qdrant, "
916
- f"but it is configured with a vector size '{current_vector_size}'. "
917
- f"If you want to use that collection, but with a different "
918
- f"vector size, please set `recreate_collection=True` argument."
2090
+ await self._initialize_async_client()
2091
+ assert self._async_client is not None
2092
+
2093
+ distance = self.get_distance(similarity)
2094
+
2095
+ if recreate_collection or not await self._async_client.collection_exists(collection_name):
2096
+ # There is no need to verify the current configuration of that
2097
+ # collection. It might be just recreated again or does not exist yet.
2098
+ await self.recreate_collection_async(
2099
+ collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
919
2100
  )
920
- raise ValueError(msg)
2101
+ # Create Payload index if payload_fields_to_index is provided
2102
+ await self._create_payload_index_async(collection_name, payload_fields_to_index)
2103
+ return
2104
+
2105
+ collection_info = await self._async_client.get_collection(collection_name)
2106
+
2107
+ self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
921
2108
 
922
2109
  def recreate_collection(
923
2110
  self,
924
2111
  collection_name: str,
925
- distance,
2112
+ distance: rest.Distance,
926
2113
  embedding_dim: int,
927
- on_disk: Optional[bool] = None,
928
- use_sparse_embeddings: Optional[bool] = None,
2114
+ on_disk: bool | None = None,
2115
+ use_sparse_embeddings: bool | None = None,
929
2116
  sparse_idf: bool = False,
930
- ):
2117
+ ) -> None:
931
2118
  """
932
2119
  Recreates the Qdrant collection with the specified parameters.
933
2120
 
@@ -944,96 +2131,356 @@ class QdrantDocumentStore:
944
2131
  :param sparse_idf:
945
2132
  Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
946
2133
  """
947
- if on_disk is None:
948
- on_disk = self.on_disk
2134
+ vectors_config, sparse_vectors_config = self._prepare_collection_config(
2135
+ embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
2136
+ )
2137
+ collection_params = self._prepare_collection_params()
949
2138
 
950
- if use_sparse_embeddings is None:
951
- use_sparse_embeddings = self.use_sparse_embeddings
2139
+ self._initialize_client()
2140
+ assert self._client is not None
952
2141
 
953
- # dense vectors configuration
954
- vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
2142
+ if self._client.collection_exists(collection_name):
2143
+ self._client.delete_collection(collection_name)
955
2144
 
956
- if use_sparse_embeddings:
957
- # in this case, we need to define named vectors
958
- vectors_config = {DENSE_VECTORS_NAME: vectors_config}
2145
+ self._client.create_collection(
2146
+ collection_name=collection_name,
2147
+ vectors_config=vectors_config,
2148
+ sparse_vectors_config=sparse_vectors_config,
2149
+ **collection_params,
2150
+ )
959
2151
 
960
- sparse_vectors_config = {
961
- SPARSE_VECTORS_NAME: rest.SparseVectorParams(
962
- index=rest.SparseIndexParams(
963
- on_disk=on_disk,
964
- ),
965
- modifier=rest.Modifier.IDF if sparse_idf else None,
966
- ),
967
- }
2152
+ async def recreate_collection_async(
2153
+ self,
2154
+ collection_name: str,
2155
+ distance: rest.Distance,
2156
+ embedding_dim: int,
2157
+ on_disk: bool | None = None,
2158
+ use_sparse_embeddings: bool | None = None,
2159
+ sparse_idf: bool = False,
2160
+ ) -> None:
2161
+ """
2162
+ Asynchronously recreates the Qdrant collection with the specified parameters.
2163
+
2164
+ :param collection_name:
2165
+ The name of the collection to recreate.
2166
+ :param distance:
2167
+ The distance metric to use for the collection.
2168
+ :param embedding_dim:
2169
+ The dimension of the embeddings.
2170
+ :param on_disk:
2171
+ Whether to store the collection on disk.
2172
+ :param use_sparse_embeddings:
2173
+ Whether to use sparse embeddings.
2174
+ :param sparse_idf:
2175
+ Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
2176
+ """
2177
+ vectors_config, sparse_vectors_config = self._prepare_collection_config(
2178
+ embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
2179
+ )
2180
+ collection_params = self._prepare_collection_params()
2181
+
2182
+ await self._initialize_async_client()
2183
+ assert self._async_client is not None
968
2184
 
969
- if self.client.collection_exists(collection_name):
970
- self.client.delete_collection(collection_name)
2185
+ if await self._async_client.collection_exists(collection_name):
2186
+ await self._async_client.delete_collection(collection_name)
971
2187
 
972
- self.client.create_collection(
2188
+ await self._async_client.create_collection(
973
2189
  collection_name=collection_name,
974
2190
  vectors_config=vectors_config,
975
- sparse_vectors_config=sparse_vectors_config if use_sparse_embeddings else None,
976
- shard_number=self.shard_number,
977
- replication_factor=self.replication_factor,
978
- write_consistency_factor=self.write_consistency_factor,
979
- on_disk_payload=self.on_disk_payload,
980
- hnsw_config=self.hnsw_config,
981
- optimizers_config=self.optimizers_config,
982
- wal_config=self.wal_config,
983
- quantization_config=self.quantization_config,
984
- init_from=self.init_from,
2191
+ sparse_vectors_config=sparse_vectors_config,
2192
+ **collection_params,
985
2193
  )
986
2194
 
987
2195
  def _handle_duplicate_documents(
988
2196
  self,
989
- documents: List[Document],
990
- index: Optional[str] = None,
991
- policy: DuplicatePolicy = None,
992
- ):
2197
+ documents: list[Document],
2198
+ policy: DuplicatePolicy | None = None,
2199
+ ) -> list[Document]:
993
2200
  """
994
2201
  Checks whether any of the passed documents already exists in the chosen index and returns a list of
995
2202
  documents that are not in the index yet.
996
2203
 
997
2204
  :param documents: A list of Haystack Document objects.
998
- :param index: name of the index
999
2205
  :param policy: The duplicate policy to use when writing documents.
1000
2206
  :returns: A list of Haystack Document objects.
1001
2207
  """
1002
2208
 
1003
- index = index or self.index
1004
2209
  if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
1005
- documents = self._drop_duplicate_documents(documents, index)
1006
- documents_found = self.get_documents_by_id(ids=[doc.id for doc in documents], index=index)
1007
- ids_exist_in_db: List[str] = [doc.id for doc in documents_found]
2210
+ documents = self._drop_duplicate_documents(documents)
2211
+ documents_found = self.get_documents_by_id(ids=[doc.id for doc in documents])
2212
+ ids_exist_in_db: list[str] = [doc.id for doc in documents_found]
1008
2213
 
1009
2214
  if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
1010
- msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{index}'."
2215
+ msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
1011
2216
  raise DuplicateDocumentError(msg)
1012
2217
 
1013
2218
  documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
1014
2219
 
1015
2220
  return documents
1016
2221
 
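How the policies above behave when writing, sketched with the Haystack types this module already imports; the document content and embedding are illustrative:

```python
from haystack.dataclasses import Document
from haystack.document_stores.errors import DuplicateDocumentError
from haystack.document_stores.types import DuplicatePolicy

doc = Document(content="This is first", embedding=[0.0] * 5)
document_store.write_documents([doc], policy=DuplicatePolicy.SKIP)  # duplicate silently dropped
try:
    document_store.write_documents([doc], policy=DuplicatePolicy.FAIL)
except DuplicateDocumentError:
    pass  # raised because the document id already exists in the index
```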
1017
- def _drop_duplicate_documents(self, documents: List[Document], index: Optional[str] = None) -> List[Document]:
2222
+ async def _handle_duplicate_documents_async(
2223
+ self,
2224
+ documents: list[Document],
2225
+ policy: DuplicatePolicy | None = None,
2226
+ ) -> list[Document]:
1018
2227
  """
1019
- Drop duplicate documents based on same hash ID.
2228
+ Asynchronously checks whether any of the passed documents already exists
2229
+ in the chosen index and returns a list of
2230
+ documents that are not in the index yet.
1020
2231
 
1021
2232
  :param documents: A list of Haystack Document objects.
1022
- :param index: Name of the index.
2233
+ :param policy: The duplicate policy to use when writing documents.
1023
2234
  :returns: A list of Haystack Document objects.
1024
2235
  """
1025
- _hash_ids: Set = set()
1026
- _documents: List[Document] = []
2236
+
2237
+ if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
2238
+ documents = self._drop_duplicate_documents(documents)
2239
+ documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
2240
+ ids_exist_in_db: list[str] = [doc.id for doc in documents_found]
2241
+
2242
+ if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
2243
+ msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
2244
+ raise DuplicateDocumentError(msg)
2245
+
2246
+ documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
2247
+
2248
+ return documents
2249
+
2250
+ def _drop_duplicate_documents(self, documents: list[Document]) -> list[Document]:
2251
+ """
2252
+ Drop duplicate documents that share the same hash ID.
2253
+
2254
+ """
2255
+ _hash_ids: set = set()
2256
+ _documents: list[Document] = []
1027
2257
 
1028
2258
  for document in documents:
1029
2259
  if document.id in _hash_ids:
1030
2260
  logger.info(
1031
- "Duplicate Documents: Document with id '%s' already exists in index '%s'",
1032
- document.id,
1033
- index or self.index,
2261
+ "Duplicate Documents: Document with id '{document_id}' already exists in index '{index}'",
2262
+ document_id=document.id,
2263
+ index=self.index,
1034
2264
  )
1035
2265
  continue
1036
2266
  _documents.append(document)
1037
2267
  _hash_ids.add(document.id)
1038
2268
 
1039
2269
  return _documents
2270
+
2271
+ def _prepare_collection_params(self) -> dict[str, Any]:
2272
+ """
2273
+ Prepares the common parameters for collection creation.
2274
+ """
2275
+ return {
2276
+ "shard_number": self.shard_number,
2277
+ "replication_factor": self.replication_factor,
2278
+ "write_consistency_factor": self.write_consistency_factor,
2279
+ "on_disk_payload": self.on_disk_payload,
2280
+ "hnsw_config": self.hnsw_config,
2281
+ "optimizers_config": self.optimizers_config,
2282
+ "wal_config": self.wal_config,
2283
+ "quantization_config": self.quantization_config,
2284
+ }
2285
+
2286
+ def _prepare_client_params(self) -> dict[str, Any]:
2287
+ """
2288
+ Prepares the common parameters for client initialization.
2289
+
2290
+ """
2291
+ return {
2292
+ "location": self.location,
2293
+ "url": self.url,
2294
+ "port": self.port,
2295
+ "grpc_port": self.grpc_port,
2296
+ "prefer_grpc": self.prefer_grpc,
2297
+ "https": self.https,
2298
+ "api_key": self.api_key.resolve_value() if self.api_key else None,
2299
+ "prefix": self.prefix,
2300
+ "timeout": self.timeout,
2301
+ "host": self.host,
2302
+ "path": self.path,
2303
+ # NOTE: We purposefully expand the fields of self.metadata to avoid modifying the original self.metadata
2304
+ # class attribute. For example, the resolved api key is added to metadata by the QdrantClient class
2305
+ # when using a hosted Qdrant service, which means running to_dict() exposes the api key.
2306
+ "metadata": {**self.metadata},
2307
+ "force_disable_check_same_thread": self.force_disable_check_same_thread,
2308
+ }
2309
+
2310
+ def _prepare_collection_config(
2311
+ self,
2312
+ embedding_dim: int,
2313
+ distance: rest.Distance,
2314
+ on_disk: bool | None = None,
2315
+ use_sparse_embeddings: bool | None = None,
2316
+ sparse_idf: bool = False,
2317
+ ) -> tuple[dict[str, rest.VectorParams] | rest.VectorParams, dict[str, rest.SparseVectorParams] | None]:
2318
+ """
2319
+ Prepares the configuration for creating or recreating a Qdrant collection.
2320
+
2321
+ """
2322
+ if on_disk is None:
2323
+ on_disk = self.on_disk
2324
+
2325
+ if use_sparse_embeddings is None:
2326
+ use_sparse_embeddings = self.use_sparse_embeddings
2327
+
2328
+ # dense vectors configuration
2329
+ base_vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
2330
+ vectors_config: rest.VectorParams | dict[str, rest.VectorParams] = base_vectors_config
2331
+
2332
+ sparse_vectors_config: dict[str, rest.SparseVectorParams] | None = None
2333
+
2334
+ if use_sparse_embeddings:
2335
+ # in this case, we need to define named vectors
2336
+ vectors_config = {DENSE_VECTORS_NAME: base_vectors_config}
2337
+
2338
+ sparse_vectors_config = {
2339
+ SPARSE_VECTORS_NAME: rest.SparseVectorParams(
2340
+ index=rest.SparseIndexParams(
2341
+ on_disk=on_disk,
2342
+ ),
2343
+ modifier=rest.Modifier.IDF if sparse_idf else None,
2344
+ ),
2345
+ }
2346
+
2347
+ return vectors_config, sparse_vectors_config
2348
+
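The helper above returns two differently shaped configurations depending on sparse support. A sketch of both outcomes; the dimension and distance are illustrative:

```python
from qdrant_client.http import models as rest

# without sparse embeddings: a single unnamed VectorParams, no sparse config
dense_only, no_sparse = document_store._prepare_collection_config(
    embedding_dim=5, distance=rest.Distance.COSINE, use_sparse_embeddings=False
)
assert no_sparse is None

# with sparse embeddings: named vectors keyed by DENSE_VECTORS_NAME,
# plus a SPARSE_VECTORS_NAME entry in the sparse config
named, sparse = document_store._prepare_collection_config(
    embedding_dim=5, distance=rest.Distance.COSINE, use_sparse_embeddings=True
)
assert isinstance(named, dict) and isinstance(sparse, dict)
```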
2349
+ @staticmethod
2350
+ def _validate_filters(filters: dict[str, Any] | rest.Filter | None = None) -> None:
2351
+ """
2352
+ Validates the filters provided for querying.
2353
+
2354
+ :param filters: Filters to validate. Can be a dictionary or an instance of `qdrant_client.http.models.Filter`.
2355
+ :raises ValueError: If the filters are not in the correct format or syntax.
2356
+ """
2357
+ if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
2358
+ msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
2359
+ raise ValueError(msg)
2360
+
2361
+ if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
2362
+ msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
2363
+ raise ValueError(msg)
2364
+
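Filters that pass this validation are either Haystack-style dictionaries with a top-level `operator` key, or native `rest.Filter` objects, which are passed through untouched. A sketch; the field and value are illustrative:

```python
from qdrant_client.http import models as rest

valid_dict = {"field": "meta.genre", "operator": "==", "value": "news"}
valid_native = rest.Filter(
    must=[rest.FieldCondition(key="meta.genre", match=rest.MatchValue(value="news"))]
)
# both shapes pass; a dict without an "operator" key raises ValueError
```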
2365
+ def _process_query_point_results(
2366
+ self, results: list[rest.ScoredPoint], scale_score: bool = False
2367
+ ) -> list[Document]:
2368
+ """
2369
+ Processes query results from Qdrant.
2370
+ """
2371
+ documents = [
2372
+ convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
2373
+ for point in results
2374
+ ]
2375
+
2376
+ if scale_score:
2377
+ for document in documents:
2378
+ score = document.score
2379
+ if score is None:
2380
+ continue
2381
+ if self.similarity == "cosine":
2382
+ score = (score + 1) / 2
2383
+ else:
2384
+ score = float(1 / (1 + exp(-score / 100)))
2385
+ document.score = score
2386
+
2387
+ return documents
2388
+
2389
+ def _process_group_results(self, groups: list[rest.PointGroup]) -> list[Document]:
2390
+ """
2391
+ Processes grouped query results from Qdrant.
2392
+
2393
+ """
2394
+ if not groups:
2395
+ return []
2396
+
2397
+ return [
2398
+ convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
2399
+ for group in groups
2400
+ for point in group.hits
2401
+ ]
2402
+
2403
+ def _validate_collection_compatibility(
2404
+ self,
2405
+ collection_name: str,
2406
+ collection_info: rest.CollectionInfo,
2407
+ distance: rest.Distance,
2408
+ embedding_dim: int,
2409
+ ) -> None:
2410
+ """
2411
+ Validates that an existing collection is compatible with the current configuration.
2412
+ """
2413
+ vectors_config = collection_info.config.params.vectors
2414
+
2415
+ if vectors_config is None:
2416
+ msg = f"Collection '{collection_name}' has no vector configuration."
2417
+ raise QdrantStoreError(msg)
2418
+
2419
+ has_named_vectors = isinstance(vectors_config, dict)
2420
+
2421
+ if has_named_vectors and DENSE_VECTORS_NAME not in vectors_config:
2422
+ msg = (
2423
+ f"Collection '{collection_name}' already exists in Qdrant, "
2424
+ f"but it was originally created outside of Haystack and is not supported. "
2425
+ f"If possible, you should create a new Document Store with Haystack. "
2426
+ f"In case you want to migrate the existing collection, see an example script in "
2427
+ f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
2428
+ f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
2429
+ )
2430
+ raise QdrantStoreError(msg)
2431
+
2432
+ if self.use_sparse_embeddings and not has_named_vectors:
2433
+ msg = (
2434
+ f"Collection '{collection_name}' already exists in Qdrant, "
2435
+ f"but it was originally created without sparse embedding vectors. "
2436
+ f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
2437
+ f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
2438
+ f"See `migrate_to_sparse_embeddings_support` function in "
2439
+ f"`haystack_integrations.document_stores.qdrant`."
2440
+ )
2441
+ raise QdrantStoreError(msg)
2442
+
2443
+ if not self.use_sparse_embeddings and has_named_vectors:
2444
+ msg = (
2445
+ f"Collection '{collection_name}' already exists in Qdrant, "
2446
+ f"but it was originally created with sparse embedding vectors. "
2447
+ f"If you want to use that collection, please set `use_sparse_embeddings=True`."
2448
+ )
2449
+ raise QdrantStoreError(msg)
2450
+
2451
+ # Get current distance and vector size based on collection configuration
2452
+ if self.use_sparse_embeddings:
2453
+ if not isinstance(vectors_config, dict):
2454
+ msg = f"Collection '{collection_name}' has invalid vector configuration for sparse embeddings."
2455
+ raise QdrantStoreError(msg)
2456
+
2457
+ dense_vector_config = vectors_config[DENSE_VECTORS_NAME]
2458
+ current_distance = dense_vector_config.distance
2459
+ current_vector_size = dense_vector_config.size
2460
+ else:
2461
+ if isinstance(vectors_config, dict):
2462
+ msg = f"Collection '{collection_name}' has invalid vector configuration for dense embeddings only."
2463
+ raise QdrantStoreError(msg)
2464
+
2465
+ current_distance = vectors_config.distance
2466
+ current_vector_size = vectors_config.size
2467
+
2468
+ # Validate distance metric
2469
+ if current_distance != distance:
2470
+ msg = (
2471
+ f"Collection '{collection_name}' already exists in Qdrant, "
2472
+ f"but it is configured with a similarity '{current_distance.name}'. "
2473
+ f"If you want to use that collection, but with a different "
2474
+ f"similarity, please set `recreate_collection=True` argument."
2475
+ )
2476
+ raise ValueError(msg)
2477
+
2478
+ # Validate embedding dimension
2479
+ if current_vector_size != embedding_dim:
2480
+ msg = (
2481
+ f"Collection '{collection_name}' already exists in Qdrant, "
2482
+ f"but it is configured with a vector size '{current_vector_size}'. "
2483
+ f"If you want to use that collection, but with a different "
2484
+ f"vector size, please set `recreate_collection=True` argument."
2485
+ )
2486
+ raise ValueError(msg)