qdrant-haystack 9.0.0__py3-none-any.whl → 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import inspect
2
2
  from itertools import islice
3
- from typing import Any, ClassVar, Dict, Generator, List, Optional, Set, Union
3
+ from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Union
4
4
 
5
5
  import numpy as np
6
6
  import qdrant_client
@@ -216,6 +216,7 @@ class QdrantDocumentStore:
216
216
  """
217
217
 
218
218
  self._client = None
219
+ self._async_client = None
219
220
 
220
221
  # Store the Qdrant client specific attributes
221
222
  self.location = location
@@ -257,24 +258,10 @@ class QdrantDocumentStore:
257
258
  self.write_batch_size = write_batch_size
258
259
  self.scroll_size = scroll_size
259
260
 
260
- @property
261
- def client(self):
262
- if not self._client:
263
- self._client = qdrant_client.QdrantClient(
264
- location=self.location,
265
- url=self.url,
266
- port=self.port,
267
- grpc_port=self.grpc_port,
268
- prefer_grpc=self.prefer_grpc,
269
- https=self.https,
270
- api_key=self.api_key.resolve_value() if self.api_key else None,
271
- prefix=self.prefix,
272
- timeout=self.timeout,
273
- host=self.host,
274
- path=self.path,
275
- metadata=self.metadata,
276
- force_disable_check_same_thread=self.force_disable_check_same_thread,
277
- )
261
+ def _initialize_client(self):
262
+ if self._client is None:
263
+ client_params = self._prepare_client_params()
264
+ self._client = qdrant_client.QdrantClient(**client_params)
278
265
  # Make sure the collection is properly set up
279
266
  self._set_up_collection(
280
267
  self.index,
@@ -286,14 +273,52 @@ class QdrantDocumentStore:
286
273
  self.on_disk,
287
274
  self.payload_fields_to_index,
288
275
  )
289
- return self._client
276
+
277
+ async def _initialize_async_client(self):
278
+ """
279
+ Initializes the asynchronous Qdrant client, if it has not been created yet, and sets up the collection.
280
+ """
281
+ if self._async_client is None:
282
+ client_params = self._prepare_client_params()
283
+ self._async_client = qdrant_client.AsyncQdrantClient(
284
+ **client_params,
285
+ )
286
+ await self._set_up_collection_async(
287
+ self.index,
288
+ self.embedding_dim,
289
+ self.recreate_index,
290
+ self.similarity,
291
+ self.use_sparse_embeddings,
292
+ self.sparse_idf,
293
+ self.on_disk,
294
+ self.payload_fields_to_index,
295
+ )
290
296
 
291
297
  def count_documents(self) -> int:
292
298
  """
293
299
  Returns the number of documents present in the Document Store.
294
300
  """
301
+ self._initialize_client()
302
+ assert self._client is not None
303
+ try:
304
+ response = self._client.count(
305
+ collection_name=self.index,
306
+ )
307
+ return response.count
308
+ except (UnexpectedResponse, ValueError):
309
+ # Qdrant local raises ValueError if the collection is not found, but
310
+ # with the remote server UnexpectedResponse is raised. Until that's unified,
311
+ # we need to catch both.
312
+ return 0
313
+
314
+ async def count_documents_async(self) -> int:
315
+ """
316
+ Asynchronously returns the number of documents present in the document store.
317
+ """
318
+ await self._initialize_async_client()
319
+ assert self._async_client is not None
295
320
  try:
296
- response = self.client.count(
321
+ response = await self._async_client.count(
297
322
  collection_name=self.index,
298
323
  )
299
324
  return response.count
@@ -316,19 +341,29 @@ class QdrantDocumentStore:
316
341
  :param filters: The filters to apply to the document list.
317
342
  :returns: A list of documents that match the given filters.
318
343
  """
319
- if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
320
- msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
321
- raise ValueError(msg)
344
+ # No need to initialize client here as _get_documents_generator
345
+ # will handle client initialization internally
322
346
 
323
- if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
324
- msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
325
- raise ValueError(msg)
347
+ self._validate_filters(filters)
326
348
  return list(
327
- self.get_documents_generator(
349
+ self._get_documents_generator(
328
350
  filters,
329
351
  )
330
352
  )
331
353
 
354
+ async def filter_documents_async(
355
+ self,
356
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
357
+ ) -> List[Document]:
358
+ """
359
+ Asynchronously returns the documents that match the provided filters.
360
+ """
361
+ # No need to initialize client here as _get_documents_generator_async
362
+ # will handle client initialization internally
363
+
364
+ self._validate_filters(filters)
365
+ return [doc async for doc in self._get_documents_generator_async(filters)]
366
+
332
367
  def write_documents(
333
368
  self,
334
369
  documents: List[Document],
@@ -347,13 +382,14 @@ class QdrantDocumentStore:
347
382
 
348
383
  :returns: The number of documents written to the document store.
349
384
  """
385
+
386
+ self._initialize_client()
387
+ assert self._client is not None
388
+
350
389
  for doc in documents:
351
390
  if not isinstance(doc, Document):
352
391
  msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
353
392
  raise ValueError(msg)
354
- self._set_up_collection(
355
- self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
356
- )
357
393
 
358
394
  if len(documents) == 0:
359
395
  logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
@@ -372,7 +408,61 @@ class QdrantDocumentStore:
372
408
  use_sparse_embeddings=self.use_sparse_embeddings,
373
409
  )
374
410
 
375
- self.client.upsert(
411
+ self._client.upsert(
412
+ collection_name=self.index,
413
+ points=batch,
414
+ wait=self.wait_result_from_api,
415
+ )
416
+
417
+ progress_bar.update(self.write_batch_size)
418
+ return len(document_objects)
419
+
420
+ async def write_documents_async(
421
+ self,
422
+ documents: List[Document],
423
+ policy: DuplicatePolicy = DuplicatePolicy.FAIL,
424
+ ) -> int:
425
+ """
426
+ Asynchronously writes documents to Qdrant using the specified policy.
427
+ The QdrantDocumentStore can handle duplicate documents based on the given policy.
428
+ The available policies are:
429
+ - `FAIL`: The operation will raise an error if any document already exists.
430
+ - `OVERWRITE`: Existing documents will be overwritten with the new ones.
431
+ - `SKIP`: Existing documents will be skipped, and only new documents will be added.
432
+
433
+ :param documents: A list of Document objects to write to Qdrant.
434
+ :param policy: The policy for handling duplicate documents.
435
+
436
+ :returns: The number of documents written to the document store.
437
+ """
438
+
439
+ await self._initialize_async_client()
440
+ assert self._async_client is not None
441
+
442
+ for doc in documents:
443
+ if not isinstance(doc, Document):
444
+ msg = f"""DocumentStore.write_documents_async() expects a list of
445
+ Documents but got an element of {type(doc)}."""
446
+ raise ValueError(msg)
447
+
448
+ if len(documents) == 0:
449
+ logger.warning("Calling QdrantDocumentStore.write_documents_async() with empty list")
450
+ return 0
451
+
452
+ document_objects = await self._handle_duplicate_documents_async(
453
+ documents=documents,
454
+ policy=policy,
455
+ )
456
+
457
+ batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
458
+ with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
459
+ for document_batch in batched_documents:
460
+ batch = convert_haystack_documents_to_qdrant_points(
461
+ document_batch,
462
+ use_sparse_embeddings=self.use_sparse_embeddings,
463
+ )
464
+
465
+ await self._async_client.upsert(
376
466
  collection_name=self.index,
377
467
  points=batch,
378
468
  wait=self.wait_result_from_api,
@@ -387,9 +477,13 @@ class QdrantDocumentStore:
387
477
 
388
478
  :param document_ids: the document ids to delete
389
479
  """
480
+
481
+ self._initialize_client()
482
+ assert self._client is not None
483
+
390
484
  ids = [convert_id(_id) for _id in document_ids]
391
485
  try:
392
- self.client.delete(
486
+ self._client.delete(
393
487
  collection_name=self.index,
394
488
  points_selector=ids,
395
489
  wait=self.wait_result_from_api,
@@ -399,6 +493,28 @@ class QdrantDocumentStore:
399
493
  "Called QdrantDocumentStore.delete_documents() on a non-existing ID",
400
494
  )
401
495
 
496
+ async def delete_documents_async(self, document_ids: List[str]) -> None:
497
+ """
498
+ Asynchronously deletes documents that match the provided `document_ids` from the document store.
499
+
500
+ :param document_ids: the document ids to delete
501
+ """
502
+
503
+ await self._initialize_async_client()
504
+ assert self._async_client is not None
505
+
506
+ ids = [convert_id(_id) for _id in document_ids]
507
+ try:
508
+ await self._async_client.delete(
509
+ collection_name=self.index,
510
+ points_selector=ids,
511
+ wait=self.wait_result_from_api,
512
+ )
513
+ except KeyError:
514
+ logger.warning(
515
+ "Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
516
+ )
517
+
402
518
  @classmethod
403
519
  def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore":
404
520
  """
@@ -429,7 +545,7 @@ class QdrantDocumentStore:
429
545
  **init_params,
430
546
  )
431
547
 
432
- def get_documents_generator(
548
+ def _get_documents_generator(
433
549
  self,
434
550
  filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
435
551
  ) -> Generator[Document, None, None]:
@@ -440,13 +556,53 @@ class QdrantDocumentStore:
440
556
  :returns: A generator that yields documents retrieved from Qdrant.
441
557
  """
442
558
 
559
+ self._initialize_client()
560
+ assert self._client is not None
561
+
562
+ index = self.index
563
+ qdrant_filters = convert_filters_to_qdrant(filters)
564
+
565
+ next_offset = None
566
+ stop_scrolling = False
567
+ while not stop_scrolling:
568
+ records, next_offset = self._client.scroll(
569
+ collection_name=index,
570
+ scroll_filter=qdrant_filters,
571
+ limit=self.scroll_size,
572
+ offset=next_offset,
573
+ with_payload=True,
574
+ with_vectors=True,
575
+ )
576
+ stop_scrolling = next_offset is None or (
577
+ isinstance(next_offset, grpc.PointId) and next_offset.num == 0 and next_offset.uuid == ""
578
+ )
579
+
580
+ for record in records:
581
+ yield convert_qdrant_point_to_haystack_document(
582
+ record, use_sparse_embeddings=self.use_sparse_embeddings
583
+ )
584
+
585
+ async def _get_documents_generator_async(
586
+ self,
587
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
588
+ ) -> AsyncGenerator[Document, None]:
589
+ """
590
+ Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
591
+
592
+ :param filters: Filters applied to the retrieved documents.
593
+ :returns: An asynchronous generator that yields documents retrieved from Qdrant.
594
+ """
595
+
596
+ await self._initialize_async_client()
597
+ assert self._async_client is not None
598
+
443
599
  index = self.index
444
600
  qdrant_filters = convert_filters_to_qdrant(filters)
445
601
 
446
602
  next_offset = None
447
603
  stop_scrolling = False
448
604
  while not stop_scrolling:
449
- records, next_offset = self.client.scroll(
605
+ records, next_offset = await self._async_client.scroll(
450
606
  collection_name=index,
451
607
  scroll_filter=qdrant_filters,
452
608
  limit=self.scroll_size,
@@ -479,8 +635,44 @@ class QdrantDocumentStore:
479
635
  """
480
636
  documents: List[Document] = []
481
637
 
638
+ self._initialize_client()
639
+ assert self._client is not None
640
+
641
+ ids = [convert_id(_id) for _id in ids]
642
+ records = self._client.retrieve(
643
+ collection_name=self.index,
644
+ ids=ids,
645
+ with_payload=True,
646
+ with_vectors=True,
647
+ )
648
+
649
+ for record in records:
650
+ documents.append(
651
+ convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
652
+ )
653
+ return documents
654
+
655
+ async def get_documents_by_id_async(
656
+ self,
657
+ ids: List[str],
658
+ ) -> List[Document]:
659
+ """
660
+ Asynchronously retrieves documents from Qdrant by their IDs.
661
+
662
+ :param ids:
663
+ A list of document IDs to retrieve.
664
+ :param index:
665
+ The name of the index to retrieve documents from.
666
+ :returns:
667
+ A list of documents.
668
+ """
669
+ documents: List[Document] = []
670
+
671
+ await self._initialize_async_client()
672
+ assert self._async_client is not None
673
+
482
674
  ids = [convert_id(_id) for _id in ids]
483
- records = self.client.retrieve(
675
+ records = await self._async_client.retrieve(
484
676
  collection_name=self.index,
485
677
  ids=ids,
486
678
  with_payload=True,
@@ -526,6 +718,8 @@ class QdrantDocumentStore:
526
718
  :raises QdrantStoreError:
527
719
  If the Document Store was initialized with `use_sparse_embeddings=False`.
528
720
  """
721
+ self._initialize_client()
722
+ assert self._client is not None
529
723
 
530
724
  if not self.use_sparse_embeddings:
531
725
  message = (
@@ -538,7 +732,7 @@ class QdrantDocumentStore:
538
732
  query_indices = query_sparse_embedding.indices
539
733
  query_values = query_sparse_embedding.values
540
734
  if group_by:
541
- groups = self.client.query_points_groups(
735
+ groups = self._client.query_points_groups(
542
736
  collection_name=self.index,
543
737
  query=rest.SparseVector(
544
738
  indices=query_indices,
@@ -552,17 +746,9 @@ class QdrantDocumentStore:
552
746
  with_vectors=return_embedding,
553
747
  score_threshold=score_threshold,
554
748
  ).groups
555
- results = (
556
- [
557
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
558
- for group in groups
559
- for point in group.hits
560
- ]
561
- if groups
562
- else []
563
- )
749
+ return self._process_group_results(groups)
564
750
  else:
565
- points = self.client.query_points(
751
+ points = self._client.query_points(
566
752
  collection_name=self.index,
567
753
  query=rest.SparseVector(
568
754
  indices=query_indices,
@@ -574,16 +760,7 @@ class QdrantDocumentStore:
574
760
  with_vectors=return_embedding,
575
761
  score_threshold=score_threshold,
576
762
  ).points
577
- results = [
578
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
579
- for point in points
580
- ]
581
- if scale_score:
582
- for document in results:
583
- score = document.score
584
- score = float(1 / (1 + np.exp(-score / 100)))
585
- document.score = score
586
- return results
763
+ return self._process_query_point_results(points, scale_score=scale_score)
587
764
 
588
765
  def _query_by_embedding(
589
766
  self,
@@ -615,9 +792,12 @@ class QdrantDocumentStore:
615
792
 
616
793
  :returns: List of documents that are most similar to `query_embedding`.
617
794
  """
795
+ self._initialize_client()
796
+ assert self._client is not None
797
+
618
798
  qdrant_filters = convert_filters_to_qdrant(filters)
619
799
  if group_by:
620
- groups = self.client.query_points_groups(
800
+ groups = self._client.query_points_groups(
621
801
  collection_name=self.index,
622
802
  query=query_embedding,
623
803
  using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -628,17 +808,10 @@ class QdrantDocumentStore:
628
808
  with_vectors=return_embedding,
629
809
  score_threshold=score_threshold,
630
810
  ).groups
631
- results = (
632
- [
633
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
634
- for group in groups
635
- for point in group.hits
636
- ]
637
- if groups
638
- else []
639
- )
811
+ return self._process_group_results(groups)
812
+
640
813
  else:
641
- points = self.client.query_points(
814
+ points = self._client.query_points(
642
815
  collection_name=self.index,
643
816
  query=query_embedding,
644
817
  using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -647,20 +820,7 @@ class QdrantDocumentStore:
647
820
  with_vectors=return_embedding,
648
821
  score_threshold=score_threshold,
649
822
  ).points
650
- results = [
651
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
652
- for point in points
653
- ]
654
-
655
- if scale_score:
656
- for document in results:
657
- score = document.score
658
- if self.similarity == "cosine":
659
- score = (score + 1) / 2
660
- else:
661
- score = float(1 / (1 + np.exp(-score / 100)))
662
- document.score = score
663
- return results
823
+ return self._process_query_point_results(points, scale_score=scale_score)
664
824
 
665
825
  def _query_hybrid(
666
826
  self,
@@ -701,6 +861,10 @@ class QdrantDocumentStore:
701
861
 
702
862
  # This implementation is based on the code from the Python Qdrant client:
703
863
  # https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
864
+
865
+ self._initialize_client()
866
+ assert self._client is not None
867
+
704
868
  if not self.use_sparse_embeddings:
705
869
  message = (
706
870
  "You are trying to query using sparse embeddings, but the Document Store "
@@ -712,7 +876,7 @@ class QdrantDocumentStore:
712
876
 
713
877
  try:
714
878
  if group_by:
715
- groups = self.client.query_points_groups(
879
+ groups = self._client.query_points_groups(
716
880
  collection_name=self.index,
717
881
  prefetch=[
718
882
  rest.Prefetch(
@@ -738,7 +902,7 @@ class QdrantDocumentStore:
738
902
  with_vectors=return_embedding,
739
903
  ).groups
740
904
  else:
741
- points = self.client.query_points(
905
+ points = self._client.query_points(
742
906
  collection_name=self.index,
743
907
  prefetch=[
744
908
  rest.Prefetch(
@@ -767,71 +931,339 @@ class QdrantDocumentStore:
767
931
  raise QdrantStoreError(msg) from e
768
932
 
769
933
  if group_by:
770
- results = (
771
- [
772
- convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
773
- for group in groups
774
- for point in group.hits
775
- ]
776
- if groups
777
- else []
778
- )
934
+ return self._process_group_results(groups)
779
935
  else:
780
- results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
936
+ return self._process_query_point_results(points)
781
937
 
782
- return results
783
-
784
- def get_distance(self, similarity: str) -> rest.Distance:
938
+ async def _query_by_sparse_async(
939
+ self,
940
+ query_sparse_embedding: SparseEmbedding,
941
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
942
+ top_k: int = 10,
943
+ scale_score: bool = False,
944
+ return_embedding: bool = False,
945
+ score_threshold: Optional[float] = None,
946
+ group_by: Optional[str] = None,
947
+ group_size: Optional[int] = None,
948
+ ) -> List[Document]:
785
949
  """
786
- Retrieves the distance metric for the specified similarity measure.
950
+ Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
951
+
952
+ :param query_sparse_embedding: Sparse embedding of the query.
953
+ :param filters: Filters applied to the retrieved documents.
954
+ :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
955
+ groups to return.
956
+ :param scale_score: Whether to scale the scores of the retrieved documents.
957
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
958
+ :param score_threshold: A minimal score threshold for the result.
959
+ Score of the returned result might be higher or smaller than the threshold
960
+ depending on the Distance function used.
961
+ E.g. for cosine similarity only higher scores will be returned.
962
+ :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
963
+ value, all values will be used for grouping. One point can be in multiple groups.
964
+ :param group_size: Maximum amount of points to return per group. Default is 3.
965
+
966
+ :returns: List of documents that are most similar to `query_sparse_embedding`.
787
967
 
788
- :param similarity:
789
- The similarity measure to retrieve the distance.
790
- :returns:
791
- The corresponding rest.Distance object.
792
968
  :raises QdrantStoreError:
793
- If the provided similarity measure is not supported.
969
+ If the Document Store was initialized with `use_sparse_embeddings=False`.
794
970
  """
795
- try:
796
- return self.SIMILARITY[similarity]
797
- except KeyError as ke:
798
- msg = (
799
- f"Provided similarity '{similarity}' is not supported by Qdrant "
800
- f"document store. Please choose one of the options: "
801
- f"{', '.join(self.SIMILARITY.keys())}"
802
- )
803
- raise QdrantStoreError(msg) from ke
804
971
 
805
- def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
806
- """
807
- Create payload index for the collection if payload_fields_to_index is provided
808
- See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
809
- """
810
- if payload_fields_to_index is not None:
811
- for payload_index in payload_fields_to_index:
812
- self.client.create_payload_index(
813
- collection_name=collection_name,
814
- field_name=payload_index["field_name"],
815
- field_schema=payload_index["field_schema"],
816
- )
972
+ await self._initialize_async_client()
973
+ assert self._async_client is not None
817
974
 
818
- def _set_up_collection(
819
- self,
820
- collection_name: str,
821
- embedding_dim: int,
822
- recreate_collection: bool,
823
- similarity: str,
824
- use_sparse_embeddings: bool,
825
- sparse_idf: bool,
826
- on_disk: bool = False,
827
- payload_fields_to_index: Optional[List[dict]] = None,
828
- ):
829
- """
830
- Sets up the Qdrant collection with the specified parameters.
831
- :param collection_name:
832
- The name of the collection to set up.
833
- :param embedding_dim:
834
- The dimension of the embeddings.
975
+ if not self.use_sparse_embeddings:
976
+ message = (
977
+ "You are trying to query using sparse embeddings, but the Document Store "
978
+ "was initialized with `use_sparse_embeddings=False`. "
979
+ )
980
+ raise QdrantStoreError(message)
981
+
982
+ qdrant_filters = convert_filters_to_qdrant(filters)
983
+ query_indices = query_sparse_embedding.indices
984
+ query_values = query_sparse_embedding.values
985
+ if group_by:
986
+ response = await self._async_client.query_points_groups(
987
+ collection_name=self.index,
988
+ query=rest.SparseVector(
989
+ indices=query_indices,
990
+ values=query_values,
991
+ ),
992
+ using=SPARSE_VECTORS_NAME,
993
+ query_filter=qdrant_filters,
994
+ limit=top_k,
995
+ group_by=group_by,
996
+ group_size=group_size,
997
+ with_vectors=return_embedding,
998
+ score_threshold=score_threshold,
999
+ )
1000
+ groups = response.groups
1001
+ return self._process_group_results(groups)
1002
+ else:
1003
+ response = await self._async_client.query_points(
1004
+ collection_name=self.index,
1005
+ query=rest.SparseVector(
1006
+ indices=query_indices,
1007
+ values=query_values,
1008
+ ),
1009
+ using=SPARSE_VECTORS_NAME,
1010
+ query_filter=qdrant_filters,
1011
+ limit=top_k,
1012
+ with_vectors=return_embedding,
1013
+ score_threshold=score_threshold,
1014
+ )
1015
+ points = response.points
1016
+ return self._process_query_point_results(points, scale_score=scale_score)
1017
+
1018
+ async def _query_by_embedding_async(
1019
+ self,
1020
+ query_embedding: List[float],
1021
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
1022
+ top_k: int = 10,
1023
+ scale_score: bool = False,
1024
+ return_embedding: bool = False,
1025
+ score_threshold: Optional[float] = None,
1026
+ group_by: Optional[str] = None,
1027
+ group_size: Optional[int] = None,
1028
+ ) -> List[Document]:
1029
+ """
1030
+ Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
1031
+
1032
+ :param query_embedding: Dense embedding of the query.
1033
+ :param filters: Filters applied to the retrieved documents.
1034
+ :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
1035
+ groups to return.
1036
+ :param scale_score: Whether to scale the scores of the retrieved documents.
1037
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
1038
+ :param score_threshold: A minimal score threshold for the result.
1039
+ Score of the returned result might be higher or smaller than the threshold
1040
+ depending on the Distance function used.
1041
+ E.g. for cosine similarity only higher scores will be returned.
1042
+ :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
1043
+ value, all values will be used for grouping. One point can be in multiple groups.
1044
+ :param group_size: Maximum amount of points to return per group. Default is 3.
1045
+
1046
+ :returns: List of documents that are most similar to `query_embedding`.
1047
+ """
1048
+ await self._initialize_async_client()
1049
+ assert self._async_client is not None
1050
+
1051
+ qdrant_filters = convert_filters_to_qdrant(filters)
1052
+ if group_by:
1053
+ response = await self._async_client.query_points_groups(
1054
+ collection_name=self.index,
1055
+ query=query_embedding,
1056
+ using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
1057
+ query_filter=qdrant_filters,
1058
+ limit=top_k,
1059
+ group_by=group_by,
1060
+ group_size=group_size,
1061
+ with_vectors=return_embedding,
1062
+ score_threshold=score_threshold,
1063
+ )
1064
+ groups = response.groups
1065
+ return self._process_group_results(groups)
1066
+ else:
1067
+ response = await self._async_client.query_points(
1068
+ collection_name=self.index,
1069
+ query=query_embedding,
1070
+ using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
1071
+ query_filter=qdrant_filters,
1072
+ limit=top_k,
1073
+ with_vectors=return_embedding,
1074
+ score_threshold=score_threshold,
1075
+ )
1076
+ points = response.points
1077
+ return self._process_query_point_results(points, scale_score=scale_score)
1078
+
1079
+ async def _query_hybrid_async(
1080
+ self,
1081
+ query_embedding: List[float],
1082
+ query_sparse_embedding: SparseEmbedding,
1083
+ filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
1084
+ top_k: int = 10,
1085
+ return_embedding: bool = False,
1086
+ score_threshold: Optional[float] = None,
1087
+ group_by: Optional[str] = None,
1088
+ group_size: Optional[int] = None,
1089
+ ) -> List[Document]:
1090
+ """
1091
+ Asynchronously retrieves documents based on dense and sparse embeddings and fuses
1092
+ the results using Reciprocal Rank Fusion.
1093
+
1094
+ This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
1095
+ Use the `QdrantHybridRetriever` instead.
1096
+
1097
+ :param query_embedding: Dense embedding of the query.
1098
+ :param query_sparse_embedding: Sparse embedding of the query.
1099
+ :param filters: Filters applied to the retrieved documents.
1100
+ :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
1101
+ groups to return.
1102
+ :param return_embedding: Whether to return the embeddings of the retrieved documents.
1103
+ :param score_threshold: A minimal score threshold for the result.
1104
+ Score of the returned result might be higher or smaller than the threshold
1105
+ depending on the Distance function used.
1106
+ E.g. for cosine similarity only higher scores will be returned.
1107
+ :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
1108
+ value, all values will be used for grouping. One point can be in multiple groups.
1109
+ :param group_size: Maximum amount of points to return per group. Default is 3.
1110
+
1111
+ :returns: List of documents that are most similar to `query_embedding` and `query_sparse_embedding`.
1112
+
1113
+ :raises QdrantStoreError:
1114
+ If the Document Store was initialized with `use_sparse_embeddings=False`.
1115
+ """
1116
+
1117
+ await self._initialize_async_client()
1118
+ assert self._async_client is not None
1119
+
1120
+ if not self.use_sparse_embeddings:
1121
+ message = (
1122
+ "You are trying to query using sparse embeddings, but the Document Store "
1123
+ "was initialized with `use_sparse_embeddings=False`. "
1124
+ )
1125
+ raise QdrantStoreError(message)
1126
+
1127
+ qdrant_filters = convert_filters_to_qdrant(filters)
1128
+
1129
+ try:
1130
+ if group_by:
1131
+ response = await self._async_client.query_points_groups(
1132
+ collection_name=self.index,
1133
+ prefetch=[
1134
+ rest.Prefetch(
1135
+ query=rest.SparseVector(
1136
+ indices=query_sparse_embedding.indices,
1137
+ values=query_sparse_embedding.values,
1138
+ ),
1139
+ using=SPARSE_VECTORS_NAME,
1140
+ filter=qdrant_filters,
1141
+ ),
1142
+ rest.Prefetch(
1143
+ query=query_embedding,
1144
+ using=DENSE_VECTORS_NAME,
1145
+ filter=qdrant_filters,
1146
+ ),
1147
+ ],
1148
+ query=rest.FusionQuery(fusion=rest.Fusion.RRF),
1149
+ limit=top_k,
1150
+ group_by=group_by,
1151
+ group_size=group_size,
1152
+ score_threshold=score_threshold,
1153
+ with_payload=True,
1154
+ with_vectors=return_embedding,
1155
+ )
1156
+ groups = response.groups
1157
+ else:
1158
+ response = await self._async_client.query_points(
1159
+ collection_name=self.index,
1160
+ prefetch=[
1161
+ rest.Prefetch(
1162
+ query=rest.SparseVector(
1163
+ indices=query_sparse_embedding.indices,
1164
+ values=query_sparse_embedding.values,
1165
+ ),
1166
+ using=SPARSE_VECTORS_NAME,
1167
+ filter=qdrant_filters,
1168
+ ),
1169
+ rest.Prefetch(
1170
+ query=query_embedding,
1171
+ using=DENSE_VECTORS_NAME,
1172
+ filter=qdrant_filters,
1173
+ ),
1174
+ ],
1175
+ query=rest.FusionQuery(fusion=rest.Fusion.RRF),
1176
+ limit=top_k,
1177
+ score_threshold=score_threshold,
1178
+ with_payload=True,
1179
+ with_vectors=return_embedding,
1180
+ )
1181
+ points = response.points
1182
+
1183
+ except Exception as e:
1184
+ msg = "Error during hybrid search"
1185
+ raise QdrantStoreError(msg) from e
1186
+
1187
+ if group_by:
1188
+ return self._process_group_results(groups)
1189
+ else:
1190
+ return self._process_query_point_results(points)
1191
+
1192
+ def get_distance(self, similarity: str) -> rest.Distance:
1193
+ """
1194
+ Retrieves the distance metric for the specified similarity measure.
1195
+
1196
+ :param similarity:
1197
+ The similarity measure to retrieve the distance.
1198
+ :returns:
1199
+ The corresponding rest.Distance object.
1200
+ :raises QdrantStoreError:
1201
+ If the provided similarity measure is not supported.
1202
+ """
1203
+ try:
1204
+ return self.SIMILARITY[similarity]
1205
+ except KeyError as ke:
1206
+ msg = (
1207
+ f"Provided similarity '{similarity}' is not supported by Qdrant "
1208
+ f"document store. Please choose one of the options: "
1209
+ f"{', '.join(self.SIMILARITY.keys())}"
1210
+ )
1211
+ raise QdrantStoreError(msg) from ke
1212
+
1213
+ def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
1214
+ """
1215
+ Create payload index for the collection if payload_fields_to_index is provided
1216
+ See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
1217
+ """
1218
+ if payload_fields_to_index is not None:
1219
+ for payload_index in payload_fields_to_index:
1220
+ # self._client is initialized at this point
1221
+ # since _initialize_client() is called before this method is executed
1222
+
1223
+ assert self._client is not None
1224
+ self._client.create_payload_index(
1225
+ collection_name=collection_name,
1226
+ field_name=payload_index["field_name"],
1227
+ field_schema=payload_index["field_schema"],
1228
+ )
1229
+
1230
+ async def _create_payload_index_async(
1231
+ self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None
1232
+ ):
1233
+ """
1234
+ Asynchronously create payload index for the collection if payload_fields_to_index is provided
1235
+ See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
1236
+ """
1237
+ if payload_fields_to_index is not None:
1238
+ for payload_index in payload_fields_to_index:
1239
+
1240
+ # self._async_client is initialized at this point
1241
+ # since _initialize_async_client() is called before this method is executed
1242
+ assert self._async_client is not None
1243
+
1244
+ await self._async_client.create_payload_index(
1245
+ collection_name=collection_name,
1246
+ field_name=payload_index["field_name"],
1247
+ field_schema=payload_index["field_schema"],
1248
+ )
1249
+
1250
+ def _set_up_collection(
1251
+ self,
1252
+ collection_name: str,
1253
+ embedding_dim: int,
1254
+ recreate_collection: bool,
1255
+ similarity: str,
1256
+ use_sparse_embeddings: bool,
1257
+ sparse_idf: bool,
1258
+ on_disk: bool = False,
1259
+ payload_fields_to_index: Optional[List[dict]] = None,
1260
+ ):
1261
+ """
1262
+ Sets up the Qdrant collection with the specified parameters.
1263
+ :param collection_name:
1264
+ The name of the collection to set up.
1265
+ :param embedding_dim:
1266
+ The dimension of the embeddings.
835
1267
  :param recreate_collection:
836
1268
  Whether to recreate the collection if it already exists.
837
1269
  :param similarity:
@@ -851,9 +1283,13 @@ class QdrantDocumentStore:
851
1283
  If the collection exists with a different similarity measure or embedding dimension.
852
1284
 
853
1285
  """
1286
+
1287
+ self._initialize_client()
1288
+ assert self._client is not None
1289
+
854
1290
  distance = self.get_distance(similarity)
855
1291
 
856
- if recreate_collection or not self.client.collection_exists(collection_name):
1292
+ if recreate_collection or not self._client.collection_exists(collection_name):
857
1293
  # There is no need to verify the current configuration of that
858
1294
  # collection. It might be just recreated again or does not exist yet.
859
1295
  self.recreate_collection(
@@ -863,64 +1299,65 @@ class QdrantDocumentStore:
863
1299
  self._create_payload_index(collection_name, payload_fields_to_index)
864
1300
  return
865
1301
 
866
- collection_info = self.client.get_collection(collection_name)
1302
+ collection_info = self._client.get_collection(collection_name)
867
1303
 
868
- has_named_vectors = isinstance(collection_info.config.params.vectors, dict)
1304
+ self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
869
1305
 
870
- if has_named_vectors and DENSE_VECTORS_NAME not in collection_info.config.params.vectors:
871
- msg = (
872
- f"Collection '{collection_name}' already exists in Qdrant, "
873
- f"but it has been originally created outside of Haystack and is not supported. "
874
- f"If possible, you should create a new Document Store with Haystack. "
875
- f"In case you want to migrate the existing collection, see an example script in "
876
- f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
877
- f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
878
- )
879
- raise QdrantStoreError(msg)
1306
+ async def _set_up_collection_async(
1307
+ self,
1308
+ collection_name: str,
1309
+ embedding_dim: int,
1310
+ recreate_collection: bool,
1311
+ similarity: str,
1312
+ use_sparse_embeddings: bool,
1313
+ sparse_idf: bool,
1314
+ on_disk: bool = False,
1315
+ payload_fields_to_index: Optional[List[dict]] = None,
1316
+ ):
1317
+ """
1318
+ Asynchronously sets up the Qdrant collection with the specified parameters.
1319
+ :param collection_name:
1320
+ The name of the collection to set up.
1321
+ :param embedding_dim:
1322
+ The dimension of the embeddings.
1323
+ :param recreate_collection:
1324
+ Whether to recreate the collection if it already exists.
1325
+ :param similarity:
1326
+ The similarity measure to use.
1327
+ :param use_sparse_embeddings:
1328
+ Whether to use sparse embeddings.
1329
+ :param sparse_idf:
1330
+ Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
1331
+ :param on_disk:
1332
+ Whether to store the collection on disk.
1333
+ :param payload_fields_to_index:
1334
+ List of payload fields to index.
880
1335
 
881
- if self.use_sparse_embeddings and not has_named_vectors:
882
- msg = (
883
- f"Collection '{collection_name}' already exists in Qdrant, "
884
- f"but it has been originally created without sparse embedding vectors. "
885
- f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
886
- f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
887
- f"See `migrate_to_sparse_embeddings_support` function in "
888
- f"`haystack_integrations.document_stores.qdrant`."
889
- )
890
- raise QdrantStoreError(msg)
1336
+ :raises QdrantStoreError:
1337
+ If the collection exists with incompatible settings.
1338
+ :raises ValueError:
1339
+ If the collection exists with a different similarity measure or embedding dimension.
891
1340
 
892
- if not self.use_sparse_embeddings and has_named_vectors:
893
- msg = (
894
- f"Collection '{collection_name}' already exists in Qdrant, "
895
- f"but it has been originally created with sparse embedding vectors."
896
- f"If you want to use that collection, please set `use_sparse_embeddings=True`."
897
- )
898
- raise QdrantStoreError(msg)
1341
+ """
899
1342
 
900
- if self.use_sparse_embeddings:
901
- current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance
902
- current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
903
- else:
904
- current_distance = collection_info.config.params.vectors.distance
905
- current_vector_size = collection_info.config.params.vectors.size
1343
+ await self._initialize_async_client()
1344
+ assert self._async_client is not None
906
1345
 
907
- if current_distance != distance:
908
- msg = (
909
- f"Collection '{collection_name}' already exists in Qdrant, "
910
- f"but it is configured with a similarity '{current_distance.name}'. "
911
- f"If you want to use that collection, but with a different "
912
- f"similarity, please set `recreate_collection=True` argument."
913
- )
914
- raise ValueError(msg)
1346
+ distance = self.get_distance(similarity)
915
1347
 
916
- if current_vector_size != embedding_dim:
917
- msg = (
918
- f"Collection '{collection_name}' already exists in Qdrant, "
919
- f"but it is configured with a vector size '{current_vector_size}'. "
920
- f"If you want to use that collection, but with a different "
921
- f"vector size, please set `recreate_collection=True` argument."
1348
+ if recreate_collection or not await self._async_client.collection_exists(collection_name):
1349
+ # There is no need to verify the current configuration of that
1350
+ # collection. It might be just recreated again or does not exist yet.
1351
+ await self.recreate_collection_async(
1352
+ collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
922
1353
  )
923
- raise ValueError(msg)
1354
+ # Create Payload index if payload_fields_to_index is provided
1355
+ await self._create_payload_index_async(collection_name, payload_fields_to_index)
1356
+ return
1357
+
1358
+ collection_info = await self._async_client.get_collection(collection_name)
1359
+
1360
+ self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
924
1361
 
925
1362
  def recreate_collection(
926
1363
  self,
@@ -947,44 +1384,65 @@ class QdrantDocumentStore:
947
1384
  :param sparse_idf:
948
1385
  Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
949
1386
  """
950
- if on_disk is None:
951
- on_disk = self.on_disk
1387
+ vectors_config, sparse_vectors_config = self._prepare_collection_config(
1388
+ embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
1389
+ )
1390
+ collection_params = self._prepare_collection_params()
952
1391
 
953
- if use_sparse_embeddings is None:
954
- use_sparse_embeddings = self.use_sparse_embeddings
1392
+ self._initialize_client()
1393
+ assert self._client is not None
955
1394
 
956
- # dense vectors configuration
957
- vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
1395
+ if self._client.collection_exists(collection_name):
1396
+ self._client.delete_collection(collection_name)
958
1397
 
959
- if use_sparse_embeddings:
960
- # in this case, we need to define named vectors
961
- vectors_config = {DENSE_VECTORS_NAME: vectors_config}
1398
+ self._client.create_collection(
1399
+ collection_name=collection_name,
1400
+ vectors_config=vectors_config,
1401
+ sparse_vectors_config=sparse_vectors_config,
1402
+ **collection_params,
1403
+ )
962
1404
 
963
- sparse_vectors_config = {
964
- SPARSE_VECTORS_NAME: rest.SparseVectorParams(
965
- index=rest.SparseIndexParams(
966
- on_disk=on_disk,
967
- ),
968
- modifier=rest.Modifier.IDF if sparse_idf else None,
969
- ),
970
- }
1405
+ async def recreate_collection_async(
1406
+ self,
1407
+ collection_name: str,
1408
+ distance,
1409
+ embedding_dim: int,
1410
+ on_disk: Optional[bool] = None,
1411
+ use_sparse_embeddings: Optional[bool] = None,
1412
+ sparse_idf: bool = False,
1413
+ ):
1414
+ """
1415
+ Asynchronously recreates the Qdrant collection with the specified parameters.
1416
+
1417
+ :param collection_name:
1418
+ The name of the collection to recreate.
1419
+ :param distance:
1420
+ The distance metric to use for the collection.
1421
+ :param embedding_dim:
1422
+ The dimension of the embeddings.
1423
+ :param on_disk:
1424
+ Whether to store the collection on disk.
1425
+ :param use_sparse_embeddings:
1426
+ Whether to use sparse embeddings.
1427
+ :param sparse_idf:
1428
+ Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
1429
+ """
1430
+ vectors_config, sparse_vectors_config = self._prepare_collection_config(
1431
+ embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
1432
+ )
1433
+ collection_params = self._prepare_collection_params()
1434
+
1435
+ await self._initialize_async_client()
1436
+ assert self._async_client is not None
971
1437
 
972
- if self.client.collection_exists(collection_name):
973
- self.client.delete_collection(collection_name)
1438
+ if await self._async_client.collection_exists(collection_name):
1439
+ await self._async_client.delete_collection(collection_name)
974
1440
 
975
- self.client.create_collection(
1441
+ await self._async_client.create_collection(
976
1442
  collection_name=collection_name,
977
1443
  vectors_config=vectors_config,
978
- sparse_vectors_config=sparse_vectors_config if use_sparse_embeddings else None,
979
- shard_number=self.shard_number,
980
- replication_factor=self.replication_factor,
981
- write_consistency_factor=self.write_consistency_factor,
982
- on_disk_payload=self.on_disk_payload,
983
- hnsw_config=self.hnsw_config,
984
- optimizers_config=self.optimizers_config,
985
- wal_config=self.wal_config,
986
- quantization_config=self.quantization_config,
987
- init_from=self.init_from,
1444
+ sparse_vectors_config=sparse_vectors_config,
1445
+ **collection_params,
988
1446
  )
989
1447
 
990
1448
  def _handle_duplicate_documents(
@@ -1014,12 +1472,38 @@ class QdrantDocumentStore:
1014
1472
 
1015
1473
  return documents
1016
1474
 
1017
- def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document]:
1475
+ async def _handle_duplicate_documents_async(
1476
+ self,
1477
+ documents: List[Document],
1478
+ policy: DuplicatePolicy = None,
1479
+ ):
1018
1480
  """
1019
- Drop duplicate documents based on same hash ID.
1481
+ Asynchronously checks whether any of the passed documents is already existing
1482
+ in the chosen index and returns a list of
1483
+ documents that are not in the index yet.
1020
1484
 
1021
1485
  :param documents: A list of Haystack Document objects.
1486
+ :param policy: The duplicate policy to use when writing documents.
1022
1487
  :returns: A list of Haystack Document objects.
1488
+ """
1489
+
1490
+ if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
1491
+ documents = self._drop_duplicate_documents(documents)
1492
+ documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
1493
+ ids_exist_in_db: List[str] = [doc.id for doc in documents_found]
1494
+
1495
+ if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
1496
+ msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
1497
+ raise DuplicateDocumentError(msg)
1498
+
1499
+ documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
1500
+
1501
+ return documents
1502
+
1503
+ def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document]:
1504
+ """
1505
+ Drop duplicate documents based on same hash ID.
1506
+
1023
1507
  """
1024
1508
  _hash_ids: Set = set()
1025
1509
  _documents: List[Document] = []
@@ -1027,12 +1511,202 @@ class QdrantDocumentStore:
1027
1511
  for document in documents:
1028
1512
  if document.id in _hash_ids:
1029
1513
  logger.info(
1030
- "Duplicate Documents: Document with id '%s' already exists in index '%s'",
1031
- document.id,
1032
- self.index,
1514
+ "Duplicate Documents: Document with id '{document_id}' already exists in index '{index}'",
1515
+ document_id=document.id,
1516
+ index=self.index,
1033
1517
  )
1034
1518
  continue
1035
1519
  _documents.append(document)
1036
1520
  _hash_ids.add(document.id)
1037
1521
 
1038
1522
  return _documents
1523
+
1524
+ def _prepare_collection_params(self):
1525
+ """
1526
+ Prepares the common parameters for collection creation.
1527
+ """
1528
+ return {
1529
+ "shard_number": self.shard_number,
1530
+ "replication_factor": self.replication_factor,
1531
+ "write_consistency_factor": self.write_consistency_factor,
1532
+ "on_disk_payload": self.on_disk_payload,
1533
+ "hnsw_config": self.hnsw_config,
1534
+ "optimizers_config": self.optimizers_config,
1535
+ "wal_config": self.wal_config,
1536
+ "quantization_config": self.quantization_config,
1537
+ "init_from": self.init_from,
1538
+ }
1539
+
1540
+ def _prepare_client_params(self):
1541
+ """
1542
+ Prepares the common parameters for client initialization.
1543
+
1544
+ """
1545
+ return {
1546
+ "location": self.location,
1547
+ "url": self.url,
1548
+ "port": self.port,
1549
+ "grpc_port": self.grpc_port,
1550
+ "prefer_grpc": self.prefer_grpc,
1551
+ "https": self.https,
1552
+ "api_key": self.api_key.resolve_value() if self.api_key else None,
1553
+ "prefix": self.prefix,
1554
+ "timeout": self.timeout,
1555
+ "host": self.host,
1556
+ "path": self.path,
1557
+ "metadata": self.metadata,
1558
+ "force_disable_check_same_thread": self.force_disable_check_same_thread,
1559
+ }
1560
+
1561
+ def _prepare_collection_config(
1562
+ self,
1563
+ embedding_dim: int,
1564
+ distance,
1565
+ on_disk: Optional[bool] = None,
1566
+ use_sparse_embeddings: Optional[bool] = None,
1567
+ sparse_idf: bool = False,
1568
+ ):
1569
+ """
1570
+ Prepares the configuration for creating or recreating a Qdrant collection.
1571
+
1572
+ """
1573
+ if on_disk is None:
1574
+ on_disk = self.on_disk
1575
+
1576
+ if use_sparse_embeddings is None:
1577
+ use_sparse_embeddings = self.use_sparse_embeddings
1578
+
1579
+ # dense vectors configuration
1580
+ vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
1581
+ sparse_vectors_config = None
1582
+
1583
+ if use_sparse_embeddings:
1584
+ # in this case, we need to define named vectors
1585
+ vectors_config = {DENSE_VECTORS_NAME: vectors_config}
1586
+
1587
+ sparse_vectors_config = {
1588
+ SPARSE_VECTORS_NAME: rest.SparseVectorParams(
1589
+ index=rest.SparseIndexParams(
1590
+ on_disk=on_disk,
1591
+ ),
1592
+ modifier=rest.Modifier.IDF if sparse_idf else None,
1593
+ ),
1594
+ }
1595
+
1596
+ return vectors_config, sparse_vectors_config
1597
+
1598
+ def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None):
1599
+ """
1600
+ Validates the filters provided for querying.
1601
+ """
1602
+ if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
1603
+ msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
1604
+ raise ValueError(msg)
1605
+
1606
+ if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
1607
+ msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
1608
+ raise ValueError(msg)
1609
+
1610
+ def _process_query_point_results(self, results, scale_score: bool = False):
1611
+ """
1612
+ Processes query results from Qdrant.
1613
+ """
1614
+ documents = [
1615
+ convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
1616
+ for point in results
1617
+ ]
1618
+
1619
+ if scale_score:
1620
+ for document in documents:
1621
+ score = document.score
1622
+ if self.similarity == "cosine":
1623
+ score = (score + 1) / 2
1624
+ else:
1625
+ score = float(1 / (1 + np.exp(-score / 100)))
1626
+ document.score = score
1627
+
1628
+ return documents
1629
+
1630
+ def _process_group_results(self, groups):
1631
+ """
1632
+ Processes grouped query results from Qdrant.
1633
+
1634
+ """
1635
+ if not groups:
1636
+ return []
1637
+
1638
+ return [
1639
+ convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
1640
+ for group in groups
1641
+ for point in group.hits
1642
+ ]
1643
+
1644
+ def _validate_collection_compatibility(
1645
+ self,
1646
+ collection_name: str,
1647
+ collection_info,
1648
+ distance,
1649
+ embedding_dim: int,
1650
+ ):
1651
+ """
1652
+ Validates that an existing collection is compatible with the current configuration.
1653
+ """
1654
+ has_named_vectors = isinstance(collection_info.config.params.vectors, dict)
1655
+
1656
+ if has_named_vectors and DENSE_VECTORS_NAME not in collection_info.config.params.vectors:
1657
+ msg = (
1658
+ f"Collection '{collection_name}' already exists in Qdrant, "
1659
+ f"but it has been originally created outside of Haystack and is not supported. "
1660
+ f"If possible, you should create a new Document Store with Haystack. "
1661
+ f"In case you want to migrate the existing collection, see an example script in "
1662
+ f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
1663
+ f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
1664
+ )
1665
+ raise QdrantStoreError(msg)
1666
+
1667
+ if self.use_sparse_embeddings and not has_named_vectors:
1668
+ msg = (
1669
+ f"Collection '{collection_name}' already exists in Qdrant, "
1670
+ f"but it has been originally created without sparse embedding vectors. "
1671
+ f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
1672
+ f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
1673
+ f"See `migrate_to_sparse_embeddings_support` function in "
1674
+ f"`haystack_integrations.document_stores.qdrant`."
1675
+ )
1676
+ raise QdrantStoreError(msg)
1677
+
1678
+ if not self.use_sparse_embeddings and has_named_vectors:
1679
+ msg = (
1680
+ f"Collection '{collection_name}' already exists in Qdrant, "
1681
+ f"but it has been originally created with sparse embedding vectors."
1682
+ f"If you want to use that collection, please set `use_sparse_embeddings=True`."
1683
+ )
1684
+ raise QdrantStoreError(msg)
1685
+
1686
+ # Get current distance and vector size based on collection configuration
1687
+ if self.use_sparse_embeddings:
1688
+ current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance
1689
+ current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
1690
+ else:
1691
+ current_distance = collection_info.config.params.vectors.distance
1692
+ current_vector_size = collection_info.config.params.vectors.size
1693
+
1694
+ # Validate distance metric
1695
+ if current_distance != distance:
1696
+ msg = (
1697
+ f"Collection '{collection_name}' already exists in Qdrant, "
1698
+ f"but it is configured with a similarity '{current_distance.name}'. "
1699
+ f"If you want to use that collection, but with a different "
1700
+ f"similarity, please set `recreate_collection=True` argument."
1701
+ )
1702
+ raise ValueError(msg)
1703
+
1704
+ # Validate embedding dimension
1705
+ if current_vector_size != embedding_dim:
1706
+ msg = (
1707
+ f"Collection '{collection_name}' already exists in Qdrant, "
1708
+ f"but it is configured with a vector size '{current_vector_size}'. "
1709
+ f"If you want to use that collection, but with a different "
1710
+ f"vector size, please set `recreate_collection=True` argument."
1711
+ )
1712
+ raise ValueError(msg)