qdrant-haystack 8.1.0__py3-none-any.whl → 9.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- haystack_integrations/components/retrievers/qdrant/retriever.py +142 -0
- haystack_integrations/document_stores/qdrant/converters.py +1 -17
- haystack_integrations/document_stores/qdrant/document_store.py +899 -226
- haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +4 -3
- {qdrant_haystack-8.1.0.dist-info → qdrant_haystack-9.1.0.dist-info}/METADATA +2 -2
- {qdrant_haystack-8.1.0.dist-info → qdrant_haystack-9.1.0.dist-info}/RECORD +8 -8
- {qdrant_haystack-8.1.0.dist-info → qdrant_haystack-9.1.0.dist-info}/WHEEL +0 -0
- {qdrant_haystack-8.1.0.dist-info → qdrant_haystack-9.1.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import inspect
|
|
2
|
-
import logging
|
|
3
2
|
from itertools import islice
|
|
4
|
-
from typing import Any, ClassVar, Dict, Generator, List, Optional, Set, Union
|
|
3
|
+
from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Union
|
|
5
4
|
|
|
6
5
|
import numpy as np
|
|
7
6
|
import qdrant_client
|
|
8
|
-
from haystack import default_from_dict, default_to_dict
|
|
7
|
+
from haystack import default_from_dict, default_to_dict, logging
|
|
9
8
|
from haystack.dataclasses import Document
|
|
10
9
|
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
11
10
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
@@ -217,6 +216,7 @@ class QdrantDocumentStore:
|
|
|
217
216
|
"""
|
|
218
217
|
|
|
219
218
|
self._client = None
|
|
219
|
+
self._async_client = None
|
|
220
220
|
|
|
221
221
|
# Store the Qdrant client specific attributes
|
|
222
222
|
self.location = location
|
|
@@ -258,24 +258,10 @@ class QdrantDocumentStore:
|
|
|
258
258
|
self.write_batch_size = write_batch_size
|
|
259
259
|
self.scroll_size = scroll_size
|
|
260
260
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
self._client = qdrant_client.QdrantClient(
|
|
265
|
-
location=self.location,
|
|
266
|
-
url=self.url,
|
|
267
|
-
port=self.port,
|
|
268
|
-
grpc_port=self.grpc_port,
|
|
269
|
-
prefer_grpc=self.prefer_grpc,
|
|
270
|
-
https=self.https,
|
|
271
|
-
api_key=self.api_key.resolve_value() if self.api_key else None,
|
|
272
|
-
prefix=self.prefix,
|
|
273
|
-
timeout=self.timeout,
|
|
274
|
-
host=self.host,
|
|
275
|
-
path=self.path,
|
|
276
|
-
metadata=self.metadata,
|
|
277
|
-
force_disable_check_same_thread=self.force_disable_check_same_thread,
|
|
278
|
-
)
|
|
261
|
+
def _initialize_client(self):
|
|
262
|
+
if self._client is None:
|
|
263
|
+
client_params = self._prepare_client_params()
|
|
264
|
+
self._client = qdrant_client.QdrantClient(**client_params)
|
|
279
265
|
# Make sure the collection is properly set up
|
|
280
266
|
self._set_up_collection(
|
|
281
267
|
self.index,
|
|
@@ -287,14 +273,52 @@ class QdrantDocumentStore:
|
|
|
287
273
|
self.on_disk,
|
|
288
274
|
self.payload_fields_to_index,
|
|
289
275
|
)
|
|
290
|
-
|
|
276
|
+
|
|
277
|
+
async def _initialize_async_client(self):
|
|
278
|
+
"""
|
|
279
|
+
Returns the asynchronous Qdrant client, initializing it if necessary.
|
|
280
|
+
"""
|
|
281
|
+
if self._async_client is None:
|
|
282
|
+
client_params = self._prepare_client_params()
|
|
283
|
+
self._async_client = qdrant_client.AsyncQdrantClient(
|
|
284
|
+
**client_params,
|
|
285
|
+
)
|
|
286
|
+
await self._set_up_collection_async(
|
|
287
|
+
self.index,
|
|
288
|
+
self.embedding_dim,
|
|
289
|
+
self.recreate_index,
|
|
290
|
+
self.similarity,
|
|
291
|
+
self.use_sparse_embeddings,
|
|
292
|
+
self.sparse_idf,
|
|
293
|
+
self.on_disk,
|
|
294
|
+
self.payload_fields_to_index,
|
|
295
|
+
)
|
|
291
296
|
|
|
292
297
|
def count_documents(self) -> int:
|
|
293
298
|
"""
|
|
294
299
|
Returns the number of documents present in the Document Store.
|
|
295
300
|
"""
|
|
301
|
+
self._initialize_client()
|
|
302
|
+
assert self._client is not None
|
|
303
|
+
try:
|
|
304
|
+
response = self._client.count(
|
|
305
|
+
collection_name=self.index,
|
|
306
|
+
)
|
|
307
|
+
return response.count
|
|
308
|
+
except (UnexpectedResponse, ValueError):
|
|
309
|
+
# Qdrant local raises ValueError if the collection is not found, but
|
|
310
|
+
# with the remote server UnexpectedResponse is raised. Until that's unified,
|
|
311
|
+
# we need to catch both.
|
|
312
|
+
return 0
|
|
313
|
+
|
|
314
|
+
async def count_documents_async(self) -> int:
|
|
315
|
+
"""
|
|
316
|
+
Asynchronously returns the number of documents present in the document store.
|
|
317
|
+
"""
|
|
318
|
+
await self._initialize_async_client()
|
|
319
|
+
assert self._async_client is not None
|
|
296
320
|
try:
|
|
297
|
-
response = self.
|
|
321
|
+
response = await self._async_client.count(
|
|
298
322
|
collection_name=self.index,
|
|
299
323
|
)
|
|
300
324
|
return response.count
|
|
@@ -317,19 +341,29 @@ class QdrantDocumentStore:
|
|
|
317
341
|
:param filters: The filters to apply to the document list.
|
|
318
342
|
:returns: A list of documents that match the given filters.
|
|
319
343
|
"""
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
raise ValueError(msg)
|
|
344
|
+
# No need to initialize client here as _get_documents_generator
|
|
345
|
+
# will handle client initialization internally
|
|
323
346
|
|
|
324
|
-
|
|
325
|
-
msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
|
326
|
-
raise ValueError(msg)
|
|
347
|
+
self._validate_filters(filters)
|
|
327
348
|
return list(
|
|
328
|
-
self.
|
|
349
|
+
self._get_documents_generator(
|
|
329
350
|
filters,
|
|
330
351
|
)
|
|
331
352
|
)
|
|
332
353
|
|
|
354
|
+
async def filter_documents_async(
|
|
355
|
+
self,
|
|
356
|
+
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
357
|
+
) -> List[Document]:
|
|
358
|
+
"""
|
|
359
|
+
Asynchronously returns the documents that match the provided filters.
|
|
360
|
+
"""
|
|
361
|
+
# No need to initialize client here as _get_documents_generator_async
|
|
362
|
+
# will handle client initialization internally
|
|
363
|
+
|
|
364
|
+
self._validate_filters(filters)
|
|
365
|
+
return [doc async for doc in self._get_documents_generator_async(filters)]
|
|
366
|
+
|
|
333
367
|
def write_documents(
|
|
334
368
|
self,
|
|
335
369
|
documents: List[Document],
|
|
@@ -348,13 +382,14 @@ class QdrantDocumentStore:
|
|
|
348
382
|
|
|
349
383
|
:returns: The number of documents written to the document store.
|
|
350
384
|
"""
|
|
385
|
+
|
|
386
|
+
self._initialize_client()
|
|
387
|
+
assert self._client is not None
|
|
388
|
+
|
|
351
389
|
for doc in documents:
|
|
352
390
|
if not isinstance(doc, Document):
|
|
353
391
|
msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
|
|
354
392
|
raise ValueError(msg)
|
|
355
|
-
self._set_up_collection(
|
|
356
|
-
self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
|
|
357
|
-
)
|
|
358
393
|
|
|
359
394
|
if len(documents) == 0:
|
|
360
395
|
logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
|
|
@@ -373,7 +408,61 @@ class QdrantDocumentStore:
|
|
|
373
408
|
use_sparse_embeddings=self.use_sparse_embeddings,
|
|
374
409
|
)
|
|
375
410
|
|
|
376
|
-
self.
|
|
411
|
+
self._client.upsert(
|
|
412
|
+
collection_name=self.index,
|
|
413
|
+
points=batch,
|
|
414
|
+
wait=self.wait_result_from_api,
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
progress_bar.update(self.write_batch_size)
|
|
418
|
+
return len(document_objects)
|
|
419
|
+
|
|
420
|
+
async def write_documents_async(
|
|
421
|
+
self,
|
|
422
|
+
documents: List[Document],
|
|
423
|
+
policy: DuplicatePolicy = DuplicatePolicy.FAIL,
|
|
424
|
+
) -> int:
|
|
425
|
+
"""
|
|
426
|
+
Asynchronously writes documents to Qdrant using the specified policy.
|
|
427
|
+
The QdrantDocumentStore can handle duplicate documents based on the given policy.
|
|
428
|
+
The available policies are:
|
|
429
|
+
- `FAIL`: The operation will raise an error if any document already exists.
|
|
430
|
+
- `OVERWRITE`: Existing documents will be overwritten with the new ones.
|
|
431
|
+
- `SKIP`: Existing documents will be skipped, and only new documents will be added.
|
|
432
|
+
|
|
433
|
+
:param documents: A list of Document objects to write to Qdrant.
|
|
434
|
+
:param policy: The policy for handling duplicate documents.
|
|
435
|
+
|
|
436
|
+
:returns: The number of documents written to the document store.
|
|
437
|
+
"""
|
|
438
|
+
|
|
439
|
+
await self._initialize_async_client()
|
|
440
|
+
assert self._async_client is not None
|
|
441
|
+
|
|
442
|
+
for doc in documents:
|
|
443
|
+
if not isinstance(doc, Document):
|
|
444
|
+
msg = f"""DocumentStore.write_documents_async() expects a list of
|
|
445
|
+
Documents but got an element of {type(doc)}."""
|
|
446
|
+
raise ValueError(msg)
|
|
447
|
+
|
|
448
|
+
if len(documents) == 0:
|
|
449
|
+
logger.warning("Calling QdrantDocumentStore.write_documents_async() with empty list")
|
|
450
|
+
return 0
|
|
451
|
+
|
|
452
|
+
document_objects = await self._handle_duplicate_documents_async(
|
|
453
|
+
documents=documents,
|
|
454
|
+
policy=policy,
|
|
455
|
+
)
|
|
456
|
+
|
|
457
|
+
batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
|
|
458
|
+
with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
|
|
459
|
+
for document_batch in batched_documents:
|
|
460
|
+
batch = convert_haystack_documents_to_qdrant_points(
|
|
461
|
+
document_batch,
|
|
462
|
+
use_sparse_embeddings=self.use_sparse_embeddings,
|
|
463
|
+
)
|
|
464
|
+
|
|
465
|
+
await self._async_client.upsert(
|
|
377
466
|
collection_name=self.index,
|
|
378
467
|
points=batch,
|
|
379
468
|
wait=self.wait_result_from_api,
|
|
@@ -388,9 +477,13 @@ class QdrantDocumentStore:
|
|
|
388
477
|
|
|
389
478
|
:param document_ids: the document ids to delete
|
|
390
479
|
"""
|
|
480
|
+
|
|
481
|
+
self._initialize_client()
|
|
482
|
+
assert self._client is not None
|
|
483
|
+
|
|
391
484
|
ids = [convert_id(_id) for _id in document_ids]
|
|
392
485
|
try:
|
|
393
|
-
self.
|
|
486
|
+
self._client.delete(
|
|
394
487
|
collection_name=self.index,
|
|
395
488
|
points_selector=ids,
|
|
396
489
|
wait=self.wait_result_from_api,
|
|
@@ -400,6 +493,28 @@ class QdrantDocumentStore:
|
|
|
400
493
|
"Called QdrantDocumentStore.delete_documents() on a non-existing ID",
|
|
401
494
|
)
|
|
402
495
|
|
|
496
|
+
async def delete_documents_async(self, document_ids: List[str]) -> None:
|
|
497
|
+
"""
|
|
498
|
+
Asynchronously deletes documents that match the provided `document_ids` from the document store.
|
|
499
|
+
|
|
500
|
+
:param document_ids: the document ids to delete
|
|
501
|
+
"""
|
|
502
|
+
|
|
503
|
+
await self._initialize_async_client()
|
|
504
|
+
assert self._async_client is not None
|
|
505
|
+
|
|
506
|
+
ids = [convert_id(_id) for _id in document_ids]
|
|
507
|
+
try:
|
|
508
|
+
await self._async_client.delete(
|
|
509
|
+
collection_name=self.index,
|
|
510
|
+
points_selector=ids,
|
|
511
|
+
wait=self.wait_result_from_api,
|
|
512
|
+
)
|
|
513
|
+
except KeyError:
|
|
514
|
+
logger.warning(
|
|
515
|
+
"Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
|
|
516
|
+
)
|
|
517
|
+
|
|
403
518
|
@classmethod
|
|
404
519
|
def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore":
|
|
405
520
|
"""
|
|
@@ -430,7 +545,7 @@ class QdrantDocumentStore:
|
|
|
430
545
|
**init_params,
|
|
431
546
|
)
|
|
432
547
|
|
|
433
|
-
def
|
|
548
|
+
def _get_documents_generator(
|
|
434
549
|
self,
|
|
435
550
|
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
436
551
|
) -> Generator[Document, None, None]:
|
|
@@ -441,13 +556,53 @@ class QdrantDocumentStore:
|
|
|
441
556
|
:returns: A generator that yields documents retrieved from Qdrant.
|
|
442
557
|
"""
|
|
443
558
|
|
|
559
|
+
self._initialize_client()
|
|
560
|
+
assert self._client is not None
|
|
561
|
+
|
|
562
|
+
index = self.index
|
|
563
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
564
|
+
|
|
565
|
+
next_offset = None
|
|
566
|
+
stop_scrolling = False
|
|
567
|
+
while not stop_scrolling:
|
|
568
|
+
records, next_offset = self._client.scroll(
|
|
569
|
+
collection_name=index,
|
|
570
|
+
scroll_filter=qdrant_filters,
|
|
571
|
+
limit=self.scroll_size,
|
|
572
|
+
offset=next_offset,
|
|
573
|
+
with_payload=True,
|
|
574
|
+
with_vectors=True,
|
|
575
|
+
)
|
|
576
|
+
stop_scrolling = next_offset is None or (
|
|
577
|
+
isinstance(next_offset, grpc.PointId) and next_offset.num == 0 and next_offset.uuid == ""
|
|
578
|
+
)
|
|
579
|
+
|
|
580
|
+
for record in records:
|
|
581
|
+
yield convert_qdrant_point_to_haystack_document(
|
|
582
|
+
record, use_sparse_embeddings=self.use_sparse_embeddings
|
|
583
|
+
)
|
|
584
|
+
|
|
585
|
+
async def _get_documents_generator_async(
|
|
586
|
+
self,
|
|
587
|
+
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
588
|
+
) -> AsyncGenerator[Document, None]:
|
|
589
|
+
"""
|
|
590
|
+
Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
|
|
591
|
+
|
|
592
|
+
:param filters: Filters applied to the retrieved documents.
|
|
593
|
+
:returns: An asynchronous generator that yields documents retrieved from Qdrant.
|
|
594
|
+
"""
|
|
595
|
+
|
|
596
|
+
await self._initialize_async_client()
|
|
597
|
+
assert self._async_client is not None
|
|
598
|
+
|
|
444
599
|
index = self.index
|
|
445
600
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
446
601
|
|
|
447
602
|
next_offset = None
|
|
448
603
|
stop_scrolling = False
|
|
449
604
|
while not stop_scrolling:
|
|
450
|
-
records, next_offset = self.
|
|
605
|
+
records, next_offset = await self._async_client.scroll(
|
|
451
606
|
collection_name=index,
|
|
452
607
|
scroll_filter=qdrant_filters,
|
|
453
608
|
limit=self.scroll_size,
|
|
@@ -480,8 +635,44 @@ class QdrantDocumentStore:
|
|
|
480
635
|
"""
|
|
481
636
|
documents: List[Document] = []
|
|
482
637
|
|
|
638
|
+
self._initialize_client()
|
|
639
|
+
assert self._client is not None
|
|
640
|
+
|
|
641
|
+
ids = [convert_id(_id) for _id in ids]
|
|
642
|
+
records = self._client.retrieve(
|
|
643
|
+
collection_name=self.index,
|
|
644
|
+
ids=ids,
|
|
645
|
+
with_payload=True,
|
|
646
|
+
with_vectors=True,
|
|
647
|
+
)
|
|
648
|
+
|
|
649
|
+
for record in records:
|
|
650
|
+
documents.append(
|
|
651
|
+
convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
652
|
+
)
|
|
653
|
+
return documents
|
|
654
|
+
|
|
655
|
+
async def get_documents_by_id_async(
|
|
656
|
+
self,
|
|
657
|
+
ids: List[str],
|
|
658
|
+
) -> List[Document]:
|
|
659
|
+
"""
|
|
660
|
+
Retrieves documents from Qdrant by their IDs.
|
|
661
|
+
|
|
662
|
+
:param ids:
|
|
663
|
+
A list of document IDs to retrieve.
|
|
664
|
+
:param index:
|
|
665
|
+
The name of the index to retrieve documents from.
|
|
666
|
+
:returns:
|
|
667
|
+
A list of documents.
|
|
668
|
+
"""
|
|
669
|
+
documents: List[Document] = []
|
|
670
|
+
|
|
671
|
+
await self._initialize_async_client()
|
|
672
|
+
assert self._async_client is not None
|
|
673
|
+
|
|
483
674
|
ids = [convert_id(_id) for _id in ids]
|
|
484
|
-
records = self.
|
|
675
|
+
records = await self._async_client.retrieve(
|
|
485
676
|
collection_name=self.index,
|
|
486
677
|
ids=ids,
|
|
487
678
|
with_payload=True,
|
|
@@ -527,6 +718,8 @@ class QdrantDocumentStore:
|
|
|
527
718
|
:raises QdrantStoreError:
|
|
528
719
|
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
529
720
|
"""
|
|
721
|
+
self._initialize_client()
|
|
722
|
+
assert self._client is not None
|
|
530
723
|
|
|
531
724
|
if not self.use_sparse_embeddings:
|
|
532
725
|
message = (
|
|
@@ -539,7 +732,7 @@ class QdrantDocumentStore:
|
|
|
539
732
|
query_indices = query_sparse_embedding.indices
|
|
540
733
|
query_values = query_sparse_embedding.values
|
|
541
734
|
if group_by:
|
|
542
|
-
groups = self.
|
|
735
|
+
groups = self._client.query_points_groups(
|
|
543
736
|
collection_name=self.index,
|
|
544
737
|
query=rest.SparseVector(
|
|
545
738
|
indices=query_indices,
|
|
@@ -553,17 +746,9 @@ class QdrantDocumentStore:
|
|
|
553
746
|
with_vectors=return_embedding,
|
|
554
747
|
score_threshold=score_threshold,
|
|
555
748
|
).groups
|
|
556
|
-
|
|
557
|
-
[
|
|
558
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
559
|
-
for group in groups
|
|
560
|
-
for point in group.hits
|
|
561
|
-
]
|
|
562
|
-
if groups
|
|
563
|
-
else []
|
|
564
|
-
)
|
|
749
|
+
return self._process_group_results(groups)
|
|
565
750
|
else:
|
|
566
|
-
points = self.
|
|
751
|
+
points = self._client.query_points(
|
|
567
752
|
collection_name=self.index,
|
|
568
753
|
query=rest.SparseVector(
|
|
569
754
|
indices=query_indices,
|
|
@@ -575,16 +760,7 @@ class QdrantDocumentStore:
|
|
|
575
760
|
with_vectors=return_embedding,
|
|
576
761
|
score_threshold=score_threshold,
|
|
577
762
|
).points
|
|
578
|
-
|
|
579
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
580
|
-
for point in points
|
|
581
|
-
]
|
|
582
|
-
if scale_score:
|
|
583
|
-
for document in results:
|
|
584
|
-
score = document.score
|
|
585
|
-
score = float(1 / (1 + np.exp(-score / 100)))
|
|
586
|
-
document.score = score
|
|
587
|
-
return results
|
|
763
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
588
764
|
|
|
589
765
|
def _query_by_embedding(
|
|
590
766
|
self,
|
|
@@ -616,9 +792,12 @@ class QdrantDocumentStore:
|
|
|
616
792
|
|
|
617
793
|
:returns: List of documents that are most similar to `query_embedding`.
|
|
618
794
|
"""
|
|
795
|
+
self._initialize_client()
|
|
796
|
+
assert self._client is not None
|
|
797
|
+
|
|
619
798
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
620
799
|
if group_by:
|
|
621
|
-
groups = self.
|
|
800
|
+
groups = self._client.query_points_groups(
|
|
622
801
|
collection_name=self.index,
|
|
623
802
|
query=query_embedding,
|
|
624
803
|
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
@@ -629,17 +808,10 @@ class QdrantDocumentStore:
|
|
|
629
808
|
with_vectors=return_embedding,
|
|
630
809
|
score_threshold=score_threshold,
|
|
631
810
|
).groups
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
635
|
-
for group in groups
|
|
636
|
-
for point in group.hits
|
|
637
|
-
]
|
|
638
|
-
if groups
|
|
639
|
-
else []
|
|
640
|
-
)
|
|
811
|
+
return self._process_group_results(groups)
|
|
812
|
+
|
|
641
813
|
else:
|
|
642
|
-
points = self.
|
|
814
|
+
points = self._client.query_points(
|
|
643
815
|
collection_name=self.index,
|
|
644
816
|
query=query_embedding,
|
|
645
817
|
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
@@ -648,20 +820,7 @@ class QdrantDocumentStore:
|
|
|
648
820
|
with_vectors=return_embedding,
|
|
649
821
|
score_threshold=score_threshold,
|
|
650
822
|
).points
|
|
651
|
-
|
|
652
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
653
|
-
for point in points
|
|
654
|
-
]
|
|
655
|
-
|
|
656
|
-
if scale_score:
|
|
657
|
-
for document in results:
|
|
658
|
-
score = document.score
|
|
659
|
-
if self.similarity == "cosine":
|
|
660
|
-
score = (score + 1) / 2
|
|
661
|
-
else:
|
|
662
|
-
score = float(1 / (1 + np.exp(-score / 100)))
|
|
663
|
-
document.score = score
|
|
664
|
-
return results
|
|
823
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
665
824
|
|
|
666
825
|
def _query_hybrid(
|
|
667
826
|
self,
|
|
@@ -702,6 +861,10 @@ class QdrantDocumentStore:
|
|
|
702
861
|
|
|
703
862
|
# This implementation is based on the code from the Python Qdrant client:
|
|
704
863
|
# https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
|
|
864
|
+
|
|
865
|
+
self._initialize_client()
|
|
866
|
+
assert self._client is not None
|
|
867
|
+
|
|
705
868
|
if not self.use_sparse_embeddings:
|
|
706
869
|
message = (
|
|
707
870
|
"You are trying to query using sparse embeddings, but the Document Store "
|
|
@@ -713,7 +876,7 @@ class QdrantDocumentStore:
|
|
|
713
876
|
|
|
714
877
|
try:
|
|
715
878
|
if group_by:
|
|
716
|
-
groups = self.
|
|
879
|
+
groups = self._client.query_points_groups(
|
|
717
880
|
collection_name=self.index,
|
|
718
881
|
prefetch=[
|
|
719
882
|
rest.Prefetch(
|
|
@@ -739,7 +902,7 @@ class QdrantDocumentStore:
|
|
|
739
902
|
with_vectors=return_embedding,
|
|
740
903
|
).groups
|
|
741
904
|
else:
|
|
742
|
-
points = self.
|
|
905
|
+
points = self._client.query_points(
|
|
743
906
|
collection_name=self.index,
|
|
744
907
|
prefetch=[
|
|
745
908
|
rest.Prefetch(
|
|
@@ -768,71 +931,339 @@ class QdrantDocumentStore:
|
|
|
768
931
|
raise QdrantStoreError(msg) from e
|
|
769
932
|
|
|
770
933
|
if group_by:
|
|
771
|
-
|
|
772
|
-
[
|
|
773
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
774
|
-
for group in groups
|
|
775
|
-
for point in group.hits
|
|
776
|
-
]
|
|
777
|
-
if groups
|
|
778
|
-
else []
|
|
779
|
-
)
|
|
934
|
+
return self._process_group_results(groups)
|
|
780
935
|
else:
|
|
781
|
-
|
|
936
|
+
return self._process_query_point_results(points)
|
|
782
937
|
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
938
|
+
async def _query_by_sparse_async(
|
|
939
|
+
self,
|
|
940
|
+
query_sparse_embedding: SparseEmbedding,
|
|
941
|
+
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
942
|
+
top_k: int = 10,
|
|
943
|
+
scale_score: bool = False,
|
|
944
|
+
return_embedding: bool = False,
|
|
945
|
+
score_threshold: Optional[float] = None,
|
|
946
|
+
group_by: Optional[str] = None,
|
|
947
|
+
group_size: Optional[int] = None,
|
|
948
|
+
) -> List[Document]:
|
|
786
949
|
"""
|
|
787
|
-
|
|
950
|
+
Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
951
|
+
|
|
952
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
953
|
+
:param filters: Filters applied to the retrieved documents.
|
|
954
|
+
:param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
955
|
+
groups to return.
|
|
956
|
+
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
957
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
958
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
959
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
960
|
+
depending on the Distance function used.
|
|
961
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
962
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
963
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
964
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
965
|
+
|
|
966
|
+
:returns: List of documents that are most similar to `query_sparse_embedding`.
|
|
788
967
|
|
|
789
|
-
:param similarity:
|
|
790
|
-
The similarity measure to retrieve the distance.
|
|
791
|
-
:returns:
|
|
792
|
-
The corresponding rest.Distance object.
|
|
793
968
|
:raises QdrantStoreError:
|
|
794
|
-
If the
|
|
969
|
+
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
795
970
|
"""
|
|
796
|
-
try:
|
|
797
|
-
return self.SIMILARITY[similarity]
|
|
798
|
-
except KeyError as ke:
|
|
799
|
-
msg = (
|
|
800
|
-
f"Provided similarity '{similarity}' is not supported by Qdrant "
|
|
801
|
-
f"document store. Please choose one of the options: "
|
|
802
|
-
f"{', '.join(self.SIMILARITY.keys())}"
|
|
803
|
-
)
|
|
804
|
-
raise QdrantStoreError(msg) from ke
|
|
805
971
|
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
Create payload index for the collection if payload_fields_to_index is provided
|
|
809
|
-
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
|
|
810
|
-
"""
|
|
811
|
-
if payload_fields_to_index is not None:
|
|
812
|
-
for payload_index in payload_fields_to_index:
|
|
813
|
-
self.client.create_payload_index(
|
|
814
|
-
collection_name=collection_name,
|
|
815
|
-
field_name=payload_index["field_name"],
|
|
816
|
-
field_schema=payload_index["field_schema"],
|
|
817
|
-
)
|
|
972
|
+
await self._initialize_async_client()
|
|
973
|
+
assert self._async_client is not None
|
|
818
974
|
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
975
|
+
if not self.use_sparse_embeddings:
|
|
976
|
+
message = (
|
|
977
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
978
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
979
|
+
)
|
|
980
|
+
raise QdrantStoreError(message)
|
|
981
|
+
|
|
982
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
983
|
+
query_indices = query_sparse_embedding.indices
|
|
984
|
+
query_values = query_sparse_embedding.values
|
|
985
|
+
if group_by:
|
|
986
|
+
response = await self._async_client.query_points_groups(
|
|
987
|
+
collection_name=self.index,
|
|
988
|
+
query=rest.SparseVector(
|
|
989
|
+
indices=query_indices,
|
|
990
|
+
values=query_values,
|
|
991
|
+
),
|
|
992
|
+
using=SPARSE_VECTORS_NAME,
|
|
993
|
+
query_filter=qdrant_filters,
|
|
994
|
+
limit=top_k,
|
|
995
|
+
group_by=group_by,
|
|
996
|
+
group_size=group_size,
|
|
997
|
+
with_vectors=return_embedding,
|
|
998
|
+
score_threshold=score_threshold,
|
|
999
|
+
)
|
|
1000
|
+
groups = response.groups
|
|
1001
|
+
return self._process_group_results(groups)
|
|
1002
|
+
else:
|
|
1003
|
+
response = await self._async_client.query_points(
|
|
1004
|
+
collection_name=self.index,
|
|
1005
|
+
query=rest.SparseVector(
|
|
1006
|
+
indices=query_indices,
|
|
1007
|
+
values=query_values,
|
|
1008
|
+
),
|
|
1009
|
+
using=SPARSE_VECTORS_NAME,
|
|
1010
|
+
query_filter=qdrant_filters,
|
|
1011
|
+
limit=top_k,
|
|
1012
|
+
with_vectors=return_embedding,
|
|
1013
|
+
score_threshold=score_threshold,
|
|
1014
|
+
)
|
|
1015
|
+
points = response.points
|
|
1016
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
1017
|
+
|
|
1018
|
+
async def _query_by_embedding_async(
|
|
1019
|
+
self,
|
|
1020
|
+
query_embedding: List[float],
|
|
1021
|
+
filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
|
|
1022
|
+
top_k: int = 10,
|
|
1023
|
+
scale_score: bool = False,
|
|
1024
|
+
return_embedding: bool = False,
|
|
1025
|
+
score_threshold: Optional[float] = None,
|
|
1026
|
+
group_by: Optional[str] = None,
|
|
1027
|
+
group_size: Optional[int] = None,
|
|
1028
|
+
) -> List[Document]:
|
|
1029
|
+
"""
|
|
1030
|
+
Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
1031
|
+
|
|
1032
|
+
:param query_embedding: Dense embedding of the query.
|
|
1033
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1034
|
+
:param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
|
|
1035
|
+
groups to return.
|
|
1036
|
+
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
1037
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
1038
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
1039
|
+
Score of the returned result might be higher or smaller than the threshold
|
|
1040
|
+
depending on the Distance function used.
|
|
1041
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
1042
|
+
:param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
|
|
1043
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
1044
|
+
:param group_size: Maximum amount of points to return per group. Default is 3.
|
|
1045
|
+
|
|
1046
|
+
:returns: List of documents that are most similar to `query_embedding`.
|
|
1047
|
+
"""
|
|
1048
|
+
await self._initialize_async_client()
|
|
1049
|
+
assert self._async_client is not None
|
|
1050
|
+
|
|
1051
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
1052
|
+
if group_by:
|
|
1053
|
+
response = await self._async_client.query_points_groups(
|
|
1054
|
+
collection_name=self.index,
|
|
1055
|
+
query=query_embedding,
|
|
1056
|
+
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
1057
|
+
query_filter=qdrant_filters,
|
|
1058
|
+
limit=top_k,
|
|
1059
|
+
group_by=group_by,
|
|
1060
|
+
group_size=group_size,
|
|
1061
|
+
with_vectors=return_embedding,
|
|
1062
|
+
score_threshold=score_threshold,
|
|
1063
|
+
)
|
|
1064
|
+
groups = response.groups
|
|
1065
|
+
return self._process_group_results(groups)
|
|
1066
|
+
else:
|
|
1067
|
+
response = await self._async_client.query_points(
|
|
1068
|
+
collection_name=self.index,
|
|
1069
|
+
query=query_embedding,
|
|
1070
|
+
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
1071
|
+
query_filter=qdrant_filters,
|
|
1072
|
+
limit=top_k,
|
|
1073
|
+
with_vectors=return_embedding,
|
|
1074
|
+
score_threshold=score_threshold,
|
|
1075
|
+
)
|
|
1076
|
+
points = response.points
|
|
1077
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
1078
|
+
|
|
1079
|
+
async def _query_hybrid_async(
    self,
    query_embedding: List[float],
    query_sparse_embedding: SparseEmbedding,
    filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
    top_k: int = 10,
    return_embedding: bool = False,
    score_threshold: Optional[float] = None,
    group_by: Optional[str] = None,
    group_size: Optional[int] = None,
) -> List[Document]:
    """
    Asynchronously runs a hybrid query: one sparse and one dense prefetch whose rankings are
    merged with Reciprocal Rank Fusion.

    This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
    Use the `QdrantHybridRetriever` instead.

    :param query_embedding: Dense embedding of the query.
    :param query_sparse_embedding: Sparse embedding of the query.
    :param filters: Filters applied to the retrieved documents. The converted filter is attached to
        both prefetch branches.
    :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
        groups to return.
    :param return_embedding: Whether to return the embeddings of the retrieved documents.
    :param score_threshold: A minimal score threshold for the result.
        Score of the returned result might be higher or smaller than the threshold
        depending on the Distance function used.
        E.g. for cosine similarity only higher scores will be returned.
    :param group_by: Payload field to group by, must be a string or number field. If the field contains more
        than 1 value, all values will be used for grouping. One point can be in multiple groups.
    :param group_size: Maximum amount of points to return per group. Forwarded unchanged to Qdrant
        (defaults to `None` in this signature).

    :returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.

    :raises QdrantStoreError:
        If the Document Store was initialized with `use_sparse_embeddings=False`, or if building or
        executing the Qdrant query fails (the original exception is chained as the cause).
    """
    await self._initialize_async_client()
    assert self._async_client is not None

    if not self.use_sparse_embeddings:
        message = (
            "You are trying to query using sparse embeddings, but the Document Store "
            "was initialized with `use_sparse_embeddings=False`. "
        )
        raise QdrantStoreError(message)

    qdrant_filters = convert_filters_to_qdrant(filters)

    groups = None
    points = None
    try:
        # Request construction stays inside the `try` so that any failure while building the
        # query objects is reported the same way as a failure of the query itself.
        sparse_branch = rest.Prefetch(
            query=rest.SparseVector(
                indices=query_sparse_embedding.indices,
                values=query_sparse_embedding.values,
            ),
            using=SPARSE_VECTORS_NAME,
            filter=qdrant_filters,
        )
        dense_branch = rest.Prefetch(
            query=query_embedding,
            using=DENSE_VECTORS_NAME,
            filter=qdrant_filters,
        )
        # Arguments shared by the grouped and the plain query.
        shared_kwargs = {
            "collection_name": self.index,
            "prefetch": [sparse_branch, dense_branch],
            "query": rest.FusionQuery(fusion=rest.Fusion.RRF),
            "limit": top_k,
            "score_threshold": score_threshold,
            "with_payload": True,
            "with_vectors": return_embedding,
        }

        if group_by:
            grouped_response = await self._async_client.query_points_groups(
                group_by=group_by,
                group_size=group_size,
                **shared_kwargs,
            )
            groups = grouped_response.groups
        else:
            plain_response = await self._async_client.query_points(**shared_kwargs)
            points = plain_response.points

    except Exception as e:
        msg = "Error during hybrid search"
        raise QdrantStoreError(msg) from e

    # Result conversion happens outside the `try`, so its errors are not re-wrapped.
    if group_by:
        return self._process_group_results(groups)
    return self._process_query_point_results(points)
|
|
1191
|
+
|
|
1192
|
+
def get_distance(self, similarity: str) -> rest.Distance:
    """
    Looks up the distance metric registered for a similarity name.

    :param similarity:
        Name of the similarity measure; must be a key of `self.SIMILARITY`.
    :returns:
        The value stored in `self.SIMILARITY` for that name.
    :raises QdrantStoreError:
        If `similarity` is not a key of `self.SIMILARITY`. The message lists the supported names.
    """
    try:
        distance = self.SIMILARITY[similarity]
    except KeyError as ke:
        msg = (
            f"Provided similarity '{similarity}' is not supported by Qdrant "
            f"document store. Please choose one of the options: "
            f"{', '.join(self.SIMILARITY.keys())}"
        )
        raise QdrantStoreError(msg) from ke
    else:
        return distance
|
|
1212
|
+
|
|
1213
|
+
def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
    """
    Creates one payload index per entry of `payload_fields_to_index`; does nothing when it is `None`.
    See: https://qdrant.tech/documentation/concepts/indexing/#payload-index

    :param collection_name:
        Name of the collection the indexes are created on.
    :param payload_fields_to_index:
        Entries providing the `"field_name"` and `"field_schema"` keys that are forwarded to the client.
    """
    if payload_fields_to_index is None:
        return

    for field_spec in payload_fields_to_index:
        # The synchronous client is expected to be set up already: callers run
        # `_initialize_client()` before invoking this helper.
        assert self._client is not None

        field_name = field_spec["field_name"]
        field_schema = field_spec["field_schema"]
        self._client.create_payload_index(
            collection_name=collection_name,
            field_name=field_name,
            field_schema=field_schema,
        )
|
|
1229
|
+
|
|
1230
|
+
async def _create_payload_index_async(
    self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None
):
    """
    Asynchronously creates one payload index per entry of `payload_fields_to_index`; does nothing
    when it is `None`.
    See: https://qdrant.tech/documentation/concepts/indexing/#payload-index

    :param collection_name:
        Name of the collection the indexes are created on.
    :param payload_fields_to_index:
        Entries providing the `"field_name"` and `"field_schema"` keys that are forwarded to the client.
    """
    if payload_fields_to_index is None:
        return

    for field_spec in payload_fields_to_index:
        # The async client is expected to be set up already: callers await
        # `_initialize_async_client()` before invoking this helper.
        assert self._async_client is not None

        field_name = field_spec["field_name"]
        field_schema = field_spec["field_schema"]
        await self._async_client.create_payload_index(
            collection_name=collection_name,
            field_name=field_name,
            field_schema=field_schema,
        )
|
|
1249
|
+
|
|
1250
|
+
def _set_up_collection(
|
|
1251
|
+
self,
|
|
1252
|
+
collection_name: str,
|
|
1253
|
+
embedding_dim: int,
|
|
1254
|
+
recreate_collection: bool,
|
|
1255
|
+
similarity: str,
|
|
1256
|
+
use_sparse_embeddings: bool,
|
|
1257
|
+
sparse_idf: bool,
|
|
1258
|
+
on_disk: bool = False,
|
|
1259
|
+
payload_fields_to_index: Optional[List[dict]] = None,
|
|
1260
|
+
):
|
|
1261
|
+
"""
|
|
1262
|
+
Sets up the Qdrant collection with the specified parameters.
|
|
1263
|
+
:param collection_name:
|
|
1264
|
+
The name of the collection to set up.
|
|
1265
|
+
:param embedding_dim:
|
|
1266
|
+
The dimension of the embeddings.
|
|
836
1267
|
:param recreate_collection:
|
|
837
1268
|
Whether to recreate the collection if it already exists.
|
|
838
1269
|
:param similarity:
|
|
@@ -852,9 +1283,13 @@ class QdrantDocumentStore:
|
|
|
852
1283
|
If the collection exists with a different similarity measure or embedding dimension.
|
|
853
1284
|
|
|
854
1285
|
"""
|
|
1286
|
+
|
|
1287
|
+
self._initialize_client()
|
|
1288
|
+
assert self._client is not None
|
|
1289
|
+
|
|
855
1290
|
distance = self.get_distance(similarity)
|
|
856
1291
|
|
|
857
|
-
if recreate_collection or not self.
|
|
1292
|
+
if recreate_collection or not self._client.collection_exists(collection_name):
|
|
858
1293
|
# There is no need to verify the current configuration of that
|
|
859
1294
|
# collection. It might be just recreated again or does not exist yet.
|
|
860
1295
|
self.recreate_collection(
|
|
@@ -864,64 +1299,65 @@ class QdrantDocumentStore:
|
|
|
864
1299
|
self._create_payload_index(collection_name, payload_fields_to_index)
|
|
865
1300
|
return
|
|
866
1301
|
|
|
867
|
-
collection_info = self.
|
|
1302
|
+
collection_info = self._client.get_collection(collection_name)
|
|
868
1303
|
|
|
869
|
-
|
|
1304
|
+
self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
|
|
870
1305
|
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
1306
|
+
async def _set_up_collection_async(
    self,
    collection_name: str,
    embedding_dim: int,
    recreate_collection: bool,
    similarity: str,
    use_sparse_embeddings: bool,
    sparse_idf: bool,
    on_disk: bool = False,
    payload_fields_to_index: Optional[List[dict]] = None,
):
    """
    Asynchronously makes sure the Qdrant collection exists and matches the requested settings.

    A missing collection (or `recreate_collection=True`) leads to a fresh collection plus its payload
    indexes; an existing one is only checked for compatibility.

    :param collection_name:
        The name of the collection to set up.
    :param embedding_dim:
        The dimension of the embeddings.
    :param recreate_collection:
        Whether to recreate the collection if it already exists.
    :param similarity:
        The similarity measure to use.
    :param use_sparse_embeddings:
        Whether to use sparse embeddings.
    :param sparse_idf:
        Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
    :param on_disk:
        Whether to store the collection on disk.
    :param payload_fields_to_index:
        List of payload fields to index.

    :raises QdrantStoreError:
        If the collection exists with incompatible settings.
    :raises ValueError:
        If the collection exists with a different similarity measure or embedding dimension.
    """
    await self._initialize_async_client()
    client = self._async_client
    assert client is not None

    distance = self.get_distance(similarity)

    # `recreate_collection=True` short-circuits, so the existence check is skipped in that case.
    needs_fresh_collection = recreate_collection or not await client.collection_exists(collection_name)

    if not needs_fresh_collection:
        # The collection is kept as is: only verify that it fits the requested configuration.
        collection_info = await client.get_collection(collection_name)
        self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
        return

    # There is no need to verify the current configuration of that
    # collection. It might be just recreated again or does not exist yet.
    await self.recreate_collection_async(
        collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
    )
    # Create Payload index if payload_fields_to_index is provided
    await self._create_payload_index_async(collection_name, payload_fields_to_index)
|
|
925
1361
|
|
|
926
1362
|
def recreate_collection(
|
|
927
1363
|
self,
|
|
@@ -948,44 +1384,65 @@ class QdrantDocumentStore:
|
|
|
948
1384
|
:param sparse_idf:
|
|
949
1385
|
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
950
1386
|
"""
|
|
951
|
-
|
|
952
|
-
on_disk
|
|
1387
|
+
vectors_config, sparse_vectors_config = self._prepare_collection_config(
|
|
1388
|
+
embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
|
|
1389
|
+
)
|
|
1390
|
+
collection_params = self._prepare_collection_params()
|
|
953
1391
|
|
|
954
|
-
|
|
955
|
-
|
|
1392
|
+
self._initialize_client()
|
|
1393
|
+
assert self._client is not None
|
|
956
1394
|
|
|
957
|
-
|
|
958
|
-
|
|
1395
|
+
if self._client.collection_exists(collection_name):
|
|
1396
|
+
self._client.delete_collection(collection_name)
|
|
959
1397
|
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
vectors_config
|
|
1398
|
+
self._client.create_collection(
|
|
1399
|
+
collection_name=collection_name,
|
|
1400
|
+
vectors_config=vectors_config,
|
|
1401
|
+
sparse_vectors_config=sparse_vectors_config,
|
|
1402
|
+
**collection_params,
|
|
1403
|
+
)
|
|
963
1404
|
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
1405
|
+
async def recreate_collection_async(
    self,
    collection_name: str,
    distance,
    embedding_dim: int,
    on_disk: Optional[bool] = None,
    use_sparse_embeddings: Optional[bool] = None,
    sparse_idf: bool = False,
):
    """
    Asynchronously drops the collection if it is present and creates it again.

    :param collection_name:
        The name of the collection to recreate.
    :param distance:
        The distance metric to use for the collection.
    :param embedding_dim:
        The dimension of the embeddings.
    :param on_disk:
        Whether to store the collection on disk.
    :param use_sparse_embeddings:
        Whether to use sparse embeddings.
    :param sparse_idf:
        Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
    """
    # Both configuration pieces are prepared before the client is touched.
    dense_config, sparse_config = self._prepare_collection_config(
        embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
    )
    extra_params = self._prepare_collection_params()

    await self._initialize_async_client()
    client = self._async_client
    assert client is not None

    already_present = await client.collection_exists(collection_name)
    if already_present:
        await client.delete_collection(collection_name)

    await client.create_collection(
        collection_name=collection_name,
        vectors_config=dense_config,
        sparse_vectors_config=sparse_config,
        **extra_params,
    )
|
|
990
1447
|
|
|
991
1448
|
def _handle_duplicate_documents(
|
|
@@ -1015,12 +1472,38 @@ class QdrantDocumentStore:
|
|
|
1015
1472
|
|
|
1016
1473
|
return documents
|
|
1017
1474
|
|
|
1018
|
-
def
|
|
1475
|
+
async def _handle_duplicate_documents_async(
    self,
    documents: List[Document],
    policy: Optional[DuplicatePolicy] = None,
):
    """
    Asynchronously checks whether any of the passed documents is already existing
    in the chosen index and returns a list of
    documents that are not in the index yet.

    Only `DuplicatePolicy.SKIP` and `DuplicatePolicy.FAIL` trigger the check; for any other
    policy (including `None`) the input list is returned unchanged.

    :param documents: A list of Haystack Document objects.
    :param policy: The duplicate policy to use when writing documents.
    :returns: A list of Haystack Document objects.
    :raises DuplicateDocumentError:
        If `policy` is `DuplicatePolicy.FAIL` and at least one document id is already stored.
    """
    if policy not in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
        return documents

    # Remove duplicates inside the batch first, then look the remaining ids up in the store.
    documents = self._drop_duplicate_documents(documents)
    documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
    ids_exist_in_db: List[str] = [doc.id for doc in documents_found]

    if ids_exist_in_db and policy == DuplicatePolicy.FAIL:
        msg = f"Document with ids '{', '.join(ids_exist_in_db)}' already exists in index = '{self.index}'."
        raise DuplicateDocumentError(msg)

    # A set keeps the membership test constant-time per document.
    known_ids = set(ids_exist_in_db)
    return [doc for doc in documents if doc.id not in known_ids]
|
|
1502
|
+
|
|
1503
|
+
def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document]:
|
|
1504
|
+
"""
|
|
1505
|
+
Drop duplicate documents based on same hash ID.
|
|
1506
|
+
|
|
1024
1507
|
"""
|
|
1025
1508
|
_hash_ids: Set = set()
|
|
1026
1509
|
_documents: List[Document] = []
|
|
@@ -1037,3 +1520,193 @@ class QdrantDocumentStore:
|
|
|
1037
1520
|
_hash_ids.add(document.id)
|
|
1038
1521
|
|
|
1039
1522
|
return _documents
|
|
1523
|
+
|
|
1524
|
+
def _prepare_collection_params(self):
    """
    Collects the store attributes that are passed as keyword arguments when a collection is created.

    :returns:
        A dictionary mapping each parameter name to the attribute of the same name on this store.
    """
    param_names = (
        "shard_number",
        "replication_factor",
        "write_consistency_factor",
        "on_disk_payload",
        "hnsw_config",
        "optimizers_config",
        "wal_config",
        "quantization_config",
        "init_from",
    )
    return {name: getattr(self, name) for name in param_names}
|
|
1539
|
+
|
|
1540
|
+
def _prepare_client_params(self):
    """
    Collects the store attributes that are passed as keyword arguments when a client is created.

    :returns:
        A dictionary of connection settings. `api_key` holds the result of `resolve_value()` on the
        stored key, or `None` when no key is set; every other entry is the attribute of the same name.
    """
    secret = self.api_key
    resolved_api_key = secret.resolve_value() if secret else None

    # The two groups keep `api_key` at the same position as in a literal written out by hand.
    before_api_key = ("location", "url", "port", "grpc_port", "prefer_grpc", "https")
    after_api_key = ("prefix", "timeout", "host", "path", "metadata", "force_disable_check_same_thread")

    params = {name: getattr(self, name) for name in before_api_key}
    params["api_key"] = resolved_api_key
    for name in after_api_key:
        params[name] = getattr(self, name)
    return params
|
|
1560
|
+
|
|
1561
|
+
def _prepare_collection_config(
    self,
    embedding_dim: int,
    distance,
    on_disk: Optional[bool] = None,
    use_sparse_embeddings: Optional[bool] = None,
    sparse_idf: bool = False,
):
    """
    Builds the dense and sparse vector configuration for creating or recreating a Qdrant collection.

    :param embedding_dim:
        Size of the dense vectors.
    :param distance:
        Distance metric of the dense vectors.
    :param on_disk:
        On-disk flag for the dense vectors and the sparse index. `None` falls back to `self.on_disk`.
    :param use_sparse_embeddings:
        Whether sparse vectors are configured. `None` falls back to `self.use_sparse_embeddings`.
    :param sparse_idf:
        Whether the sparse vectors get the IDF modifier.
    :returns:
        A `(vectors_config, sparse_vectors_config)` tuple. Without sparse embeddings this is the plain
        dense parameters and `None`; with them, both elements are dictionaries keyed by vector name.
    """
    store_on_disk = self.on_disk if on_disk is None else on_disk
    sparse_enabled = self.use_sparse_embeddings if use_sparse_embeddings is None else use_sparse_embeddings

    # dense vectors configuration
    dense_params = rest.VectorParams(size=embedding_dim, on_disk=store_on_disk, distance=distance)

    if not sparse_enabled:
        return dense_params, None

    # in this case, we need to define named vectors
    sparse_params = rest.SparseVectorParams(
        index=rest.SparseIndexParams(on_disk=store_on_disk),
        modifier=rest.Modifier.IDF if sparse_idf else None,
    )
    return {DENSE_VECTORS_NAME: dense_params}, {SPARSE_VECTORS_NAME: sparse_params}
|
|
1597
|
+
|
|
1598
|
+
def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None):
    """
    Checks that the filters passed to a query have a supported type and shape.

    Falsy filters (`None`, empty dict) and `rest.Filter` instances are accepted as they are.

    :param filters:
        The filters to validate.
    :raises ValueError:
        If `filters` is neither a dictionary nor a `rest.Filter`, or if it is a dictionary
        without an `"operator"` key.
    """
    if not filters:
        return

    if isinstance(filters, rest.Filter):
        return

    if not isinstance(filters, dict):
        msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
        raise ValueError(msg)

    if "operator" not in filters:
        msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
        raise ValueError(msg)
|
|
1609
|
+
|
|
1610
|
+
def _process_query_point_results(self, results, scale_score: bool = False):
    """
    Converts the points returned by a Qdrant query into Haystack documents.

    :param results:
        Iterable of points returned by Qdrant.
    :param scale_score:
        Whether to rescale each document score: `(score + 1) / 2` when `self.similarity` is
        `"cosine"`, otherwise a sigmoid of `score / 100`.
    :returns:
        The converted documents, in the order of `results`.
    """
    converted = [
        convert_qdrant_point_to_haystack_document(hit, use_sparse_embeddings=self.use_sparse_embeddings)
        for hit in results
    ]

    if not scale_score:
        return converted

    for doc in converted:
        raw_score = doc.score
        if self.similarity == "cosine":
            doc.score = (raw_score + 1) / 2
        else:
            doc.score = float(1 / (1 + np.exp(-raw_score / 100)))

    return converted
|
|
1629
|
+
|
|
1630
|
+
def _process_group_results(self, groups):
    """
    Flattens grouped query results from Qdrant into a single list of Haystack documents.

    :param groups:
        Groups returned by Qdrant, each exposing its points as `hits`. May be `None` or empty.
    :returns:
        The converted documents of all groups, group after group; an empty list when there are no groups.
    """
    if not groups:
        return []

    documents = []
    for group in groups:
        for hit in group.hits:
            documents.append(
                convert_qdrant_point_to_haystack_document(hit, use_sparse_embeddings=self.use_sparse_embeddings)
            )
    return documents
|
|
1643
|
+
|
|
1644
|
+
def _validate_collection_compatibility(
    self,
    collection_name: str,
    collection_info,
    distance,
    embedding_dim: int,
):
    """
    Validates that an existing collection is compatible with the current configuration.

    :param collection_name:
        Name of the collection; only used in the error messages.
    :param collection_info:
        Collection description whose `config.params.vectors` is either a dictionary of named vector
        configurations or a single configuration exposing `distance` and `size`.
    :param distance:
        The distance metric the store expects.
    :param embedding_dim:
        The dense vector size the store expects.

    :raises QdrantStoreError:
        If the collection uses named vectors without the dense vector name, or if its use of named
        vectors does not match `self.use_sparse_embeddings`.
    :raises ValueError:
        If the collection exists with a different similarity measure or embedding dimension.
    """
    vectors = collection_info.config.params.vectors
    has_named_vectors = isinstance(vectors, dict)

    if has_named_vectors and DENSE_VECTORS_NAME not in vectors:
        msg = (
            f"Collection '{collection_name}' already exists in Qdrant, "
            f"but it has been originally created outside of Haystack and is not supported. "
            f"If possible, you should create a new Document Store with Haystack. "
            f"In case you want to migrate the existing collection, see an example script in "
            f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
            f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
        )
        raise QdrantStoreError(msg)

    if self.use_sparse_embeddings and not has_named_vectors:
        msg = (
            f"Collection '{collection_name}' already exists in Qdrant, "
            f"but it has been originally created without sparse embedding vectors. "
            f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
            f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
            f"See `migrate_to_sparse_embeddings_support` function in "
            f"`haystack_integrations.document_stores.qdrant`."
        )
        raise QdrantStoreError(msg)

    if not self.use_sparse_embeddings and has_named_vectors:
        # A separating space after the first sentence keeps the message readable.
        msg = (
            f"Collection '{collection_name}' already exists in Qdrant, "
            f"but it has been originally created with sparse embedding vectors. "
            f"If you want to use that collection, please set `use_sparse_embeddings=True`."
        )
        raise QdrantStoreError(msg)

    # With sparse embeddings the dense settings live under the dense vector name;
    # otherwise the collection has a single unnamed vector configuration.
    dense_settings = vectors[DENSE_VECTORS_NAME] if self.use_sparse_embeddings else vectors
    current_distance = dense_settings.distance
    current_vector_size = dense_settings.size

    # Validate distance metric
    if current_distance != distance:
        msg = (
            f"Collection '{collection_name}' already exists in Qdrant, "
            f"but it is configured with a similarity '{current_distance.name}'. "
            f"If you want to use that collection, but with a different "
            f"similarity, please set `recreate_collection=True` argument."
        )
        raise ValueError(msg)

    # Validate embedding dimension
    if current_vector_size != embedding_dim:
        msg = (
            f"Collection '{collection_name}' already exists in Qdrant, "
            f"but it is configured with a vector size '{current_vector_size}'. "
            f"If you want to use that collection, but with a different "
            f"vector size, please set `recreate_collection=True` argument."
        )
        raise ValueError(msg)
|