qdrant-haystack 9.0.0__py3-none-any.whl → 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_integrations/components/retrievers/qdrant/retriever.py +142 -0
- haystack_integrations/document_stores/qdrant/document_store.py +901 -227
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/METADATA +2 -1
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/RECORD +6 -6
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/WHEEL +0 -0
- {qdrant_haystack-9.0.0.dist-info → qdrant_haystack-9.1.1.dist-info}/licenses/LICENSE.txt +0 -0
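
The substance of the release is in `haystack_integrations/document_stores/qdrant/document_store.py`, which is where all of the hunks below come from: the Qdrant client is now built lazily from `_prepare_client_params()` via `_initialize_client()`, an `AsyncQdrantClient` counterpart is added, and the public API gains async variants (`count_documents_async`, `filter_documents_async`, `write_documents_async`, `delete_documents_async`, `get_documents_by_id_async`). A minimal usage sketch of the new async surface, assuming it mirrors the synchronous methods; the in-memory location, embedding size, and document content here are illustrative and not taken from the diff:

```python
import asyncio

from haystack import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


async def main():
    # Illustrative local configuration; any QdrantDocumentStore settings apply.
    store = QdrantDocumentStore(location=":memory:", embedding_dim=4, recreate_index=True)

    # The AsyncQdrantClient is created lazily on the first *_async call.
    await store.write_documents_async([Document(content="hello", embedding=[0.1, 0.2, 0.3, 0.4])])
    print(await store.count_documents_async())  # expected: 1


asyncio.run(main())
```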
@@ -1,6 +1,6 @@
 import inspect
 from itertools import islice
-from typing import Any, ClassVar, Dict, Generator, List, Optional, Set, Union
+from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Union

 import numpy as np
 import qdrant_client
@@ -216,6 +216,7 @@ class QdrantDocumentStore:
         """

         self._client = None
+        self._async_client = None

         # Store the Qdrant client specific attributes
         self.location = location
@@ -257,24 +258,10 @@ class QdrantDocumentStore:
         self.write_batch_size = write_batch_size
         self.scroll_size = scroll_size

-
-
-
-        self._client = qdrant_client.QdrantClient(
-            location=self.location,
-            url=self.url,
-            port=self.port,
-            grpc_port=self.grpc_port,
-            prefer_grpc=self.prefer_grpc,
-            https=self.https,
-            api_key=self.api_key.resolve_value() if self.api_key else None,
-            prefix=self.prefix,
-            timeout=self.timeout,
-            host=self.host,
-            path=self.path,
-            metadata=self.metadata,
-            force_disable_check_same_thread=self.force_disable_check_same_thread,
-        )
+    def _initialize_client(self):
+        if self._client is None:
+            client_params = self._prepare_client_params()
+            self._client = qdrant_client.QdrantClient(**client_params)
         # Make sure the collection is properly set up
         self._set_up_collection(
             self.index,
@@ -286,14 +273,52 @@ class QdrantDocumentStore:
             self.on_disk,
             self.payload_fields_to_index,
         )
-
+
+    async def _initialize_async_client(self):
+        """
+        Returns the asynchronous Qdrant client, initializing it if necessary.
+        """
+        if self._async_client is None:
+            client_params = self._prepare_client_params()
+            self._async_client = qdrant_client.AsyncQdrantClient(
+                **client_params,
+            )
+            await self._set_up_collection_async(
+                self.index,
+                self.embedding_dim,
+                self.recreate_index,
+                self.similarity,
+                self.use_sparse_embeddings,
+                self.sparse_idf,
+                self.on_disk,
+                self.payload_fields_to_index,
+            )

     def count_documents(self) -> int:
         """
         Returns the number of documents present in the Document Store.
         """
+        self._initialize_client()
+        assert self._client is not None
+        try:
+            response = self._client.count(
+                collection_name=self.index,
+            )
+            return response.count
+        except (UnexpectedResponse, ValueError):
+            # Qdrant local raises ValueError if the collection is not found, but
+            # with the remote server UnexpectedResponse is raised. Until that's unified,
+            # we need to catch both.
+            return 0
+
+    async def count_documents_async(self) -> int:
+        """
+        Asynchronously returns the number of documents present in the document store.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
         try:
-            response = self.
+            response = await self._async_client.count(
                 collection_name=self.index,
             )
             return response.count
@@ -316,19 +341,29 @@ class QdrantDocumentStore:
         :param filters: The filters to apply to the document list.
         :returns: A list of documents that match the given filters.
         """
-
-
-            raise ValueError(msg)
+        # No need to initialize client here as _get_documents_generator
+        # will handle client initialization internally

-
-            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
-            raise ValueError(msg)
+        self._validate_filters(filters)
         return list(
-            self.
+            self._get_documents_generator(
                 filters,
             )
         )

+    async def filter_documents_async(
+        self,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously returns the documents that match the provided filters.
+        """
+        # No need to initialize client here as _get_documents_generator_async
+        # will handle client initialization internally
+
+        self._validate_filters(filters)
+        return [doc async for doc in self._get_documents_generator_async(filters)]
+
     def write_documents(
         self,
         documents: List[Document],
@@ -347,13 +382,14 @@ class QdrantDocumentStore:

         :returns: The number of documents written to the document store.
         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         for doc in documents:
             if not isinstance(doc, Document):
                 msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
                 raise ValueError(msg)
-        self._set_up_collection(
-            self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
-        )

         if len(documents) == 0:
             logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
@@ -372,7 +408,61 @@ class QdrantDocumentStore:
                     use_sparse_embeddings=self.use_sparse_embeddings,
                 )

-                self.
+                self._client.upsert(
+                    collection_name=self.index,
+                    points=batch,
+                    wait=self.wait_result_from_api,
+                )
+
+                progress_bar.update(self.write_batch_size)
+        return len(document_objects)
+
+    async def write_documents_async(
+        self,
+        documents: List[Document],
+        policy: DuplicatePolicy = DuplicatePolicy.FAIL,
+    ) -> int:
+        """
+        Asynchronously writes documents to Qdrant using the specified policy.
+        The QdrantDocumentStore can handle duplicate documents based on the given policy.
+        The available policies are:
+        - `FAIL`: The operation will raise an error if any document already exists.
+        - `OVERWRITE`: Existing documents will be overwritten with the new ones.
+        - `SKIP`: Existing documents will be skipped, and only new documents will be added.
+
+        :param documents: A list of Document objects to write to Qdrant.
+        :param policy: The policy for handling duplicate documents.
+
+        :returns: The number of documents written to the document store.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        for doc in documents:
+            if not isinstance(doc, Document):
+                msg = f"""DocumentStore.write_documents_async() expects a list of
+                Documents but got an element of {type(doc)}."""
+                raise ValueError(msg)
+
+        if len(documents) == 0:
+            logger.warning("Calling QdrantDocumentStore.write_documents_async() with empty list")
+            return 0
+
+        document_objects = await self._handle_duplicate_documents_async(
+            documents=documents,
+            policy=policy,
+        )
+
+        batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
+        with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
+            for document_batch in batched_documents:
+                batch = convert_haystack_documents_to_qdrant_points(
+                    document_batch,
+                    use_sparse_embeddings=self.use_sparse_embeddings,
+                )
+
+                await self._async_client.upsert(
                     collection_name=self.index,
                     points=batch,
                     wait=self.wait_result_from_api,
@@ -387,9 +477,13 @@ class QdrantDocumentStore:

         :param document_ids: the document ids to delete
         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         ids = [convert_id(_id) for _id in document_ids]
         try:
-            self.
+            self._client.delete(
                 collection_name=self.index,
                 points_selector=ids,
                 wait=self.wait_result_from_api,
@@ -399,6 +493,28 @@ class QdrantDocumentStore:
                 "Called QdrantDocumentStore.delete_documents() on a non-existing ID",
             )

+    async def delete_documents_async(self, document_ids: List[str]) -> None:
+        """
+        Asynchronously deletes documents that match the provided `document_ids` from the document store.
+
+        :param document_ids: the document ids to delete
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        ids = [convert_id(_id) for _id in document_ids]
+        try:
+            await self._async_client.delete(
+                collection_name=self.index,
+                points_selector=ids,
+                wait=self.wait_result_from_api,
+            )
+        except KeyError:
+            logger.warning(
+                "Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
+            )
+
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore":
         """
@@ -429,7 +545,7 @@ class QdrantDocumentStore:
             **init_params,
         )

-    def
+    def _get_documents_generator(
         self,
         filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
     ) -> Generator[Document, None, None]:
@@ -440,13 +556,53 @@ class QdrantDocumentStore:
         :returns: A generator that yields documents retrieved from Qdrant.
         """

+        self._initialize_client()
+        assert self._client is not None
+
+        index = self.index
+        qdrant_filters = convert_filters_to_qdrant(filters)
+
+        next_offset = None
+        stop_scrolling = False
+        while not stop_scrolling:
+            records, next_offset = self._client.scroll(
+                collection_name=index,
+                scroll_filter=qdrant_filters,
+                limit=self.scroll_size,
+                offset=next_offset,
+                with_payload=True,
+                with_vectors=True,
+            )
+            stop_scrolling = next_offset is None or (
+                isinstance(next_offset, grpc.PointId) and next_offset.num == 0 and next_offset.uuid == ""
+            )
+
+            for record in records:
+                yield convert_qdrant_point_to_haystack_document(
+                    record, use_sparse_embeddings=self.use_sparse_embeddings
+                )
+
+    async def _get_documents_generator_async(
+        self,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+    ) -> AsyncGenerator[Document, None]:
+        """
+        Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
+
+        :param filters: Filters applied to the retrieved documents.
+        :returns: An asynchronous generator that yields documents retrieved from Qdrant.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
         index = self.index
         qdrant_filters = convert_filters_to_qdrant(filters)

         next_offset = None
         stop_scrolling = False
         while not stop_scrolling:
-            records, next_offset = self.
+            records, next_offset = await self._async_client.scroll(
                 collection_name=index,
                 scroll_filter=qdrant_filters,
                 limit=self.scroll_size,
@@ -479,8 +635,44 @@ class QdrantDocumentStore:
         """
         documents: List[Document] = []

+        self._initialize_client()
+        assert self._client is not None
+
+        ids = [convert_id(_id) for _id in ids]
+        records = self._client.retrieve(
+            collection_name=self.index,
+            ids=ids,
+            with_payload=True,
+            with_vectors=True,
+        )
+
+        for record in records:
+            documents.append(
+                convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
+            )
+        return documents
+
+    async def get_documents_by_id_async(
+        self,
+        ids: List[str],
+    ) -> List[Document]:
+        """
+        Retrieves documents from Qdrant by their IDs.
+
+        :param ids:
+            A list of document IDs to retrieve.
+        :param index:
+            The name of the index to retrieve documents from.
+        :returns:
+            A list of documents.
+        """
+        documents: List[Document] = []
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
         ids = [convert_id(_id) for _id in ids]
-        records = self.
+        records = await self._async_client.retrieve(
             collection_name=self.index,
             ids=ids,
             with_payload=True,
@@ -526,6 +718,8 @@ class QdrantDocumentStore:
         :raises QdrantStoreError:
             If the Document Store was initialized with `use_sparse_embeddings=False`.
         """
+        self._initialize_client()
+        assert self._client is not None

         if not self.use_sparse_embeddings:
             message = (
@@ -538,7 +732,7 @@ class QdrantDocumentStore:
         query_indices = query_sparse_embedding.indices
         query_values = query_sparse_embedding.values
         if group_by:
-            groups = self.
+            groups = self._client.query_points_groups(
                 collection_name=self.index,
                 query=rest.SparseVector(
                     indices=query_indices,
@@ -552,17 +746,9 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).groups
-
-                [
-                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                    for group in groups
-                    for point in group.hits
-                ]
-                if groups
-                else []
-            )
+            return self._process_group_results(groups)
         else:
-            points = self.
+            points = self._client.query_points(
                 collection_name=self.index,
                 query=rest.SparseVector(
                     indices=query_indices,
@@ -574,16 +760,7 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).points
-
-                convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                for point in points
-            ]
-            if scale_score:
-                for document in results:
-                    score = document.score
-                    score = float(1 / (1 + np.exp(-score / 100)))
-                    document.score = score
-            return results
+            return self._process_query_point_results(points, scale_score=scale_score)

     def _query_by_embedding(
         self,
@@ -615,9 +792,12 @@ class QdrantDocumentStore:

         :returns: List of documents that are most similar to `query_embedding`.
         """
+        self._initialize_client()
+        assert self._client is not None
+
         qdrant_filters = convert_filters_to_qdrant(filters)
         if group_by:
-            groups = self.
+            groups = self._client.query_points_groups(
                 collection_name=self.index,
                 query=query_embedding,
                 using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -628,17 +808,10 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).groups
-
-
-                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                    for group in groups
-                    for point in group.hits
-                ]
-                if groups
-                else []
-            )
+            return self._process_group_results(groups)
+
         else:
-            points = self.
+            points = self._client.query_points(
                 collection_name=self.index,
                 query=query_embedding,
                 using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
@@ -647,20 +820,7 @@ class QdrantDocumentStore:
                 with_vectors=return_embedding,
                 score_threshold=score_threshold,
             ).points
-
-                convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                for point in points
-            ]
-
-            if scale_score:
-                for document in results:
-                    score = document.score
-                    if self.similarity == "cosine":
-                        score = (score + 1) / 2
-                    else:
-                        score = float(1 / (1 + np.exp(-score / 100)))
-                    document.score = score
-            return results
+            return self._process_query_point_results(points, scale_score=scale_score)

     def _query_hybrid(
         self,
@@ -701,6 +861,10 @@ class QdrantDocumentStore:

         # This implementation is based on the code from the Python Qdrant client:
         # https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
+
+        self._initialize_client()
+        assert self._client is not None
+
         if not self.use_sparse_embeddings:
             message = (
                 "You are trying to query using sparse embeddings, but the Document Store "
@@ -712,7 +876,7 @@ class QdrantDocumentStore:

         try:
             if group_by:
-                groups = self.
+                groups = self._client.query_points_groups(
                     collection_name=self.index,
                     prefetch=[
                         rest.Prefetch(
@@ -738,7 +902,7 @@ class QdrantDocumentStore:
                     with_vectors=return_embedding,
                 ).groups
             else:
-                points = self.
+                points = self._client.query_points(
                     collection_name=self.index,
                     prefetch=[
                         rest.Prefetch(
@@ -767,71 +931,339 @@ class QdrantDocumentStore:
             raise QdrantStoreError(msg) from e

         if group_by:
-
-                [
-                    convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
-                    for group in groups
-                    for point in group.hits
-                ]
-                if groups
-                else []
-            )
+            return self._process_group_results(groups)
         else:
-
+            return self._process_query_point_results(points)

-
-
-
+    async def _query_by_sparse_async(
+        self,
+        query_sparse_embedding: SparseEmbedding,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+        top_k: int = 10,
+        scale_score: bool = False,
+        return_embedding: bool = False,
+        score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
+    ) -> List[Document]:
         """
-
+        Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
+
+        :param query_sparse_embedding: Sparse embedding of the query.
+        :param filters: Filters applied to the retrieved documents.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+            groups to return.
+        :param scale_score: Whether to scale the scores of the retrieved documents.
+        :param return_embedding: Whether to return the embeddings of the retrieved documents.
+        :param score_threshold: A minimal score threshold for the result.
+            Score of the returned result might be higher or smaller than the threshold
+            depending on the Distance function used.
+            E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
+
+        :returns: List of documents that are most similar to `query_sparse_embedding`.

-        :param similarity:
-            The similarity measure to retrieve the distance.
-        :returns:
-            The corresponding rest.Distance object.
         :raises QdrantStoreError:
-            If the
+            If the Document Store was initialized with `use_sparse_embeddings=False`.
         """
-        try:
-            return self.SIMILARITY[similarity]
-        except KeyError as ke:
-            msg = (
-                f"Provided similarity '{similarity}' is not supported by Qdrant "
-                f"document store. Please choose one of the options: "
-                f"{', '.join(self.SIMILARITY.keys())}"
-            )
-            raise QdrantStoreError(msg) from ke

-
-
-        Create payload index for the collection if payload_fields_to_index is provided
-        See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
-        """
-        if payload_fields_to_index is not None:
-            for payload_index in payload_fields_to_index:
-                self.client.create_payload_index(
-                    collection_name=collection_name,
-                    field_name=payload_index["field_name"],
-                    field_schema=payload_index["field_schema"],
-                )
+        await self._initialize_async_client()
+        assert self._async_client is not None

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not self.use_sparse_embeddings:
+            message = (
+                "You are trying to query using sparse embeddings, but the Document Store "
+                "was initialized with `use_sparse_embeddings=False`. "
+            )
+            raise QdrantStoreError(message)
+
+        qdrant_filters = convert_filters_to_qdrant(filters)
+        query_indices = query_sparse_embedding.indices
+        query_values = query_sparse_embedding.values
+        if group_by:
+            response = await self._async_client.query_points_groups(
+                collection_name=self.index,
+                query=rest.SparseVector(
+                    indices=query_indices,
+                    values=query_values,
+                ),
+                using=SPARSE_VECTORS_NAME,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                group_by=group_by,
+                group_size=group_size,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            groups = response.groups
+            return self._process_group_results(groups)
+        else:
+            response = await self._async_client.query_points(
+                collection_name=self.index,
+                query=rest.SparseVector(
+                    indices=query_indices,
+                    values=query_values,
+                ),
+                using=SPARSE_VECTORS_NAME,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            points = response.points
+            return self._process_query_point_results(points, scale_score=scale_score)
+
+    async def _query_by_embedding_async(
+        self,
+        query_embedding: List[float],
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+        top_k: int = 10,
+        scale_score: bool = False,
+        return_embedding: bool = False,
+        score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
+
+        :param query_embedding: Dense embedding of the query.
+        :param filters: Filters applied to the retrieved documents.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+            groups to return.
+        :param scale_score: Whether to scale the scores of the retrieved documents.
+        :param return_embedding: Whether to return the embeddings of the retrieved documents.
+        :param score_threshold: A minimal score threshold for the result.
+            Score of the returned result might be higher or smaller than the threshold
+            depending on the Distance function used.
+            E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
+
+        :returns: List of documents that are most similar to `query_embedding`.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        qdrant_filters = convert_filters_to_qdrant(filters)
+        if group_by:
+            response = await self._async_client.query_points_groups(
+                collection_name=self.index,
+                query=query_embedding,
+                using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                group_by=group_by,
+                group_size=group_size,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            groups = response.groups
+            return self._process_group_results(groups)
+        else:
+            response = await self._async_client.query_points(
+                collection_name=self.index,
+                query=query_embedding,
+                using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
+                query_filter=qdrant_filters,
+                limit=top_k,
+                with_vectors=return_embedding,
+                score_threshold=score_threshold,
+            )
+            points = response.points
+            return self._process_query_point_results(points, scale_score=scale_score)
+
+    async def _query_hybrid_async(
+        self,
+        query_embedding: List[float],
+        query_sparse_embedding: SparseEmbedding,
+        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
+        top_k: int = 10,
+        return_embedding: bool = False,
+        score_threshold: Optional[float] = None,
+        group_by: Optional[str] = None,
+        group_size: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Asynchronously retrieves documents based on dense and sparse embeddings and fuses
+        the results using Reciprocal Rank Fusion.
+
+        This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
+        Use the `QdrantHybridRetriever` instead.
+
+        :param query_embedding: Dense embedding of the query.
+        :param query_sparse_embedding: Sparse embedding of the query.
+        :param filters: Filters applied to the retrieved documents.
+        :param top_k: Maximum number of documents to return. If using `group_by` parameters, maximum number of
+            groups to return.
+        :param return_embedding: Whether to return the embeddings of the retrieved documents.
+        :param score_threshold: A minimal score threshold for the result.
+            Score of the returned result might be higher or smaller than the threshold
+            depending on the Distance function used.
+            E.g. for cosine similarity only higher scores will be returned.
+        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
+            value, all values will be used for grouping. One point can be in multiple groups.
+        :param group_size: Maximum amount of points to return per group. Default is 3.
+
+        :returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
+
+        :raises QdrantStoreError:
+            If the Document Store was initialized with `use_sparse_embeddings=False`.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        if not self.use_sparse_embeddings:
+            message = (
+                "You are trying to query using sparse embeddings, but the Document Store "
+                "was initialized with `use_sparse_embeddings=False`. "
+            )
+            raise QdrantStoreError(message)
+
+        qdrant_filters = convert_filters_to_qdrant(filters)
+
+        try:
+            if group_by:
+                response = await self._async_client.query_points_groups(
+                    collection_name=self.index,
+                    prefetch=[
+                        rest.Prefetch(
+                            query=rest.SparseVector(
+                                indices=query_sparse_embedding.indices,
+                                values=query_sparse_embedding.values,
+                            ),
+                            using=SPARSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                        rest.Prefetch(
+                            query=query_embedding,
+                            using=DENSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                    ],
+                    query=rest.FusionQuery(fusion=rest.Fusion.RRF),
+                    limit=top_k,
+                    group_by=group_by,
+                    group_size=group_size,
+                    score_threshold=score_threshold,
+                    with_payload=True,
+                    with_vectors=return_embedding,
+                )
+                groups = response.groups
+            else:
+                response = await self._async_client.query_points(
+                    collection_name=self.index,
+                    prefetch=[
+                        rest.Prefetch(
+                            query=rest.SparseVector(
+                                indices=query_sparse_embedding.indices,
+                                values=query_sparse_embedding.values,
+                            ),
+                            using=SPARSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                        rest.Prefetch(
+                            query=query_embedding,
+                            using=DENSE_VECTORS_NAME,
+                            filter=qdrant_filters,
+                        ),
+                    ],
+                    query=rest.FusionQuery(fusion=rest.Fusion.RRF),
+                    limit=top_k,
+                    score_threshold=score_threshold,
+                    with_payload=True,
+                    with_vectors=return_embedding,
+                )
+                points = response.points
+
+        except Exception as e:
+            msg = "Error during hybrid search"
+            raise QdrantStoreError(msg) from e
+
+        if group_by:
+            return self._process_group_results(groups)
+        else:
+            return self._process_query_point_results(points)
+
+    def get_distance(self, similarity: str) -> rest.Distance:
+        """
+        Retrieves the distance metric for the specified similarity measure.
+
+        :param similarity:
+            The similarity measure to retrieve the distance.
+        :returns:
+            The corresponding rest.Distance object.
+        :raises QdrantStoreError:
+            If the provided similarity measure is not supported.
+        """
+        try:
+            return self.SIMILARITY[similarity]
+        except KeyError as ke:
+            msg = (
+                f"Provided similarity '{similarity}' is not supported by Qdrant "
+                f"document store. Please choose one of the options: "
+                f"{', '.join(self.SIMILARITY.keys())}"
+            )
+            raise QdrantStoreError(msg) from ke
+
+    def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
+        """
+        Create payload index for the collection if payload_fields_to_index is provided
+        See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
+        """
+        if payload_fields_to_index is not None:
+            for payload_index in payload_fields_to_index:
+                # self._client is initialized at this point
+                # since _initialize_client() is called before this method is executed
+
+                assert self._client is not None
+                self._client.create_payload_index(
+                    collection_name=collection_name,
+                    field_name=payload_index["field_name"],
+                    field_schema=payload_index["field_schema"],
+                )
+
+    async def _create_payload_index_async(
+        self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None
+    ):
+        """
+        Asynchronously create payload index for the collection if payload_fields_to_index is provided
+        See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
+        """
+        if payload_fields_to_index is not None:
+            for payload_index in payload_fields_to_index:
+
+                # self._async_client is initialized at this point
+                # since _initialize_async_client() is called before this method is executed
+                assert self._async_client is not None
+
+                await self._async_client.create_payload_index(
+                    collection_name=collection_name,
+                    field_name=payload_index["field_name"],
+                    field_schema=payload_index["field_schema"],
+                )
+
+    def _set_up_collection(
+        self,
+        collection_name: str,
+        embedding_dim: int,
+        recreate_collection: bool,
+        similarity: str,
+        use_sparse_embeddings: bool,
+        sparse_idf: bool,
+        on_disk: bool = False,
+        payload_fields_to_index: Optional[List[dict]] = None,
+    ):
+        """
+        Sets up the Qdrant collection with the specified parameters.
+        :param collection_name:
+            The name of the collection to set up.
+        :param embedding_dim:
+            The dimension of the embeddings.
         :param recreate_collection:
             Whether to recreate the collection if it already exists.
         :param similarity:
@@ -851,9 +1283,13 @@ class QdrantDocumentStore:
             If the collection exists with a different similarity measure or embedding dimension.

         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         distance = self.get_distance(similarity)

-        if recreate_collection or not self.
+        if recreate_collection or not self._client.collection_exists(collection_name):
             # There is no need to verify the current configuration of that
             # collection. It might be just recreated again or does not exist yet.
             self.recreate_collection(
@@ -863,64 +1299,65 @@ class QdrantDocumentStore:
             self._create_payload_index(collection_name, payload_fields_to_index)
             return

-        collection_info = self.
+        collection_info = self._client.get_collection(collection_name)

-
+        self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)

-
-
-
-
-
-
-
-
-
-
+    async def _set_up_collection_async(
+        self,
+        collection_name: str,
+        embedding_dim: int,
+        recreate_collection: bool,
+        similarity: str,
+        use_sparse_embeddings: bool,
+        sparse_idf: bool,
+        on_disk: bool = False,
+        payload_fields_to_index: Optional[List[dict]] = None,
+    ):
+        """
+        Asynchronously sets up the Qdrant collection with the specified parameters.
+        :param collection_name:
+            The name of the collection to set up.
+        :param embedding_dim:
+            The dimension of the embeddings.
+        :param recreate_collection:
+            Whether to recreate the collection if it already exists.
+        :param similarity:
+            The similarity measure to use.
+        :param use_sparse_embeddings:
+            Whether to use sparse embeddings.
+        :param sparse_idf:
+            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
+        :param on_disk:
+            Whether to store the collection on disk.
+        :param payload_fields_to_index:
+            List of payload fields to index.

-
-
-
-
-            f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
-            f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
-            f"See `migrate_to_sparse_embeddings_support` function in "
-            f"`haystack_integrations.document_stores.qdrant`."
-            )
-            raise QdrantStoreError(msg)
+        :raises QdrantStoreError:
+            If the collection exists with incompatible settings.
+        :raises ValueError:
+            If the collection exists with a different similarity measure or embedding dimension.

-
-            msg = (
-                f"Collection '{collection_name}' already exists in Qdrant, "
-                f"but it has been originally created with sparse embedding vectors."
-                f"If you want to use that collection, please set `use_sparse_embeddings=True`."
-            )
-            raise QdrantStoreError(msg)
+        """

-
-
-            current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
-        else:
-            current_distance = collection_info.config.params.vectors.distance
-            current_vector_size = collection_info.config.params.vectors.size
+        await self._initialize_async_client()
+        assert self._async_client is not None

-
-            msg = (
-                f"Collection '{collection_name}' already exists in Qdrant, "
-                f"but it is configured with a similarity '{current_distance.name}'. "
-                f"If you want to use that collection, but with a different "
-                f"similarity, please set `recreate_collection=True` argument."
-            )
-            raise ValueError(msg)
+        distance = self.get_distance(similarity)

-        if
-
-
-
-
-                f"vector size, please set `recreate_collection=True` argument."
+        if recreate_collection or not await self._async_client.collection_exists(collection_name):
+            # There is no need to verify the current configuration of that
+            # collection. It might be just recreated again or does not exist yet.
+            await self.recreate_collection_async(
+                collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
             )
-
+            # Create Payload index if payload_fields_to_index is provided
+            await self._create_payload_index_async(collection_name, payload_fields_to_index)
+            return
+
+        collection_info = await self._async_client.get_collection(collection_name)
+
+        self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)

     def recreate_collection(
         self,
@@ -947,44 +1384,65 @@ class QdrantDocumentStore:
         :param sparse_idf:
            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
         """
-
-        on_disk
+        vectors_config, sparse_vectors_config = self._prepare_collection_config(
+            embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
+        )
+        collection_params = self._prepare_collection_params()

-
-
+        self._initialize_client()
+        assert self._client is not None

-
-
+        if self._client.collection_exists(collection_name):
+            self._client.delete_collection(collection_name)

-
-
-        vectors_config
+        self._client.create_collection(
+            collection_name=collection_name,
+            vectors_config=vectors_config,
+            sparse_vectors_config=sparse_vectors_config,
+            **collection_params,
+        )

-
-
-
-
-
-
-
-
+    async def recreate_collection_async(
+        self,
+        collection_name: str,
+        distance,
+        embedding_dim: int,
+        on_disk: Optional[bool] = None,
+        use_sparse_embeddings: Optional[bool] = None,
+        sparse_idf: bool = False,
+    ):
+        """
+        Asynchronously recreates the Qdrant collection with the specified parameters.
+
+        :param collection_name:
+            The name of the collection to recreate.
+        :param distance:
+            The distance metric to use for the collection.
+        :param embedding_dim:
+            The dimension of the embeddings.
+        :param on_disk:
+            Whether to store the collection on disk.
+        :param use_sparse_embeddings:
+            Whether to use sparse embeddings.
+        :param sparse_idf:
+            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
+        """
+        vectors_config, sparse_vectors_config = self._prepare_collection_config(
+            embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
+        )
+        collection_params = self._prepare_collection_params()
+
+        await self._initialize_async_client()
+        assert self._async_client is not None

-        if self.
-            self.
+        if await self._async_client.collection_exists(collection_name):
+            await self._async_client.delete_collection(collection_name)

-        self.
+        await self._async_client.create_collection(
             collection_name=collection_name,
             vectors_config=vectors_config,
-            sparse_vectors_config=sparse_vectors_config
-
-            replication_factor=self.replication_factor,
-            write_consistency_factor=self.write_consistency_factor,
-            on_disk_payload=self.on_disk_payload,
-            hnsw_config=self.hnsw_config,
-            optimizers_config=self.optimizers_config,
-            wal_config=self.wal_config,
-            quantization_config=self.quantization_config,
-            init_from=self.init_from,
+            sparse_vectors_config=sparse_vectors_config,
+            **collection_params,
         )

     def _handle_duplicate_documents(
@@ -1014,12 +1472,38 @@ class QdrantDocumentStore:

         return documents

-    def
+    async def _handle_duplicate_documents_async(
+        self,
+        documents: List[Document],
+        policy: DuplicatePolicy = None,
+    ):
         """
-
+        Asynchronously checks whether any of the passed documents is already existing
+        in the chosen index and returns a list of
+        documents that are not in the index yet.

         :param documents: A list of Haystack Document objects.
+        :param policy: The duplicate policy to use when writing documents.
         :returns: A list of Haystack Document objects.
+        """
+
+        if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
+            documents = self._drop_duplicate_documents(documents)
+            documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
+            ids_exist_in_db: List[str] = [doc.id for doc in documents_found]
+
+            if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
+                msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
+                raise DuplicateDocumentError(msg)
+
+            documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
+
+        return documents
+
+    def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document]:
+        """
+        Drop duplicate documents based on same hash ID.
+
         """
         _hash_ids: Set = set()
         _documents: List[Document] = []
@@ -1027,12 +1511,202 @@ class QdrantDocumentStore:
         for document in documents:
             if document.id in _hash_ids:
                 logger.info(
-                    "Duplicate Documents: Document with id '
-                    document.id,
-                    self.index,
+                    "Duplicate Documents: Document with id '{document_id}' already exists in index '{index}'",
+                    document_id=document.id,
+                    index=self.index,
                 )
                 continue
             _documents.append(document)
             _hash_ids.add(document.id)

         return _documents
+
+    def _prepare_collection_params(self):
+        """
+        Prepares the common parameters for collection creation.
+        """
+        return {
+            "shard_number": self.shard_number,
+            "replication_factor": self.replication_factor,
+            "write_consistency_factor": self.write_consistency_factor,
+            "on_disk_payload": self.on_disk_payload,
+            "hnsw_config": self.hnsw_config,
+            "optimizers_config": self.optimizers_config,
+            "wal_config": self.wal_config,
+            "quantization_config": self.quantization_config,
+            "init_from": self.init_from,
+        }
+
+    def _prepare_client_params(self):
+        """
+        Prepares the common parameters for client initialization.
+
+        """
+        return {
+            "location": self.location,
+            "url": self.url,
+            "port": self.port,
+            "grpc_port": self.grpc_port,
+            "prefer_grpc": self.prefer_grpc,
+            "https": self.https,
+            "api_key": self.api_key.resolve_value() if self.api_key else None,
+            "prefix": self.prefix,
+            "timeout": self.timeout,
+            "host": self.host,
+            "path": self.path,
+            "metadata": self.metadata,
+            "force_disable_check_same_thread": self.force_disable_check_same_thread,
+        }
+
+    def _prepare_collection_config(
+        self,
+        embedding_dim: int,
+        distance,
+        on_disk: Optional[bool] = None,
+        use_sparse_embeddings: Optional[bool] = None,
+        sparse_idf: bool = False,
+    ):
+        """
+        Prepares the configuration for creating or recreating a Qdrant collection.
+
+        """
+        if on_disk is None:
+            on_disk = self.on_disk
+
+        if use_sparse_embeddings is None:
+            use_sparse_embeddings = self.use_sparse_embeddings
+
+        # dense vectors configuration
+        vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
+        sparse_vectors_config = None
+
+        if use_sparse_embeddings:
+            # in this case, we need to define named vectors
+            vectors_config = {DENSE_VECTORS_NAME: vectors_config}
+
+            sparse_vectors_config = {
+                SPARSE_VECTORS_NAME: rest.SparseVectorParams(
+                    index=rest.SparseIndexParams(
+                        on_disk=on_disk,
+                    ),
+                    modifier=rest.Modifier.IDF if sparse_idf else None,
+                ),
+            }
+
+        return vectors_config, sparse_vectors_config
+
+    def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None):
+        """
+        Validates the filters provided for querying.
+        """
+        if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
+            msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
+            raise ValueError(msg)
+
+        if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
+            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
+            raise ValueError(msg)
+
+    def _process_query_point_results(self, results, scale_score: bool = False):
+        """
+        Processes query results from Qdrant.
+        """
+        documents = [
+            convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+            for point in results
+        ]
+
+        if scale_score:
+            for document in documents:
+                score = document.score
+                if self.similarity == "cosine":
+                    score = (score + 1) / 2
+                else:
+                    score = float(1 / (1 + np.exp(-score / 100)))
+                document.score = score
+
+        return documents
+
+    def _process_group_results(self, groups):
+        """
+        Processes grouped query results from Qdrant.
+
+        """
+        if not groups:
+            return []
+
+        return [
+            convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
+            for group in groups
+            for point in group.hits
+        ]
+
+    def _validate_collection_compatibility(
+        self,
+        collection_name: str,
+        collection_info,
+        distance,
+        embedding_dim: int,
+    ):
+        """
+        Validates that an existing collection is compatible with the current configuration.
+        """
+        has_named_vectors = isinstance(collection_info.config.params.vectors, dict)
+
+        if has_named_vectors and DENSE_VECTORS_NAME not in collection_info.config.params.vectors:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it has been originally created outside of Haystack and is not supported. "
+                f"If possible, you should create a new Document Store with Haystack. "
+                f"In case you want to migrate the existing collection, see an example script in "
+                f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
+                f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
+            )
+            raise QdrantStoreError(msg)
+
+        if self.use_sparse_embeddings and not has_named_vectors:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it has been originally created without sparse embedding vectors. "
+                f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
+                f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
+                f"See `migrate_to_sparse_embeddings_support` function in "
+                f"`haystack_integrations.document_stores.qdrant`."
+            )
+            raise QdrantStoreError(msg)
+
+        if not self.use_sparse_embeddings and has_named_vectors:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it has been originally created with sparse embedding vectors."
+                f"If you want to use that collection, please set `use_sparse_embeddings=True`."
+            )
+            raise QdrantStoreError(msg)
+
+        # Get current distance and vector size based on collection configuration
+        if self.use_sparse_embeddings:
+            current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance
+            current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
+        else:
+            current_distance = collection_info.config.params.vectors.distance
+            current_vector_size = collection_info.config.params.vectors.size
+
+        # Validate distance metric
+        if current_distance != distance:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it is configured with a similarity '{current_distance.name}'. "
+                f"If you want to use that collection, but with a different "
+                f"similarity, please set `recreate_collection=True` argument."
+            )
+            raise ValueError(msg)
+
+        # Validate embedding dimension
+        if current_vector_size != embedding_dim:
+            msg = (
+                f"Collection '{collection_name}' already exists in Qdrant, "
+                f"but it is configured with a vector size '{current_vector_size}'. "
+                f"If you want to use that collection, but with a different "
+                f"vector size, please set `recreate_collection=True` argument."
+            )
+            raise ValueError(msg)