qdrant-haystack 9.1.1__py3-none-any.whl → 10.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_integrations/components/retrievers/py.typed +0 -0
- haystack_integrations/components/retrievers/qdrant/retriever.py +158 -87
- haystack_integrations/document_stores/py.typed +0 -0
- haystack_integrations/document_stores/qdrant/converters.py +13 -12
- haystack_integrations/document_stores/qdrant/document_store.py +945 -171
- haystack_integrations/document_stores/qdrant/filters.py +87 -168
- haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +11 -7
- {qdrant_haystack-9.1.1.dist-info → qdrant_haystack-10.2.0.dist-info}/METADATA +9 -25
- qdrant_haystack-10.2.0.dist-info/RECORD +13 -0
- {qdrant_haystack-9.1.1.dist-info → qdrant_haystack-10.2.0.dist-info}/WHEEL +1 -1
- qdrant_haystack-9.1.1.dist-info/RECORD +0 -11
- {qdrant_haystack-9.1.1.dist-info → qdrant_haystack-10.2.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import inspect
|
|
2
|
+
from collections.abc import AsyncGenerator, Generator
|
|
2
3
|
from itertools import islice
|
|
3
|
-
from typing import Any,
|
|
4
|
+
from typing import Any, ClassVar, cast
|
|
4
5
|
|
|
5
|
-
import numpy as np
|
|
6
6
|
import qdrant_client
|
|
7
7
|
from haystack import default_from_dict, default_to_dict, logging
|
|
8
8
|
from haystack.dataclasses import Document
|
|
@@ -10,7 +10,7 @@ from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
|
10
10
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
11
11
|
from haystack.document_stores.types import DuplicatePolicy
|
|
12
12
|
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
13
|
-
from
|
|
13
|
+
from numpy import exp
|
|
14
14
|
from qdrant_client.http import models as rest
|
|
15
15
|
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
16
16
|
from tqdm import tqdm
|
|
@@ -26,15 +26,21 @@ from .filters import convert_filters_to_qdrant
|
|
|
26
26
|
|
|
27
27
|
logger = logging.getLogger(__name__)
|
|
28
28
|
|
|
29
|
+
# Default group size to apply when using group_by
|
|
30
|
+
# - Our methods use None as the default for optional group_size parameter.
|
|
31
|
+
# - Qdrant expects an integer and internally defaults to 3 when performing grouped queries.
|
|
32
|
+
# - When group_by is specified but group_size is None, we use this value instead of passing None.
|
|
33
|
+
DEFAULT_GROUP_SIZE = 3
|
|
34
|
+
|
|
29
35
|
|
|
30
36
|
class QdrantStoreError(DocumentStoreError):
|
|
31
37
|
pass
|
|
32
38
|
|
|
33
39
|
|
|
34
|
-
FilterType =
|
|
40
|
+
FilterType = dict[str, dict[str, Any] | list[Any] | str | int | float | bool]
|
|
35
41
|
|
|
36
42
|
|
|
37
|
-
def get_batches_from_generator(iterable, n):
|
|
43
|
+
def get_batches_from_generator(iterable: list, n: int) -> Generator:
|
|
38
44
|
"""
|
|
39
45
|
Batch elements of an iterable into fixed-length chunks or blocks.
|
|
40
46
|
"""
|
|
@@ -47,9 +53,8 @@ def get_batches_from_generator(iterable, n):
|
|
|
47
53
|
|
|
48
54
|
class QdrantDocumentStore:
|
|
49
55
|
"""
|
|
50
|
-
A QdrantDocumentStore implementation that you
|
|
51
|
-
|
|
52
|
-
and Qdrant Cloud Cluster deployments.
|
|
56
|
+
A QdrantDocumentStore implementation that you can use with any Qdrant instance: in-memory, disk-persisted,
|
|
57
|
+
Docker-based, and Qdrant Cloud Cluster deployments.
|
|
53
58
|
|
|
54
59
|
Usage example by creating an in-memory instance:
|
|
55
60
|
|
|
@@ -59,7 +64,8 @@ class QdrantDocumentStore:
|
|
|
59
64
|
|
|
60
65
|
document_store = QdrantDocumentStore(
|
|
61
66
|
":memory:",
|
|
62
|
-
recreate_index=True
|
|
67
|
+
recreate_index=True,
|
|
68
|
+
embedding_dim=5
|
|
63
69
|
)
|
|
64
70
|
document_store.write_documents([
|
|
65
71
|
Document(content="This is first", embedding=[0.0]*5),
|
|
@@ -84,7 +90,7 @@ class QdrantDocumentStore:
|
|
|
84
90
|
```
|
|
85
91
|
"""
|
|
86
92
|
|
|
87
|
-
SIMILARITY: ClassVar[
|
|
93
|
+
SIMILARITY: ClassVar[dict[str, rest.Distance]] = {
|
|
88
94
|
"cosine": rest.Distance.COSINE,
|
|
89
95
|
"dot_product": rest.Distance.DOT,
|
|
90
96
|
"l2": rest.Distance.EUCLID,
|
|
@@ -92,17 +98,17 @@ class QdrantDocumentStore:
|
|
|
92
98
|
|
|
93
99
|
def __init__(
|
|
94
100
|
self,
|
|
95
|
-
location:
|
|
96
|
-
url:
|
|
101
|
+
location: str | None = None,
|
|
102
|
+
url: str | None = None,
|
|
97
103
|
port: int = 6333,
|
|
98
104
|
grpc_port: int = 6334,
|
|
99
105
|
prefer_grpc: bool = False,
|
|
100
|
-
https:
|
|
101
|
-
api_key:
|
|
102
|
-
prefix:
|
|
103
|
-
timeout:
|
|
104
|
-
host:
|
|
105
|
-
path:
|
|
106
|
+
https: bool | None = None,
|
|
107
|
+
api_key: Secret | None = None,
|
|
108
|
+
prefix: str | None = None,
|
|
109
|
+
timeout: int | None = None,
|
|
110
|
+
host: str | None = None,
|
|
111
|
+
path: str | None = None,
|
|
106
112
|
force_disable_check_same_thread: bool = False,
|
|
107
113
|
index: str = "Document",
|
|
108
114
|
embedding_dim: int = 768,
|
|
@@ -113,24 +119,25 @@ class QdrantDocumentStore:
|
|
|
113
119
|
return_embedding: bool = False,
|
|
114
120
|
progress_bar: bool = True,
|
|
115
121
|
recreate_index: bool = False,
|
|
116
|
-
shard_number:
|
|
117
|
-
replication_factor:
|
|
118
|
-
write_consistency_factor:
|
|
119
|
-
on_disk_payload:
|
|
120
|
-
hnsw_config:
|
|
121
|
-
optimizers_config:
|
|
122
|
-
wal_config:
|
|
123
|
-
quantization_config:
|
|
124
|
-
init_from: Optional[dict] = None,
|
|
122
|
+
shard_number: int | None = None,
|
|
123
|
+
replication_factor: int | None = None,
|
|
124
|
+
write_consistency_factor: int | None = None,
|
|
125
|
+
on_disk_payload: bool | None = None,
|
|
126
|
+
hnsw_config: dict | None = None,
|
|
127
|
+
optimizers_config: dict | None = None,
|
|
128
|
+
wal_config: dict | None = None,
|
|
129
|
+
quantization_config: dict | None = None,
|
|
125
130
|
wait_result_from_api: bool = True,
|
|
126
|
-
metadata:
|
|
131
|
+
metadata: dict | None = None,
|
|
127
132
|
write_batch_size: int = 100,
|
|
128
133
|
scroll_size: int = 10_000,
|
|
129
|
-
payload_fields_to_index:
|
|
130
|
-
):
|
|
134
|
+
payload_fields_to_index: list[dict] | None = None,
|
|
135
|
+
) -> None:
|
|
131
136
|
"""
|
|
137
|
+
Initializes a QdrantDocumentStore.
|
|
138
|
+
|
|
132
139
|
:param location:
|
|
133
|
-
If `memory` - use in-memory Qdrant instance.
|
|
140
|
+
If `":memory:"` - use in-memory Qdrant instance.
|
|
134
141
|
If `str` - use it as a URL parameter.
|
|
135
142
|
If `None` - use default values for host and port.
|
|
136
143
|
:param url:
|
|
@@ -164,7 +171,7 @@ class QdrantDocumentStore:
|
|
|
164
171
|
Dimension of the embeddings.
|
|
165
172
|
:param on_disk:
|
|
166
173
|
Whether to store the collection on disk.
|
|
167
|
-
:param
|
|
174
|
+
:param use_sparse_embeddings:
|
|
168
175
|
If set to `True`, enables support for sparse embeddings.
|
|
169
176
|
:param sparse_idf:
|
|
170
177
|
If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
|
|
@@ -201,8 +208,6 @@ class QdrantDocumentStore:
|
|
|
201
208
|
Params for Write-Ahead-Log.
|
|
202
209
|
:param quantization_config:
|
|
203
210
|
Params for quantization. If `None`, quantization will be disabled.
|
|
204
|
-
:param init_from:
|
|
205
|
-
Use data stored in another collection to initialize this collection.
|
|
206
211
|
:param wait_result_from_api:
|
|
207
212
|
Whether to wait for the result from the API after each request.
|
|
208
213
|
:param metadata:
|
|
@@ -215,8 +220,8 @@ class QdrantDocumentStore:
|
|
|
215
220
|
List of payload fields to index.
|
|
216
221
|
"""
|
|
217
222
|
|
|
218
|
-
self._client = None
|
|
219
|
-
self._async_client = None
|
|
223
|
+
self._client: qdrant_client.QdrantClient | None = None
|
|
224
|
+
self._async_client: qdrant_client.AsyncQdrantClient | None = None
|
|
220
225
|
|
|
221
226
|
# Store the Qdrant client specific attributes
|
|
222
227
|
self.location = location
|
|
@@ -232,7 +237,6 @@ class QdrantDocumentStore:
|
|
|
232
237
|
self.path = path
|
|
233
238
|
self.force_disable_check_same_thread = force_disable_check_same_thread
|
|
234
239
|
self.metadata = metadata or {}
|
|
235
|
-
self.api_key = api_key
|
|
236
240
|
|
|
237
241
|
# Store the Qdrant collection specific attributes
|
|
238
242
|
self.shard_number = shard_number
|
|
@@ -243,7 +247,6 @@ class QdrantDocumentStore:
|
|
|
243
247
|
self.optimizers_config = optimizers_config
|
|
244
248
|
self.wal_config = wal_config
|
|
245
249
|
self.quantization_config = quantization_config
|
|
246
|
-
self.init_from = init_from
|
|
247
250
|
self.wait_result_from_api = wait_result_from_api
|
|
248
251
|
self.recreate_index = recreate_index
|
|
249
252
|
self.payload_fields_to_index = payload_fields_to_index
|
|
@@ -258,9 +261,10 @@ class QdrantDocumentStore:
|
|
|
258
261
|
self.write_batch_size = write_batch_size
|
|
259
262
|
self.scroll_size = scroll_size
|
|
260
263
|
|
|
261
|
-
def _initialize_client(self):
|
|
264
|
+
def _initialize_client(self) -> None:
|
|
262
265
|
if self._client is None:
|
|
263
266
|
client_params = self._prepare_client_params()
|
|
267
|
+
# This step adds the api-key and User-Agent to metadata
|
|
264
268
|
self._client = qdrant_client.QdrantClient(**client_params)
|
|
265
269
|
# Make sure the collection is properly set up
|
|
266
270
|
self._set_up_collection(
|
|
@@ -274,7 +278,7 @@ class QdrantDocumentStore:
|
|
|
274
278
|
self.payload_fields_to_index,
|
|
275
279
|
)
|
|
276
280
|
|
|
277
|
-
async def _initialize_async_client(self):
|
|
281
|
+
async def _initialize_async_client(self) -> None:
|
|
278
282
|
"""
|
|
279
283
|
Returns the asynchronous Qdrant client, initializing it if necessary.
|
|
280
284
|
"""
|
|
@@ -330,8 +334,8 @@ class QdrantDocumentStore:
|
|
|
330
334
|
|
|
331
335
|
def filter_documents(
|
|
332
336
|
self,
|
|
333
|
-
filters:
|
|
334
|
-
) ->
|
|
337
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
338
|
+
) -> list[Document]:
|
|
335
339
|
"""
|
|
336
340
|
Returns the documents that match the provided filters.
|
|
337
341
|
|
|
@@ -344,7 +348,7 @@ class QdrantDocumentStore:
|
|
|
344
348
|
# No need to initialize client here as _get_documents_generator
|
|
345
349
|
# will handle client initialization internally
|
|
346
350
|
|
|
347
|
-
|
|
351
|
+
QdrantDocumentStore._validate_filters(filters)
|
|
348
352
|
return list(
|
|
349
353
|
self._get_documents_generator(
|
|
350
354
|
filters,
|
|
@@ -353,20 +357,20 @@ class QdrantDocumentStore:
|
|
|
353
357
|
|
|
354
358
|
async def filter_documents_async(
|
|
355
359
|
self,
|
|
356
|
-
filters:
|
|
357
|
-
) ->
|
|
360
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
361
|
+
) -> list[Document]:
|
|
358
362
|
"""
|
|
359
363
|
Asynchronously returns the documents that match the provided filters.
|
|
360
364
|
"""
|
|
361
365
|
# No need to initialize client here as _get_documents_generator_async
|
|
362
366
|
# will handle client initialization internally
|
|
363
367
|
|
|
364
|
-
|
|
368
|
+
QdrantDocumentStore._validate_filters(filters)
|
|
365
369
|
return [doc async for doc in self._get_documents_generator_async(filters)]
|
|
366
370
|
|
|
367
371
|
def write_documents(
|
|
368
372
|
self,
|
|
369
|
-
documents:
|
|
373
|
+
documents: list[Document],
|
|
370
374
|
policy: DuplicatePolicy = DuplicatePolicy.FAIL,
|
|
371
375
|
) -> int:
|
|
372
376
|
"""
|
|
@@ -419,7 +423,7 @@ class QdrantDocumentStore:
|
|
|
419
423
|
|
|
420
424
|
async def write_documents_async(
|
|
421
425
|
self,
|
|
422
|
-
documents:
|
|
426
|
+
documents: list[Document],
|
|
423
427
|
policy: DuplicatePolicy = DuplicatePolicy.FAIL,
|
|
424
428
|
) -> int:
|
|
425
429
|
"""
|
|
@@ -471,7 +475,7 @@ class QdrantDocumentStore:
|
|
|
471
475
|
progress_bar.update(self.write_batch_size)
|
|
472
476
|
return len(document_objects)
|
|
473
477
|
|
|
474
|
-
def delete_documents(self, document_ids:
|
|
478
|
+
def delete_documents(self, document_ids: list[str]) -> None:
|
|
475
479
|
"""
|
|
476
480
|
Deletes documents that match the provided `document_ids` from the document store.
|
|
477
481
|
|
|
@@ -481,11 +485,10 @@ class QdrantDocumentStore:
|
|
|
481
485
|
self._initialize_client()
|
|
482
486
|
assert self._client is not None
|
|
483
487
|
|
|
484
|
-
ids = [convert_id(_id) for _id in document_ids]
|
|
485
488
|
try:
|
|
486
489
|
self._client.delete(
|
|
487
490
|
collection_name=self.index,
|
|
488
|
-
points_selector=
|
|
491
|
+
points_selector=rest.PointIdsList(points=[convert_id(_id) for _id in document_ids]),
|
|
489
492
|
wait=self.wait_result_from_api,
|
|
490
493
|
)
|
|
491
494
|
except KeyError:
|
|
@@ -493,7 +496,7 @@ class QdrantDocumentStore:
|
|
|
493
496
|
"Called QdrantDocumentStore.delete_documents() on a non-existing ID",
|
|
494
497
|
)
|
|
495
498
|
|
|
496
|
-
async def delete_documents_async(self, document_ids:
|
|
499
|
+
async def delete_documents_async(self, document_ids: list[str]) -> None:
|
|
497
500
|
"""
|
|
498
501
|
Asynchronously deletes documents that match the provided `document_ids` from the document store.
|
|
499
502
|
|
|
@@ -503,11 +506,10 @@ class QdrantDocumentStore:
|
|
|
503
506
|
await self._initialize_async_client()
|
|
504
507
|
assert self._async_client is not None
|
|
505
508
|
|
|
506
|
-
ids = [convert_id(_id) for _id in document_ids]
|
|
507
509
|
try:
|
|
508
510
|
await self._async_client.delete(
|
|
509
511
|
collection_name=self.index,
|
|
510
|
-
points_selector=
|
|
512
|
+
points_selector=rest.PointIdsList(points=[convert_id(_id) for _id in document_ids]),
|
|
511
513
|
wait=self.wait_result_from_api,
|
|
512
514
|
)
|
|
513
515
|
except KeyError:
|
|
@@ -515,8 +517,748 @@ class QdrantDocumentStore:
|
|
|
515
517
|
"Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
|
|
516
518
|
)
|
|
517
519
|
|
|
520
|
+
def delete_by_filter(self, filters: dict[str, Any]) -> int:
|
|
521
|
+
"""
|
|
522
|
+
Deletes all documents that match the provided filters.
|
|
523
|
+
|
|
524
|
+
:param filters: The filters to apply to select documents for deletion.
|
|
525
|
+
For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
|
|
526
|
+
|
|
527
|
+
:returns:
|
|
528
|
+
The number of documents deleted.
|
|
529
|
+
"""
|
|
530
|
+
self._initialize_client()
|
|
531
|
+
assert self._client is not None
|
|
532
|
+
|
|
533
|
+
try:
|
|
534
|
+
qdrant_filter = convert_filters_to_qdrant(filters)
|
|
535
|
+
if qdrant_filter is None:
|
|
536
|
+
return 0
|
|
537
|
+
|
|
538
|
+
count_response = self._client.count(
|
|
539
|
+
collection_name=self.index,
|
|
540
|
+
count_filter=qdrant_filter,
|
|
541
|
+
)
|
|
542
|
+
deleted_count = count_response.count
|
|
543
|
+
|
|
544
|
+
self._client.delete(
|
|
545
|
+
collection_name=self.index,
|
|
546
|
+
points_selector=rest.FilterSelector(filter=qdrant_filter),
|
|
547
|
+
wait=self.wait_result_from_api,
|
|
548
|
+
)
|
|
549
|
+
return deleted_count
|
|
550
|
+
|
|
551
|
+
except Exception as e:
|
|
552
|
+
msg = f"Failed to delete documents by filter from Qdrant: {e!s}"
|
|
553
|
+
raise QdrantStoreError(msg) from e
|
|
554
|
+
|
|
555
|
+
async def delete_by_filter_async(self, filters: dict[str, Any]) -> int:
|
|
556
|
+
"""
|
|
557
|
+
Asynchronously deletes all documents that match the provided filters.
|
|
558
|
+
|
|
559
|
+
:param filters: The filters to apply to select documents for deletion.
|
|
560
|
+
For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
|
|
561
|
+
|
|
562
|
+
:returns:
|
|
563
|
+
The number of documents deleted.
|
|
564
|
+
"""
|
|
565
|
+
await self._initialize_async_client()
|
|
566
|
+
assert self._async_client is not None
|
|
567
|
+
|
|
568
|
+
try:
|
|
569
|
+
qdrant_filter = convert_filters_to_qdrant(filters)
|
|
570
|
+
if qdrant_filter is None:
|
|
571
|
+
return 0
|
|
572
|
+
|
|
573
|
+
count_response = await self._async_client.count(
|
|
574
|
+
collection_name=self.index,
|
|
575
|
+
count_filter=qdrant_filter,
|
|
576
|
+
)
|
|
577
|
+
deleted_count = count_response.count
|
|
578
|
+
|
|
579
|
+
await self._async_client.delete(
|
|
580
|
+
collection_name=self.index,
|
|
581
|
+
points_selector=rest.FilterSelector(filter=qdrant_filter),
|
|
582
|
+
wait=self.wait_result_from_api,
|
|
583
|
+
)
|
|
584
|
+
return deleted_count
|
|
585
|
+
|
|
586
|
+
except Exception as e:
|
|
587
|
+
msg = f"Failed to delete documents by filter from Qdrant: {e!s}"
|
|
588
|
+
raise QdrantStoreError(msg) from e
|
|
589
|
+
|
|
590
|
+
@staticmethod
|
|
591
|
+
def _check_stop_scrolling(next_offset: Any) -> bool:
|
|
592
|
+
"""
|
|
593
|
+
Checks if scrolling should stop based on the next_offset value.
|
|
594
|
+
|
|
595
|
+
:param next_offset: The offset returned from the scroll operation.
|
|
596
|
+
:returns: True if scrolling should stop, False otherwise.
|
|
597
|
+
"""
|
|
598
|
+
return next_offset is None or (
|
|
599
|
+
hasattr(next_offset, "num")
|
|
600
|
+
and hasattr(next_offset, "uuid")
|
|
601
|
+
and next_offset.num == 0
|
|
602
|
+
and next_offset.uuid == ""
|
|
603
|
+
)
|
|
604
|
+
|
|
605
|
+
@staticmethod
|
|
606
|
+
def _metadata_fields_info_from_schema(payload_schema: dict[str, Any]) -> dict[str, str]:
|
|
607
|
+
"""Build field name -> type dict from Qdrant payload_schema. Used by get_metadata_fields_info (sync/async)."""
|
|
608
|
+
fields_info: dict[str, str] = {}
|
|
609
|
+
for field_name, field_config in payload_schema.items():
|
|
610
|
+
if hasattr(field_config, "data_type"):
|
|
611
|
+
fields_info[field_name] = str(field_config.data_type)
|
|
612
|
+
else:
|
|
613
|
+
fields_info[field_name] = "unknown"
|
|
614
|
+
return fields_info
|
|
615
|
+
|
|
616
|
+
@staticmethod
|
|
617
|
+
def _process_records_min_max(
|
|
618
|
+
records: list[Any], metadata_field: str, min_value: Any, max_value: Any
|
|
619
|
+
) -> tuple[Any, Any]:
|
|
620
|
+
"""Update min/max from a batch of Qdrant records. Used by get_metadata_field_min_max (sync/async)."""
|
|
621
|
+
for record in records:
|
|
622
|
+
if record.payload and "meta" in record.payload:
|
|
623
|
+
meta = record.payload["meta"]
|
|
624
|
+
if metadata_field in meta:
|
|
625
|
+
value = meta[metadata_field]
|
|
626
|
+
if value is not None:
|
|
627
|
+
if min_value is None or value < min_value:
|
|
628
|
+
min_value = value
|
|
629
|
+
if max_value is None or value > max_value:
|
|
630
|
+
max_value = value
|
|
631
|
+
return min_value, max_value
|
|
632
|
+
|
|
633
|
+
@staticmethod
|
|
634
|
+
def _process_records_count_unique(
|
|
635
|
+
records: list[Any], metadata_fields: list[str], unique_values_by_field: dict[str, set[Any]]
|
|
636
|
+
) -> None:
|
|
637
|
+
"""
|
|
638
|
+
Update unique_values_by_field from a batch of Qdrant records.
|
|
639
|
+
|
|
640
|
+
Used by count_unique_metadata_by_filter (sync/async).
|
|
641
|
+
"""
|
|
642
|
+
for record in records:
|
|
643
|
+
if record.payload and "meta" in record.payload:
|
|
644
|
+
meta = record.payload["meta"]
|
|
645
|
+
for field in metadata_fields:
|
|
646
|
+
if field in meta:
|
|
647
|
+
value = meta[field]
|
|
648
|
+
if value is not None:
|
|
649
|
+
if isinstance(value, (list, dict)):
|
|
650
|
+
unique_values_by_field[field].add(str(value))
|
|
651
|
+
else:
|
|
652
|
+
unique_values_by_field[field].add(value)
|
|
653
|
+
|
|
654
|
+
@staticmethod
|
|
655
|
+
def _process_records_unique_values(
|
|
656
|
+
records: list[Any],
|
|
657
|
+
metadata_field: str,
|
|
658
|
+
unique_values: list[Any],
|
|
659
|
+
unique_values_set: set[Any],
|
|
660
|
+
offset: int,
|
|
661
|
+
limit: int,
|
|
662
|
+
) -> bool:
|
|
663
|
+
"""Collect unique values from a batch of records. Returns True when len(unique_values) >= offset + limit."""
|
|
664
|
+
for record in records:
|
|
665
|
+
if record.payload and "meta" in record.payload:
|
|
666
|
+
meta = record.payload["meta"]
|
|
667
|
+
if metadata_field in meta:
|
|
668
|
+
value = meta[metadata_field]
|
|
669
|
+
if value is not None:
|
|
670
|
+
hashable_value = str(value) if isinstance(value, (list, dict)) else value
|
|
671
|
+
if hashable_value not in unique_values_set:
|
|
672
|
+
unique_values_set.add(hashable_value)
|
|
673
|
+
unique_values.append(value)
|
|
674
|
+
if len(unique_values) >= offset + limit:
|
|
675
|
+
return True
|
|
676
|
+
return False
|
|
677
|
+
|
|
678
|
+
@staticmethod
|
|
679
|
+
def _create_updated_point_from_record(record: Any, meta: dict[str, Any]) -> rest.PointStruct:
|
|
680
|
+
"""
|
|
681
|
+
Creates an updated PointStruct from a Qdrant record with merged metadata.
|
|
682
|
+
|
|
683
|
+
:param record: The Qdrant record to update.
|
|
684
|
+
:param meta: The metadata fields to merge with existing metadata.
|
|
685
|
+
:returns: A PointStruct with updated metadata and preserved vectors.
|
|
686
|
+
"""
|
|
687
|
+
# merge existing payload with new metadata
|
|
688
|
+
# Metadata is stored under the "meta" key in the payload
|
|
689
|
+
updated_payload = dict(record.payload or {})
|
|
690
|
+
if "meta" not in updated_payload:
|
|
691
|
+
updated_payload["meta"] = {}
|
|
692
|
+
updated_payload["meta"].update(meta)
|
|
693
|
+
|
|
694
|
+
# create updated point preserving vectors
|
|
695
|
+
# Type cast needed because record.vector type doesn't include all PointStruct vector types
|
|
696
|
+
vector_value = record.vector if record.vector is not None else {}
|
|
697
|
+
return rest.PointStruct(
|
|
698
|
+
id=record.id,
|
|
699
|
+
vector=cast(Any, vector_value),
|
|
700
|
+
payload=updated_payload,
|
|
701
|
+
)
|
|
702
|
+
|
|
703
|
+
def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
|
|
704
|
+
"""
|
|
705
|
+
Updates the metadata of all documents that match the provided filters.
|
|
706
|
+
|
|
707
|
+
**Note**: This operation is not atomic. Documents matching the filter are fetched first,
|
|
708
|
+
then updated. If documents are modified between the fetch and update operations,
|
|
709
|
+
those changes may be lost.
|
|
710
|
+
|
|
711
|
+
:param filters: The filters to apply to select documents for updating.
|
|
712
|
+
For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
|
|
713
|
+
:param meta: The metadata fields to update. This will be merged with existing metadata.
|
|
714
|
+
|
|
715
|
+
:returns:
|
|
716
|
+
The number of documents updated.
|
|
717
|
+
"""
|
|
718
|
+
self._initialize_client()
|
|
719
|
+
assert self._client is not None
|
|
720
|
+
|
|
721
|
+
try:
|
|
722
|
+
qdrant_filter = convert_filters_to_qdrant(filters)
|
|
723
|
+
if qdrant_filter is None:
|
|
724
|
+
return 0
|
|
725
|
+
|
|
726
|
+
# get all matching documents using scroll
|
|
727
|
+
updated_points = []
|
|
728
|
+
next_offset = None
|
|
729
|
+
|
|
730
|
+
while True:
|
|
731
|
+
records, next_offset = self._client.scroll(
|
|
732
|
+
collection_name=self.index,
|
|
733
|
+
scroll_filter=qdrant_filter,
|
|
734
|
+
limit=self.scroll_size,
|
|
735
|
+
offset=next_offset,
|
|
736
|
+
with_payload=True,
|
|
737
|
+
with_vectors=True,
|
|
738
|
+
)
|
|
739
|
+
|
|
740
|
+
# update payload for each record
|
|
741
|
+
for record in records:
|
|
742
|
+
updated_points.append(self._create_updated_point_from_record(record, meta))
|
|
743
|
+
|
|
744
|
+
if self._check_stop_scrolling(next_offset):
|
|
745
|
+
break
|
|
746
|
+
|
|
747
|
+
if not updated_points:
|
|
748
|
+
return 0
|
|
749
|
+
|
|
750
|
+
# upsert updated points back in batches
|
|
751
|
+
for batch in get_batches_from_generator(updated_points, self.write_batch_size):
|
|
752
|
+
self._client.upsert(
|
|
753
|
+
collection_name=self.index,
|
|
754
|
+
points=list(batch),
|
|
755
|
+
wait=self.wait_result_from_api,
|
|
756
|
+
)
|
|
757
|
+
|
|
758
|
+
logger.info(
|
|
759
|
+
"Updated {n_docs} documents in collection '{name}' using filters.",
|
|
760
|
+
n_docs=len(updated_points),
|
|
761
|
+
name=self.index,
|
|
762
|
+
)
|
|
763
|
+
return len(updated_points)
|
|
764
|
+
except Exception as e:
|
|
765
|
+
msg = f"Failed to update documents by filter in Qdrant: {e!s}"
|
|
766
|
+
raise QdrantStoreError(msg) from e
|
|
767
|
+
|
|
768
|
+
async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
|
|
769
|
+
"""
|
|
770
|
+
Asynchronously updates the metadata of all documents that match the provided filters.
|
|
771
|
+
|
|
772
|
+
**Note**: This operation is not atomic. Documents matching the filter are fetched first,
|
|
773
|
+
then updated. If documents are modified between the fetch and update operations,
|
|
774
|
+
those changes may be lost.
|
|
775
|
+
|
|
776
|
+
:param filters: The filters to apply to select documents for updating.
|
|
777
|
+
For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
|
|
778
|
+
:param meta: The metadata fields to update. This will be merged with existing metadata.
|
|
779
|
+
|
|
780
|
+
:returns:
|
|
781
|
+
The number of documents updated.
|
|
782
|
+
"""
|
|
783
|
+
await self._initialize_async_client()
|
|
784
|
+
assert self._async_client is not None
|
|
785
|
+
|
|
786
|
+
try:
|
|
787
|
+
qdrant_filter = convert_filters_to_qdrant(filters)
|
|
788
|
+
if qdrant_filter is None:
|
|
789
|
+
return 0
|
|
790
|
+
|
|
791
|
+
updated_points = []
|
|
792
|
+
next_offset = None
|
|
793
|
+
|
|
794
|
+
while True:
|
|
795
|
+
records, next_offset = await self._async_client.scroll(
|
|
796
|
+
collection_name=self.index,
|
|
797
|
+
scroll_filter=qdrant_filter,
|
|
798
|
+
limit=self.scroll_size,
|
|
799
|
+
offset=next_offset,
|
|
800
|
+
with_payload=True,
|
|
801
|
+
with_vectors=True,
|
|
802
|
+
)
|
|
803
|
+
|
|
804
|
+
# update payload for each record
|
|
805
|
+
for record in records:
|
|
806
|
+
updated_points.append(self._create_updated_point_from_record(record, meta))
|
|
807
|
+
|
|
808
|
+
if self._check_stop_scrolling(next_offset):
|
|
809
|
+
break
|
|
810
|
+
|
|
811
|
+
if not updated_points:
|
|
812
|
+
return 0
|
|
813
|
+
|
|
814
|
+
# upsert updated points back in batches
|
|
815
|
+
for batch in get_batches_from_generator(updated_points, self.write_batch_size):
|
|
816
|
+
await self._async_client.upsert(
|
|
817
|
+
collection_name=self.index,
|
|
818
|
+
points=list(batch),
|
|
819
|
+
wait=self.wait_result_from_api,
|
|
820
|
+
)
|
|
821
|
+
|
|
822
|
+
logger.info(
|
|
823
|
+
"Updated {n_docs} documents in collection '{name}' using filters.",
|
|
824
|
+
n_docs=len(updated_points),
|
|
825
|
+
name=self.index,
|
|
826
|
+
)
|
|
827
|
+
return len(updated_points)
|
|
828
|
+
except Exception as e:
|
|
829
|
+
msg = f"Failed to update documents by filter in Qdrant: {e!s}"
|
|
830
|
+
raise QdrantStoreError(msg) from e
|
|
831
|
+
|
|
832
|
+
def delete_all_documents(self, recreate_index: bool = False) -> None:
|
|
833
|
+
"""
|
|
834
|
+
Deletes all documents from the document store.
|
|
835
|
+
|
|
836
|
+
:param recreate_index: Whether to recreate the index after deleting all documents.
|
|
837
|
+
"""
|
|
838
|
+
|
|
839
|
+
self._initialize_client()
|
|
840
|
+
assert self._client is not None
|
|
841
|
+
|
|
842
|
+
if recreate_index:
|
|
843
|
+
# get current collection config as json
|
|
844
|
+
collection_info = self._client.get_collection(collection_name=self.index)
|
|
845
|
+
info_json = collection_info.model_dump()
|
|
846
|
+
|
|
847
|
+
# deal with the Optional use_sparse_embeddings
|
|
848
|
+
sparse_vectors = info_json["config"]["params"]["sparse_vectors"]
|
|
849
|
+
use_sparse_embeddings = True if sparse_vectors else False
|
|
850
|
+
|
|
851
|
+
# deal with the Optional sparse_idf
|
|
852
|
+
hnsw_config = info_json["config"]["params"]["vectors"].get("config", {}).get("hnsw_config", None)
|
|
853
|
+
sparse_idf = True if use_sparse_embeddings and hnsw_config else False
|
|
854
|
+
|
|
855
|
+
# recreate collection
|
|
856
|
+
self._set_up_collection(
|
|
857
|
+
collection_name=self.index,
|
|
858
|
+
embedding_dim=info_json["config"]["params"]["vectors"]["size"],
|
|
859
|
+
recreate_collection=True,
|
|
860
|
+
similarity=info_json["config"]["params"]["vectors"]["distance"].lower(),
|
|
861
|
+
use_sparse_embeddings=use_sparse_embeddings,
|
|
862
|
+
sparse_idf=sparse_idf,
|
|
863
|
+
on_disk=info_json["config"]["hnsw_config"]["on_disk"],
|
|
864
|
+
payload_fields_to_index=info_json["payload_schema"],
|
|
865
|
+
)
|
|
866
|
+
|
|
867
|
+
else:
|
|
868
|
+
try:
|
|
869
|
+
self._client.delete(
|
|
870
|
+
collection_name=self.index,
|
|
871
|
+
points_selector=rest.FilterSelector(
|
|
872
|
+
filter=rest.Filter(
|
|
873
|
+
must=[],
|
|
874
|
+
)
|
|
875
|
+
),
|
|
876
|
+
wait=self.wait_result_from_api,
|
|
877
|
+
)
|
|
878
|
+
except Exception as e:
|
|
879
|
+
logger.warning(
|
|
880
|
+
f"Error {e} when calling QdrantDocumentStore.delete_all_documents()",
|
|
881
|
+
)
|
|
882
|
+
|
|
883
|
+
async def delete_all_documents_async(self, recreate_index: bool = False) -> None:
|
|
884
|
+
"""
|
|
885
|
+
Asynchronously deletes all documents from the document store.
|
|
886
|
+
|
|
887
|
+
:param recreate_index: Whether to recreate the index after deleting all documents.
|
|
888
|
+
"""
|
|
889
|
+
|
|
890
|
+
await self._initialize_async_client()
|
|
891
|
+
assert self._async_client is not None
|
|
892
|
+
|
|
893
|
+
if recreate_index:
|
|
894
|
+
# get current collection config as json
|
|
895
|
+
collection_info = await self._async_client.get_collection(collection_name=self.index)
|
|
896
|
+
info_json = collection_info.model_dump()
|
|
897
|
+
|
|
898
|
+
# deal with the Optional use_sparse_embeddings
|
|
899
|
+
sparse_vectors = info_json["config"]["params"]["sparse_vectors"]
|
|
900
|
+
use_sparse_embeddings = True if sparse_vectors else False
|
|
901
|
+
|
|
902
|
+
# deal with the Optional sparse_idf
|
|
903
|
+
hnsw_config = info_json["config"]["params"]["vectors"].get("config", {}).get("hnsw_config", None)
|
|
904
|
+
sparse_idf = True if use_sparse_embeddings and hnsw_config else False
|
|
905
|
+
|
|
906
|
+
# recreate collection
|
|
907
|
+
await self._set_up_collection_async(
|
|
908
|
+
collection_name=self.index,
|
|
909
|
+
embedding_dim=info_json["config"]["params"]["vectors"]["size"],
|
|
910
|
+
recreate_collection=True,
|
|
911
|
+
similarity=info_json["config"]["params"]["vectors"]["distance"].lower(),
|
|
912
|
+
use_sparse_embeddings=use_sparse_embeddings,
|
|
913
|
+
sparse_idf=sparse_idf,
|
|
914
|
+
on_disk=info_json["config"]["hnsw_config"]["on_disk"],
|
|
915
|
+
payload_fields_to_index=info_json["payload_schema"],
|
|
916
|
+
)
|
|
917
|
+
|
|
918
|
+
else:
|
|
919
|
+
try:
|
|
920
|
+
await self._async_client.delete(
|
|
921
|
+
collection_name=self.index,
|
|
922
|
+
points_selector=rest.FilterSelector(
|
|
923
|
+
filter=rest.Filter(
|
|
924
|
+
must=[],
|
|
925
|
+
)
|
|
926
|
+
),
|
|
927
|
+
wait=self.wait_result_from_api,
|
|
928
|
+
)
|
|
929
|
+
except Exception as e:
|
|
930
|
+
logger.warning(
|
|
931
|
+
f"Error {e} when calling QdrantDocumentStore.delete_all_documents_async()",
|
|
932
|
+
)
|
|
933
|
+
|
|
934
|
+
def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
|
|
935
|
+
"""
|
|
936
|
+
Returns the number of documents that match the provided filters.
|
|
937
|
+
|
|
938
|
+
:param filters: The filters to apply to count documents.
|
|
939
|
+
For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
|
|
940
|
+
|
|
941
|
+
:returns: The number of documents that match the filters.
|
|
942
|
+
"""
|
|
943
|
+
self._initialize_client()
|
|
944
|
+
assert self._client is not None
|
|
945
|
+
|
|
946
|
+
qdrant_filter = convert_filters_to_qdrant(filters)
|
|
947
|
+
try:
|
|
948
|
+
response = self._client.count(
|
|
949
|
+
collection_name=self.index,
|
|
950
|
+
count_filter=qdrant_filter,
|
|
951
|
+
)
|
|
952
|
+
return response.count
|
|
953
|
+
except (UnexpectedResponse, ValueError) as e:
|
|
954
|
+
logger.warning(f"Error {e} when calling QdrantDocumentStore.count_documents_by_filter()")
|
|
955
|
+
return 0
|
|
956
|
+
|
|
957
|
+
async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int:
    """
    Asynchronously counts the documents in the collection that satisfy the given filters.

    :param filters: The filters to apply to select documents for counting.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)

    :returns: The number of matching documents, or 0 if the count request fails.
    """
    await self._initialize_async_client()
    assert self._async_client is not None

    count_filter = convert_filters_to_qdrant(filters)
    try:
        result = await self._async_client.count(
            collection_name=self.index,
            count_filter=count_filter,
        )
    except (UnexpectedResponse, ValueError) as e:
        # Best-effort: log the failure and report zero instead of raising.
        logger.warning(f"Error {e} when calling QdrantDocumentStore.count_documents_by_filter_async()")
        return 0
    else:
        return result.count
|
|
980
|
+
|
|
981
|
+
def get_metadata_fields_info(self) -> dict[str, str]:
    """
    Reports the indexed payload fields of the collection and their types.

    :returns:
        A dictionary mapping field names to their types (e.g., {"field_name": "integer"}),
        or an empty dict if the collection info cannot be retrieved.
    """
    self._initialize_client()
    assert self._client is not None

    try:
        info = self._client.get_collection(self.index)
        # payload_schema may be None for collections without indexed payload fields.
        return self._metadata_fields_info_from_schema(info.payload_schema or {})
    except (UnexpectedResponse, ValueError) as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_fields_info()")
        return {}
|
|
998
|
+
|
|
999
|
+
async def get_metadata_fields_info_async(self) -> dict[str, str]:
    """
    Asynchronously reports the indexed payload fields of the collection and their types.

    :returns:
        A dictionary mapping field names to their types (e.g., {"field_name": "integer"}),
        or an empty dict if the collection info cannot be retrieved.
    """
    await self._initialize_async_client()
    assert self._async_client is not None

    try:
        info = await self._async_client.get_collection(self.index)
        # payload_schema may be None for collections without indexed payload fields.
        return self._metadata_fields_info_from_schema(info.payload_schema or {})
    except (UnexpectedResponse, ValueError) as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_fields_info_async()")
        return {}
|
|
1016
|
+
|
|
1017
|
+
def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
    """
    Scans the whole collection and returns the minimum and maximum values of a metadata field.

    :param metadata_field: The metadata field key (inside ``meta``) to get the minimum and maximum values for.

    :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
        metadata field across all documents. Returns an empty dict if no documents have the field.
    """
    self._initialize_client()
    assert self._client is not None

    try:
        lowest: Any = None
        highest: Any = None
        cursor = None

        # Page through every point; payloads only, vectors are not needed.
        while True:
            points, cursor = self._client.scroll(
                collection_name=self.index,
                scroll_filter=None,
                limit=self.scroll_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False,
            )
            lowest, highest = self._process_records_min_max(points, metadata_field, lowest, highest)
            if self._check_stop_scrolling(cursor):
                break

        # Both stay None when the field never appeared in any payload.
        if lowest is None or highest is None:
            return {}
        return {"min": lowest, "max": highest}
    except Exception as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_min_max()")
        return {}
|
|
1053
|
+
|
|
1054
|
+
async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, Any]:
    """
    Asynchronously scans the whole collection and returns the minimum and maximum values of a metadata field.

    :param metadata_field: The metadata field key (inside ``meta``) to get the minimum and maximum values for.

    :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
        metadata field across all documents. Returns an empty dict if no documents have the field.
    """
    await self._initialize_async_client()
    assert self._async_client is not None

    try:
        lowest: Any = None
        highest: Any = None
        cursor = None

        # Page through every point; payloads only, vectors are not needed.
        while True:
            points, cursor = await self._async_client.scroll(
                collection_name=self.index,
                scroll_filter=None,
                limit=self.scroll_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False,
            )
            lowest, highest = self._process_records_min_max(points, metadata_field, lowest, highest)
            if self._check_stop_scrolling(cursor):
                break

        # Both stay None when the field never appeared in any payload.
        if lowest is None or highest is None:
            return {}
        return {"min": lowest, "max": highest}
    except Exception as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_min_max_async()")
        return {}
|
|
1090
|
+
|
|
1091
|
+
def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
    """
    Counts distinct values of the given metadata fields among the documents matching the filters.

    :param filters: The filters to restrict the documents considered.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param metadata_fields: List of metadata field keys (inside ``meta``) to count unique values for.

    :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
        documents. All counts are 0 if the scan fails.
    """
    self._initialize_client()
    assert self._client is not None

    scroll_filter = convert_filters_to_qdrant(filters) if filters else None
    seen_values: dict[str, set[Any]] = {name: set() for name in metadata_fields}

    try:
        cursor = None
        # Page through the filtered points, accumulating distinct values per field.
        while True:
            points, cursor = self._client.scroll(
                collection_name=self.index,
                scroll_filter=scroll_filter,
                limit=self.scroll_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False,
            )
            self._process_records_count_unique(points, metadata_fields, seen_values)
            if self._check_stop_scrolling(cursor):
                break

        return {name: len(seen_values[name]) for name in metadata_fields}
    except Exception as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.count_unique_metadata_by_filter()")
        return dict.fromkeys(metadata_fields, 0)
|
|
1127
|
+
|
|
1128
|
+
async def count_unique_metadata_by_filter_async(
    self, filters: dict[str, Any], metadata_fields: list[str]
) -> dict[str, int]:
    """
    Asynchronously counts distinct values of the given metadata fields among the documents matching the filters.

    :param filters: The filters to restrict the documents considered.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param metadata_fields: List of metadata field keys (inside ``meta``) to count unique values for.

    :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
        documents. All counts are 0 if the scan fails.
    """
    await self._initialize_async_client()
    assert self._async_client is not None

    scroll_filter = convert_filters_to_qdrant(filters) if filters else None
    seen_values: dict[str, set[Any]] = {name: set() for name in metadata_fields}

    try:
        cursor = None
        # Page through the filtered points, accumulating distinct values per field.
        while True:
            points, cursor = await self._async_client.scroll(
                collection_name=self.index,
                scroll_filter=scroll_filter,
                limit=self.scroll_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False,
            )
            self._process_records_count_unique(points, metadata_fields, seen_values)
            if self._check_stop_scrolling(cursor):
                break

        return {name: len(seen_values[name]) for name in metadata_fields}
    except Exception as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.count_unique_metadata_by_filter_async()")
        return dict.fromkeys(metadata_fields, 0)
|
|
1167
|
+
|
|
1168
|
+
def get_metadata_field_unique_values(
    self, metadata_field: str, filters: dict[str, Any] | None = None, limit: int = 100, offset: int = 0
) -> list[Any]:
    """
    Collects unique values of a metadata field, with optional filters and offset/limit pagination.

    Unique values are ordered by first occurrence during scroll. Pagination is offset-based over that order.

    :param metadata_field: The metadata field key (inside ``meta``) to get unique values for.
    :param filters: Optional filters to restrict the documents considered.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param limit: Maximum number of unique values to return per page. Defaults to 100.
    :param offset: Number of unique values to skip (for pagination). Defaults to 0.

    :returns: A list of unique values for the field (at most ``limit`` items, starting at ``offset``).
    """
    self._initialize_client()
    assert self._client is not None

    scroll_filter = convert_filters_to_qdrant(filters) if filters else None
    collected: list[Any] = []
    seen: set[Any] = set()
    target = offset + limit

    try:
        cursor = None
        # Keep scrolling until enough unique values were collected to serve the requested page.
        while len(collected) < target:
            points, cursor = self._client.scroll(
                collection_name=self.index,
                scroll_filter=scroll_filter,
                limit=self.scroll_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False,
            )
            page_full = self._process_records_unique_values(
                points, metadata_field, collected, seen, offset, limit
            )
            # Stop when the helper signals the page is complete or the scroll is exhausted.
            if page_full or self._check_stop_scrolling(cursor):
                break

        return collected[offset:target]
    except Exception as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_unique_values()")
        return []
|
|
1213
|
+
|
|
1214
|
+
async def get_metadata_field_unique_values_async(
    self, metadata_field: str, filters: dict[str, Any] | None = None, limit: int = 100, offset: int = 0
) -> list[Any]:
    """
    Asynchronously collects unique values of a metadata field, with optional filters and
    offset/limit pagination.

    Unique values are ordered by first occurrence during scroll. Pagination is offset-based over that order.

    :param metadata_field: The metadata field key (inside ``meta``) to get unique values for.
    :param filters: Optional filters to restrict the documents considered.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param limit: Maximum number of unique values to return per page. Defaults to 100.
    :param offset: Number of unique values to skip (for pagination). Defaults to 0.

    :returns: A list of unique values for the field (at most ``limit`` items, starting at ``offset``).
    """
    await self._initialize_async_client()
    assert self._async_client is not None

    scroll_filter = convert_filters_to_qdrant(filters) if filters else None
    collected: list[Any] = []
    seen: set[Any] = set()
    target = offset + limit

    try:
        cursor = None
        # Keep scrolling until enough unique values were collected to serve the requested page.
        while len(collected) < target:
            points, cursor = await self._async_client.scroll(
                collection_name=self.index,
                scroll_filter=scroll_filter,
                limit=self.scroll_size,
                offset=cursor,
                with_payload=True,
                with_vectors=False,
            )
            page_full = self._process_records_unique_values(
                points, metadata_field, collected, seen, offset, limit
            )
            # Stop when the helper signals the page is complete or the scroll is exhausted.
            if page_full or self._check_stop_scrolling(cursor):
                break

        return collected[offset:target]
    except Exception as e:
        logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_unique_values_async()")
        return []
|
|
1259
|
+
|
|
518
1260
|
@classmethod
|
|
519
|
-
def from_dict(cls, data:
|
|
1261
|
+
def from_dict(cls, data: dict[str, Any]) -> "QdrantDocumentStore":
|
|
520
1262
|
"""
|
|
521
1263
|
Deserializes the component from a dictionary.
|
|
522
1264
|
|
|
@@ -528,7 +1270,7 @@ class QdrantDocumentStore:
|
|
|
528
1270
|
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
|
|
529
1271
|
return default_from_dict(cls, data)
|
|
530
1272
|
|
|
531
|
-
def to_dict(self) ->
|
|
1273
|
+
def to_dict(self) -> dict[str, Any]:
|
|
532
1274
|
"""
|
|
533
1275
|
Serializes the component to a dictionary.
|
|
534
1276
|
|
|
@@ -547,7 +1289,7 @@ class QdrantDocumentStore:
|
|
|
547
1289
|
|
|
548
1290
|
def _get_documents_generator(
|
|
549
1291
|
self,
|
|
550
|
-
filters:
|
|
1292
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
551
1293
|
) -> Generator[Document, None, None]:
|
|
552
1294
|
"""
|
|
553
1295
|
Returns a generator that yields documents from Qdrant based on the provided filters.
|
|
@@ -574,8 +1316,11 @@ class QdrantDocumentStore:
|
|
|
574
1316
|
with_vectors=True,
|
|
575
1317
|
)
|
|
576
1318
|
stop_scrolling = next_offset is None or (
|
|
577
|
-
|
|
578
|
-
|
|
1319
|
+
hasattr(next_offset, "num")
|
|
1320
|
+
and hasattr(next_offset, "uuid")
|
|
1321
|
+
and next_offset.num == 0
|
|
1322
|
+
and next_offset.uuid == ""
|
|
1323
|
+
) # PointId always has num and uuid
|
|
579
1324
|
|
|
580
1325
|
for record in records:
|
|
581
1326
|
yield convert_qdrant_point_to_haystack_document(
|
|
@@ -584,7 +1329,7 @@ class QdrantDocumentStore:
|
|
|
584
1329
|
|
|
585
1330
|
async def _get_documents_generator_async(
|
|
586
1331
|
self,
|
|
587
|
-
filters:
|
|
1332
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
588
1333
|
) -> AsyncGenerator[Document, None]:
|
|
589
1334
|
"""
|
|
590
1335
|
Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
|
|
@@ -611,8 +1356,11 @@ class QdrantDocumentStore:
|
|
|
611
1356
|
with_vectors=True,
|
|
612
1357
|
)
|
|
613
1358
|
stop_scrolling = next_offset is None or (
|
|
614
|
-
|
|
615
|
-
|
|
1359
|
+
hasattr(next_offset, "num")
|
|
1360
|
+
and hasattr(next_offset, "uuid")
|
|
1361
|
+
and next_offset.num == 0
|
|
1362
|
+
and next_offset.uuid == ""
|
|
1363
|
+
) # PointId always has num and uuid
|
|
616
1364
|
|
|
617
1365
|
for record in records:
|
|
618
1366
|
yield convert_qdrant_point_to_haystack_document(
|
|
@@ -621,19 +1369,17 @@ class QdrantDocumentStore:
|
|
|
621
1369
|
|
|
622
1370
|
def get_documents_by_id(
|
|
623
1371
|
self,
|
|
624
|
-
ids:
|
|
625
|
-
) ->
|
|
1372
|
+
ids: list[str],
|
|
1373
|
+
) -> list[Document]:
|
|
626
1374
|
"""
|
|
627
1375
|
Retrieves documents from Qdrant by their IDs.
|
|
628
1376
|
|
|
629
1377
|
:param ids:
|
|
630
1378
|
A list of document IDs to retrieve.
|
|
631
|
-
:param index:
|
|
632
|
-
The name of the index to retrieve documents from.
|
|
633
1379
|
:returns:
|
|
634
1380
|
A list of documents.
|
|
635
1381
|
"""
|
|
636
|
-
documents:
|
|
1382
|
+
documents: list[Document] = []
|
|
637
1383
|
|
|
638
1384
|
self._initialize_client()
|
|
639
1385
|
assert self._client is not None
|
|
@@ -654,19 +1400,17 @@ class QdrantDocumentStore:
|
|
|
654
1400
|
|
|
655
1401
|
async def get_documents_by_id_async(
|
|
656
1402
|
self,
|
|
657
|
-
ids:
|
|
658
|
-
) ->
|
|
1403
|
+
ids: list[str],
|
|
1404
|
+
) -> list[Document]:
|
|
659
1405
|
"""
|
|
660
1406
|
Retrieves documents from Qdrant by their IDs.
|
|
661
1407
|
|
|
662
1408
|
:param ids:
|
|
663
1409
|
A list of document IDs to retrieve.
|
|
664
|
-
:param index:
|
|
665
|
-
The name of the index to retrieve documents from.
|
|
666
1410
|
:returns:
|
|
667
1411
|
A list of documents.
|
|
668
1412
|
"""
|
|
669
|
-
documents:
|
|
1413
|
+
documents: list[Document] = []
|
|
670
1414
|
|
|
671
1415
|
await self._initialize_async_client()
|
|
672
1416
|
assert self._async_client is not None
|
|
@@ -688,14 +1432,14 @@ class QdrantDocumentStore:
|
|
|
688
1432
|
def _query_by_sparse(
|
|
689
1433
|
self,
|
|
690
1434
|
query_sparse_embedding: SparseEmbedding,
|
|
691
|
-
filters:
|
|
1435
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
692
1436
|
top_k: int = 10,
|
|
693
1437
|
scale_score: bool = False,
|
|
694
1438
|
return_embedding: bool = False,
|
|
695
|
-
score_threshold:
|
|
696
|
-
group_by:
|
|
697
|
-
group_size:
|
|
698
|
-
) ->
|
|
1439
|
+
score_threshold: float | None = None,
|
|
1440
|
+
group_by: str | None = None,
|
|
1441
|
+
group_size: int | None = None,
|
|
1442
|
+
) -> list[Document]:
|
|
699
1443
|
"""
|
|
700
1444
|
Queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
701
1445
|
|
|
@@ -742,7 +1486,7 @@ class QdrantDocumentStore:
|
|
|
742
1486
|
query_filter=qdrant_filters,
|
|
743
1487
|
limit=top_k,
|
|
744
1488
|
group_by=group_by,
|
|
745
|
-
group_size=group_size,
|
|
1489
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
746
1490
|
with_vectors=return_embedding,
|
|
747
1491
|
score_threshold=score_threshold,
|
|
748
1492
|
).groups
|
|
@@ -764,15 +1508,15 @@ class QdrantDocumentStore:
|
|
|
764
1508
|
|
|
765
1509
|
def _query_by_embedding(
|
|
766
1510
|
self,
|
|
767
|
-
query_embedding:
|
|
768
|
-
filters:
|
|
1511
|
+
query_embedding: list[float],
|
|
1512
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
769
1513
|
top_k: int = 10,
|
|
770
1514
|
scale_score: bool = False,
|
|
771
1515
|
return_embedding: bool = False,
|
|
772
|
-
score_threshold:
|
|
773
|
-
group_by:
|
|
774
|
-
group_size:
|
|
775
|
-
) ->
|
|
1516
|
+
score_threshold: float | None = None,
|
|
1517
|
+
group_by: str | None = None,
|
|
1518
|
+
group_size: int | None = None,
|
|
1519
|
+
) -> list[Document]:
|
|
776
1520
|
"""
|
|
777
1521
|
Queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
778
1522
|
|
|
@@ -804,7 +1548,7 @@ class QdrantDocumentStore:
|
|
|
804
1548
|
query_filter=qdrant_filters,
|
|
805
1549
|
limit=top_k,
|
|
806
1550
|
group_by=group_by,
|
|
807
|
-
group_size=group_size,
|
|
1551
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
808
1552
|
with_vectors=return_embedding,
|
|
809
1553
|
score_threshold=score_threshold,
|
|
810
1554
|
).groups
|
|
@@ -824,15 +1568,15 @@ class QdrantDocumentStore:
|
|
|
824
1568
|
|
|
825
1569
|
def _query_hybrid(
|
|
826
1570
|
self,
|
|
827
|
-
query_embedding:
|
|
1571
|
+
query_embedding: list[float],
|
|
828
1572
|
query_sparse_embedding: SparseEmbedding,
|
|
829
|
-
filters:
|
|
1573
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
830
1574
|
top_k: int = 10,
|
|
831
1575
|
return_embedding: bool = False,
|
|
832
|
-
score_threshold:
|
|
833
|
-
group_by:
|
|
834
|
-
group_size:
|
|
835
|
-
) ->
|
|
1576
|
+
score_threshold: float | None = None,
|
|
1577
|
+
group_by: str | None = None,
|
|
1578
|
+
group_size: int | None = None,
|
|
1579
|
+
) -> list[Document]:
|
|
836
1580
|
"""
|
|
837
1581
|
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
838
1582
|
|
|
@@ -896,7 +1640,7 @@ class QdrantDocumentStore:
|
|
|
896
1640
|
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
897
1641
|
limit=top_k,
|
|
898
1642
|
group_by=group_by,
|
|
899
|
-
group_size=group_size,
|
|
1643
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
900
1644
|
score_threshold=score_threshold,
|
|
901
1645
|
with_payload=True,
|
|
902
1646
|
with_vectors=return_embedding,
|
|
@@ -938,14 +1682,14 @@ class QdrantDocumentStore:
|
|
|
938
1682
|
async def _query_by_sparse_async(
|
|
939
1683
|
self,
|
|
940
1684
|
query_sparse_embedding: SparseEmbedding,
|
|
941
|
-
filters:
|
|
1685
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
942
1686
|
top_k: int = 10,
|
|
943
1687
|
scale_score: bool = False,
|
|
944
1688
|
return_embedding: bool = False,
|
|
945
|
-
score_threshold:
|
|
946
|
-
group_by:
|
|
947
|
-
group_size:
|
|
948
|
-
) ->
|
|
1689
|
+
score_threshold: float | None = None,
|
|
1690
|
+
group_by: str | None = None,
|
|
1691
|
+
group_size: int | None = None,
|
|
1692
|
+
) -> list[Document]:
|
|
949
1693
|
"""
|
|
950
1694
|
Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
951
1695
|
|
|
@@ -993,14 +1737,14 @@ class QdrantDocumentStore:
|
|
|
993
1737
|
query_filter=qdrant_filters,
|
|
994
1738
|
limit=top_k,
|
|
995
1739
|
group_by=group_by,
|
|
996
|
-
group_size=group_size,
|
|
1740
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
997
1741
|
with_vectors=return_embedding,
|
|
998
1742
|
score_threshold=score_threshold,
|
|
999
1743
|
)
|
|
1000
1744
|
groups = response.groups
|
|
1001
1745
|
return self._process_group_results(groups)
|
|
1002
1746
|
else:
|
|
1003
|
-
|
|
1747
|
+
query_response = await self._async_client.query_points(
|
|
1004
1748
|
collection_name=self.index,
|
|
1005
1749
|
query=rest.SparseVector(
|
|
1006
1750
|
indices=query_indices,
|
|
@@ -1012,20 +1756,20 @@ class QdrantDocumentStore:
|
|
|
1012
1756
|
with_vectors=return_embedding,
|
|
1013
1757
|
score_threshold=score_threshold,
|
|
1014
1758
|
)
|
|
1015
|
-
points =
|
|
1759
|
+
points = query_response.points
|
|
1016
1760
|
return self._process_query_point_results(points, scale_score=scale_score)
|
|
1017
1761
|
|
|
1018
1762
|
async def _query_by_embedding_async(
|
|
1019
1763
|
self,
|
|
1020
|
-
query_embedding:
|
|
1021
|
-
filters:
|
|
1764
|
+
query_embedding: list[float],
|
|
1765
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1022
1766
|
top_k: int = 10,
|
|
1023
1767
|
scale_score: bool = False,
|
|
1024
1768
|
return_embedding: bool = False,
|
|
1025
|
-
score_threshold:
|
|
1026
|
-
group_by:
|
|
1027
|
-
group_size:
|
|
1028
|
-
) ->
|
|
1769
|
+
score_threshold: float | None = None,
|
|
1770
|
+
group_by: str | None = None,
|
|
1771
|
+
group_size: int | None = None,
|
|
1772
|
+
) -> list[Document]:
|
|
1029
1773
|
"""
|
|
1030
1774
|
Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
1031
1775
|
|
|
@@ -1057,14 +1801,14 @@ class QdrantDocumentStore:
|
|
|
1057
1801
|
query_filter=qdrant_filters,
|
|
1058
1802
|
limit=top_k,
|
|
1059
1803
|
group_by=group_by,
|
|
1060
|
-
group_size=group_size,
|
|
1804
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
1061
1805
|
with_vectors=return_embedding,
|
|
1062
1806
|
score_threshold=score_threshold,
|
|
1063
1807
|
)
|
|
1064
1808
|
groups = response.groups
|
|
1065
1809
|
return self._process_group_results(groups)
|
|
1066
1810
|
else:
|
|
1067
|
-
|
|
1811
|
+
query_response = await self._async_client.query_points(
|
|
1068
1812
|
collection_name=self.index,
|
|
1069
1813
|
query=query_embedding,
|
|
1070
1814
|
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
@@ -1073,20 +1817,20 @@ class QdrantDocumentStore:
|
|
|
1073
1817
|
with_vectors=return_embedding,
|
|
1074
1818
|
score_threshold=score_threshold,
|
|
1075
1819
|
)
|
|
1076
|
-
points =
|
|
1820
|
+
points = query_response.points
|
|
1077
1821
|
return self._process_query_point_results(points, scale_score=scale_score)
|
|
1078
1822
|
|
|
1079
1823
|
async def _query_hybrid_async(
|
|
1080
1824
|
self,
|
|
1081
|
-
query_embedding:
|
|
1825
|
+
query_embedding: list[float],
|
|
1082
1826
|
query_sparse_embedding: SparseEmbedding,
|
|
1083
|
-
filters:
|
|
1827
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1084
1828
|
top_k: int = 10,
|
|
1085
1829
|
return_embedding: bool = False,
|
|
1086
|
-
score_threshold:
|
|
1087
|
-
group_by:
|
|
1088
|
-
group_size:
|
|
1089
|
-
) ->
|
|
1830
|
+
score_threshold: float | None = None,
|
|
1831
|
+
group_by: str | None = None,
|
|
1832
|
+
group_size: int | None = None,
|
|
1833
|
+
) -> list[Document]:
|
|
1090
1834
|
"""
|
|
1091
1835
|
Asynchronously retrieves documents based on dense and sparse embeddings and fuses
|
|
1092
1836
|
the results using Reciprocal Rank Fusion.
|
|
@@ -1148,14 +1892,14 @@ class QdrantDocumentStore:
|
|
|
1148
1892
|
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
1149
1893
|
limit=top_k,
|
|
1150
1894
|
group_by=group_by,
|
|
1151
|
-
group_size=group_size,
|
|
1895
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
1152
1896
|
score_threshold=score_threshold,
|
|
1153
1897
|
with_payload=True,
|
|
1154
1898
|
with_vectors=return_embedding,
|
|
1155
1899
|
)
|
|
1156
1900
|
groups = response.groups
|
|
1157
1901
|
else:
|
|
1158
|
-
|
|
1902
|
+
query_response = await self._async_client.query_points(
|
|
1159
1903
|
collection_name=self.index,
|
|
1160
1904
|
prefetch=[
|
|
1161
1905
|
rest.Prefetch(
|
|
@@ -1178,7 +1922,7 @@ class QdrantDocumentStore:
|
|
|
1178
1922
|
with_payload=True,
|
|
1179
1923
|
with_vectors=return_embedding,
|
|
1180
1924
|
)
|
|
1181
|
-
points =
|
|
1925
|
+
points = query_response.points
|
|
1182
1926
|
|
|
1183
1927
|
except Exception as e:
|
|
1184
1928
|
msg = "Error during hybrid search"
|
|
@@ -1210,9 +1954,10 @@ class QdrantDocumentStore:
|
|
|
1210
1954
|
)
|
|
1211
1955
|
raise QdrantStoreError(msg) from ke
|
|
1212
1956
|
|
|
1213
|
-
def _create_payload_index(self, collection_name: str, payload_fields_to_index:
|
|
1957
|
+
def _create_payload_index(self, collection_name: str, payload_fields_to_index: list[dict] | None = None) -> None:
|
|
1214
1958
|
"""
|
|
1215
|
-
Create payload index for the collection if payload_fields_to_index is provided
|
|
1959
|
+
Create payload index for the collection if payload_fields_to_index is provided.
|
|
1960
|
+
|
|
1216
1961
|
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
|
|
1217
1962
|
"""
|
|
1218
1963
|
if payload_fields_to_index is not None:
|
|
@@ -1228,15 +1973,15 @@ class QdrantDocumentStore:
|
|
|
1228
1973
|
)
|
|
1229
1974
|
|
|
1230
1975
|
async def _create_payload_index_async(
|
|
1231
|
-
self, collection_name: str, payload_fields_to_index:
|
|
1232
|
-
):
|
|
1976
|
+
self, collection_name: str, payload_fields_to_index: list[dict] | None = None
|
|
1977
|
+
) -> None:
|
|
1233
1978
|
"""
|
|
1234
|
-
Asynchronously create payload index for the collection if payload_fields_to_index is provided
|
|
1979
|
+
Asynchronously create payload index for the collection if payload_fields_to_index is provided.
|
|
1980
|
+
|
|
1235
1981
|
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
|
|
1236
1982
|
"""
|
|
1237
1983
|
if payload_fields_to_index is not None:
|
|
1238
1984
|
for payload_index in payload_fields_to_index:
|
|
1239
|
-
|
|
1240
1985
|
# self._async_client is initialized at this point
|
|
1241
1986
|
# since _initialize_async_client() is called before this method is executed
|
|
1242
1987
|
assert self._async_client is not None
|
|
@@ -1256,10 +2001,11 @@ class QdrantDocumentStore:
|
|
|
1256
2001
|
use_sparse_embeddings: bool,
|
|
1257
2002
|
sparse_idf: bool,
|
|
1258
2003
|
on_disk: bool = False,
|
|
1259
|
-
payload_fields_to_index:
|
|
1260
|
-
):
|
|
2004
|
+
payload_fields_to_index: list[dict] | None = None,
|
|
2005
|
+
) -> None:
|
|
1261
2006
|
"""
|
|
1262
2007
|
Sets up the Qdrant collection with the specified parameters.
|
|
2008
|
+
|
|
1263
2009
|
:param collection_name:
|
|
1264
2010
|
The name of the collection to set up.
|
|
1265
2011
|
:param embedding_dim:
|
|
@@ -1312,10 +2058,11 @@ class QdrantDocumentStore:
|
|
|
1312
2058
|
use_sparse_embeddings: bool,
|
|
1313
2059
|
sparse_idf: bool,
|
|
1314
2060
|
on_disk: bool = False,
|
|
1315
|
-
payload_fields_to_index:
|
|
1316
|
-
):
|
|
2061
|
+
payload_fields_to_index: list[dict] | None = None,
|
|
2062
|
+
) -> None:
|
|
1317
2063
|
"""
|
|
1318
2064
|
Asynchronously sets up the Qdrant collection with the specified parameters.
|
|
2065
|
+
|
|
1319
2066
|
:param collection_name:
|
|
1320
2067
|
The name of the collection to set up.
|
|
1321
2068
|
:param embedding_dim:
|
|
@@ -1362,12 +2109,12 @@ class QdrantDocumentStore:
|
|
|
1362
2109
|
def recreate_collection(
|
|
1363
2110
|
self,
|
|
1364
2111
|
collection_name: str,
|
|
1365
|
-
distance,
|
|
2112
|
+
distance: rest.Distance,
|
|
1366
2113
|
embedding_dim: int,
|
|
1367
|
-
on_disk:
|
|
1368
|
-
use_sparse_embeddings:
|
|
2114
|
+
on_disk: bool | None = None,
|
|
2115
|
+
use_sparse_embeddings: bool | None = None,
|
|
1369
2116
|
sparse_idf: bool = False,
|
|
1370
|
-
):
|
|
2117
|
+
) -> None:
|
|
1371
2118
|
"""
|
|
1372
2119
|
Recreates the Qdrant collection with the specified parameters.
|
|
1373
2120
|
|
|
@@ -1405,12 +2152,12 @@ class QdrantDocumentStore:
|
|
|
1405
2152
|
async def recreate_collection_async(
|
|
1406
2153
|
self,
|
|
1407
2154
|
collection_name: str,
|
|
1408
|
-
distance,
|
|
2155
|
+
distance: rest.Distance,
|
|
1409
2156
|
embedding_dim: int,
|
|
1410
|
-
on_disk:
|
|
1411
|
-
use_sparse_embeddings:
|
|
2157
|
+
on_disk: bool | None = None,
|
|
2158
|
+
use_sparse_embeddings: bool | None = None,
|
|
1412
2159
|
sparse_idf: bool = False,
|
|
1413
|
-
):
|
|
2160
|
+
) -> None:
|
|
1414
2161
|
"""
|
|
1415
2162
|
Asynchronously recreates the Qdrant collection with the specified parameters.
|
|
1416
2163
|
|
|
@@ -1447,9 +2194,9 @@ class QdrantDocumentStore:
|
|
|
1447
2194
|
|
|
1448
2195
|
def _handle_duplicate_documents(
|
|
1449
2196
|
self,
|
|
1450
|
-
documents:
|
|
1451
|
-
policy: DuplicatePolicy = None,
|
|
1452
|
-
):
|
|
2197
|
+
documents: list[Document],
|
|
2198
|
+
policy: DuplicatePolicy | None = None,
|
|
2199
|
+
) -> list[Document]:
|
|
1453
2200
|
"""
|
|
1454
2201
|
Checks whether any of the passed documents is already existing in the chosen index and returns a list of
|
|
1455
2202
|
documents that are not in the index yet.
|
|
@@ -1462,7 +2209,7 @@ class QdrantDocumentStore:
|
|
|
1462
2209
|
if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
|
|
1463
2210
|
documents = self._drop_duplicate_documents(documents)
|
|
1464
2211
|
documents_found = self.get_documents_by_id(ids=[doc.id for doc in documents])
|
|
1465
|
-
ids_exist_in_db:
|
|
2212
|
+
ids_exist_in_db: list[str] = [doc.id for doc in documents_found]
|
|
1466
2213
|
|
|
1467
2214
|
if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
|
|
1468
2215
|
msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
|
|
@@ -1474,9 +2221,9 @@ class QdrantDocumentStore:
|
|
|
1474
2221
|
|
|
1475
2222
|
async def _handle_duplicate_documents_async(
|
|
1476
2223
|
self,
|
|
1477
|
-
documents:
|
|
1478
|
-
policy: DuplicatePolicy = None,
|
|
1479
|
-
):
|
|
2224
|
+
documents: list[Document],
|
|
2225
|
+
policy: DuplicatePolicy | None = None,
|
|
2226
|
+
) -> list[Document]:
|
|
1480
2227
|
"""
|
|
1481
2228
|
Asynchronously checks whether any of the passed documents is already existing
|
|
1482
2229
|
in the chosen index and returns a list of
|
|
@@ -1490,7 +2237,7 @@ class QdrantDocumentStore:
|
|
|
1490
2237
|
if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
|
|
1491
2238
|
documents = self._drop_duplicate_documents(documents)
|
|
1492
2239
|
documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
|
|
1493
|
-
ids_exist_in_db:
|
|
2240
|
+
ids_exist_in_db: list[str] = [doc.id for doc in documents_found]
|
|
1494
2241
|
|
|
1495
2242
|
if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
|
|
1496
2243
|
msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
|
|
@@ -1500,13 +2247,13 @@ class QdrantDocumentStore:
|
|
|
1500
2247
|
|
|
1501
2248
|
return documents
|
|
1502
2249
|
|
|
1503
|
-
def _drop_duplicate_documents(self, documents:
|
|
2250
|
+
def _drop_duplicate_documents(self, documents: list[Document]) -> list[Document]:
|
|
1504
2251
|
"""
|
|
1505
2252
|
Drop duplicate documents based on same hash ID.
|
|
1506
2253
|
|
|
1507
2254
|
"""
|
|
1508
|
-
_hash_ids:
|
|
1509
|
-
_documents:
|
|
2255
|
+
_hash_ids: set = set()
|
|
2256
|
+
_documents: list[Document] = []
|
|
1510
2257
|
|
|
1511
2258
|
for document in documents:
|
|
1512
2259
|
if document.id in _hash_ids:
|
|
@@ -1521,7 +2268,7 @@ class QdrantDocumentStore:
|
|
|
1521
2268
|
|
|
1522
2269
|
return _documents
|
|
1523
2270
|
|
|
1524
|
-
def _prepare_collection_params(self):
|
|
2271
|
+
def _prepare_collection_params(self) -> dict[str, Any]:
|
|
1525
2272
|
"""
|
|
1526
2273
|
Prepares the common parameters for collection creation.
|
|
1527
2274
|
"""
|
|
@@ -1534,10 +2281,9 @@ class QdrantDocumentStore:
|
|
|
1534
2281
|
"optimizers_config": self.optimizers_config,
|
|
1535
2282
|
"wal_config": self.wal_config,
|
|
1536
2283
|
"quantization_config": self.quantization_config,
|
|
1537
|
-
"init_from": self.init_from,
|
|
1538
2284
|
}
|
|
1539
2285
|
|
|
1540
|
-
def _prepare_client_params(self):
|
|
2286
|
+
def _prepare_client_params(self) -> dict[str, Any]:
|
|
1541
2287
|
"""
|
|
1542
2288
|
Prepares the common parameters for client initialization.
|
|
1543
2289
|
|
|
@@ -1554,18 +2300,21 @@ class QdrantDocumentStore:
|
|
|
1554
2300
|
"timeout": self.timeout,
|
|
1555
2301
|
"host": self.host,
|
|
1556
2302
|
"path": self.path,
|
|
1557
|
-
|
|
2303
|
+
# NOTE: We purposefully expand the fields of self.metadata to avoid modifying the original self.metadata
|
|
2304
|
+
# class attribute. For example, the resolved api key is added to metadata by the QdrantClient class
|
|
2305
|
+
# when using a hosted Qdrant service, which means running to_dict() exposes the api key.
|
|
2306
|
+
"metadata": {**self.metadata},
|
|
1558
2307
|
"force_disable_check_same_thread": self.force_disable_check_same_thread,
|
|
1559
2308
|
}
|
|
1560
2309
|
|
|
1561
2310
|
def _prepare_collection_config(
|
|
1562
2311
|
self,
|
|
1563
2312
|
embedding_dim: int,
|
|
1564
|
-
distance,
|
|
1565
|
-
on_disk:
|
|
1566
|
-
use_sparse_embeddings:
|
|
2313
|
+
distance: rest.Distance,
|
|
2314
|
+
on_disk: bool | None = None,
|
|
2315
|
+
use_sparse_embeddings: bool | None = None,
|
|
1567
2316
|
sparse_idf: bool = False,
|
|
1568
|
-
):
|
|
2317
|
+
) -> tuple[dict[str, rest.VectorParams] | rest.VectorParams, dict[str, rest.SparseVectorParams] | None]:
|
|
1569
2318
|
"""
|
|
1570
2319
|
Prepares the configuration for creating or recreating a Qdrant collection.
|
|
1571
2320
|
|
|
@@ -1577,12 +2326,14 @@ class QdrantDocumentStore:
|
|
|
1577
2326
|
use_sparse_embeddings = self.use_sparse_embeddings
|
|
1578
2327
|
|
|
1579
2328
|
# dense vectors configuration
|
|
1580
|
-
|
|
1581
|
-
|
|
2329
|
+
base_vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
|
|
2330
|
+
vectors_config: rest.VectorParams | dict[str, rest.VectorParams] = base_vectors_config
|
|
2331
|
+
|
|
2332
|
+
sparse_vectors_config: dict[str, rest.SparseVectorParams] | None = None
|
|
1582
2333
|
|
|
1583
2334
|
if use_sparse_embeddings:
|
|
1584
2335
|
# in this case, we need to define named vectors
|
|
1585
|
-
vectors_config = {DENSE_VECTORS_NAME:
|
|
2336
|
+
vectors_config = {DENSE_VECTORS_NAME: base_vectors_config}
|
|
1586
2337
|
|
|
1587
2338
|
sparse_vectors_config = {
|
|
1588
2339
|
SPARSE_VECTORS_NAME: rest.SparseVectorParams(
|
|
@@ -1595,9 +2346,13 @@ class QdrantDocumentStore:
|
|
|
1595
2346
|
|
|
1596
2347
|
return vectors_config, sparse_vectors_config
|
|
1597
2348
|
|
|
1598
|
-
|
|
2349
|
+
@staticmethod
|
|
2350
|
+
def _validate_filters(filters: dict[str, Any] | rest.Filter | None = None) -> None:
|
|
1599
2351
|
"""
|
|
1600
2352
|
Validates the filters provided for querying.
|
|
2353
|
+
|
|
2354
|
+
:param filters: Filters to validate. Can be a dictionary or an instance of `qdrant_client.http.models.Filter`.
|
|
2355
|
+
:raises ValueError: If the filters are not in the correct format or syntax.
|
|
1601
2356
|
"""
|
|
1602
2357
|
if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
|
|
1603
2358
|
msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
|
|
@@ -1607,7 +2362,9 @@ class QdrantDocumentStore:
|
|
|
1607
2362
|
msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
|
1608
2363
|
raise ValueError(msg)
|
|
1609
2364
|
|
|
1610
|
-
def _process_query_point_results(
|
|
2365
|
+
def _process_query_point_results(
|
|
2366
|
+
self, results: list[rest.ScoredPoint], scale_score: bool = False
|
|
2367
|
+
) -> list[Document]:
|
|
1611
2368
|
"""
|
|
1612
2369
|
Processes query results from Qdrant.
|
|
1613
2370
|
"""
|
|
@@ -1619,15 +2376,17 @@ class QdrantDocumentStore:
|
|
|
1619
2376
|
if scale_score:
|
|
1620
2377
|
for document in documents:
|
|
1621
2378
|
score = document.score
|
|
2379
|
+
if score is None:
|
|
2380
|
+
continue
|
|
1622
2381
|
if self.similarity == "cosine":
|
|
1623
2382
|
score = (score + 1) / 2
|
|
1624
2383
|
else:
|
|
1625
|
-
score = float(1 / (1 +
|
|
2384
|
+
score = float(1 / (1 + exp(-score / 100)))
|
|
1626
2385
|
document.score = score
|
|
1627
2386
|
|
|
1628
2387
|
return documents
|
|
1629
2388
|
|
|
1630
|
-
def _process_group_results(self, groups):
|
|
2389
|
+
def _process_group_results(self, groups: list[rest.PointGroup]) -> list[Document]:
|
|
1631
2390
|
"""
|
|
1632
2391
|
Processes grouped query results from Qdrant.
|
|
1633
2392
|
|
|
@@ -1644,16 +2403,22 @@ class QdrantDocumentStore:
|
|
|
1644
2403
|
def _validate_collection_compatibility(
|
|
1645
2404
|
self,
|
|
1646
2405
|
collection_name: str,
|
|
1647
|
-
collection_info,
|
|
1648
|
-
distance,
|
|
2406
|
+
collection_info: rest.CollectionInfo,
|
|
2407
|
+
distance: rest.Distance,
|
|
1649
2408
|
embedding_dim: int,
|
|
1650
|
-
):
|
|
2409
|
+
) -> None:
|
|
1651
2410
|
"""
|
|
1652
2411
|
Validates that an existing collection is compatible with the current configuration.
|
|
1653
2412
|
"""
|
|
1654
|
-
|
|
2413
|
+
vectors_config = collection_info.config.params.vectors
|
|
1655
2414
|
|
|
1656
|
-
if
|
|
2415
|
+
if vectors_config is None:
|
|
2416
|
+
msg = f"Collection '{collection_name}' has no vector configuration."
|
|
2417
|
+
raise QdrantStoreError(msg)
|
|
2418
|
+
|
|
2419
|
+
has_named_vectors = isinstance(vectors_config, dict)
|
|
2420
|
+
|
|
2421
|
+
if has_named_vectors and DENSE_VECTORS_NAME not in vectors_config:
|
|
1657
2422
|
msg = (
|
|
1658
2423
|
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
1659
2424
|
f"but it has been originally created outside of Haystack and is not supported. "
|
|
@@ -1685,11 +2450,20 @@ class QdrantDocumentStore:
|
|
|
1685
2450
|
|
|
1686
2451
|
# Get current distance and vector size based on collection configuration
|
|
1687
2452
|
if self.use_sparse_embeddings:
|
|
1688
|
-
|
|
1689
|
-
|
|
2453
|
+
if not isinstance(vectors_config, dict):
|
|
2454
|
+
msg = f"Collection '{collection_name}' has invalid vector configuration for sparse embeddings."
|
|
2455
|
+
raise QdrantStoreError(msg)
|
|
2456
|
+
|
|
2457
|
+
dense_vector_config = vectors_config[DENSE_VECTORS_NAME]
|
|
2458
|
+
current_distance = dense_vector_config.distance
|
|
2459
|
+
current_vector_size = dense_vector_config.size
|
|
1690
2460
|
else:
|
|
1691
|
-
|
|
1692
|
-
|
|
2461
|
+
if isinstance(vectors_config, dict):
|
|
2462
|
+
msg = f"Collection '{collection_name}' has invalid vector configuration for dense embeddings only."
|
|
2463
|
+
raise QdrantStoreError(msg)
|
|
2464
|
+
|
|
2465
|
+
current_distance = vectors_config.distance
|
|
2466
|
+
current_vector_size = vectors_config.size
|
|
1693
2467
|
|
|
1694
2468
|
# Validate distance metric
|
|
1695
2469
|
if current_distance != distance:
|