qdrant-haystack 6.0.0__py3-none-any.whl → 10.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- haystack_integrations/components/retrievers/py.typed +0 -0
- haystack_integrations/components/retrievers/qdrant/__init__.py +1 -1
- haystack_integrations/components/retrievers/qdrant/retriever.py +269 -56
- haystack_integrations/document_stores/py.typed +0 -0
- haystack_integrations/document_stores/qdrant/converters.py +15 -13
- haystack_integrations/document_stores/qdrant/document_store.py +1802 -355
- haystack_integrations/document_stores/qdrant/filters.py +87 -168
- haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +8 -3
- {qdrant_haystack-6.0.0.dist-info → qdrant_haystack-10.2.0.dist-info}/METADATA +12 -27
- qdrant_haystack-10.2.0.dist-info/RECORD +13 -0
- {qdrant_haystack-6.0.0.dist-info → qdrant_haystack-10.2.0.dist-info}/WHEEL +1 -1
- qdrant_haystack-6.0.0.dist-info/RECORD +0 -11
- {qdrant_haystack-6.0.0.dist-info → qdrant_haystack-10.2.0.dist-info}/licenses/LICENSE.txt +0 -0
--- a/haystack_integrations/document_stores/qdrant/document_store.py
+++ b/haystack_integrations/document_stores/qdrant/document_store.py
@@ -1,17 +1,16 @@
 import inspect
-import
+from collections.abc import AsyncGenerator, Generator
 from itertools import islice
-from typing import Any, ClassVar,
+from typing import Any, ClassVar, cast
 
-import numpy as np
 import qdrant_client
-from haystack import default_from_dict, default_to_dict
+from haystack import default_from_dict, default_to_dict, logging
 from haystack.dataclasses import Document
 from haystack.dataclasses.sparse_embedding import SparseEmbedding
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.utils import Secret, deserialize_secrets_inplace
-from
+from numpy import exp
 from qdrant_client.http import models as rest
 from qdrant_client.http.exceptions import UnexpectedResponse
 from tqdm import tqdm
@@ -27,15 +26,21 @@ from .filters import convert_filters_to_qdrant
 
 logger = logging.getLogger(__name__)
 
+# Default group size to apply when using group_by
+# - Our methods use None as the default for optional group_size parameter.
+# - Qdrant expects an integer and internally defaults to 3 when performing grouped queries.
+# - When group_by is specified but group_size is None, we use this value instead of passing None.
+DEFAULT_GROUP_SIZE = 3
+
 
 class QdrantStoreError(DocumentStoreError):
     pass
 
 
-FilterType =
+FilterType = dict[str, dict[str, Any] | list[Any] | str | int | float | bool]
 
 
-def get_batches_from_generator(iterable, n):
+def get_batches_from_generator(iterable: list, n: int) -> Generator:
     """
     Batch elements of an iterable into fixed-length chunks or blocks.
     """
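The hunk re-types `get_batches_from_generator` but its body sits outside the diff. For orientation, a minimal `islice`-based batcher matching the new signature could look like the following (an illustration, not the package's verbatim body; `itertools.islice` is already imported at module top):

```python
from collections.abc import Generator
from itertools import islice


def get_batches_from_generator(iterable: list, n: int) -> Generator:
    """
    Batch elements of an iterable into fixed-length chunks or blocks.
    """
    it = iter(iterable)
    batch = tuple(islice(it, n))
    while batch:
        yield batch  # the final batch may hold fewer than n elements
        batch = tuple(islice(it, n))
```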
@@ -48,9 +53,8 @@ def get_batches_from_generator(iterable, n):
 
 class QdrantDocumentStore:
     """
-    QdrantDocumentStore
-
-    and Qdrant Cloud Cluster deployments.
+    A QdrantDocumentStore implementation that you can use with any Qdrant instance: in-memory, disk-persisted,
+    Docker-based, and Qdrant Cloud Cluster deployments.
 
     Usage example by creating an in-memory instance:
 
@@ -60,7 +64,8 @@ class QdrantDocumentStore:
 
     document_store = QdrantDocumentStore(
         ":memory:",
-        recreate_index=True
+        recreate_index=True,
+        embedding_dim=5
     )
     document_store.write_documents([
         Document(content="This is first", embedding=[0.0]*5),
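Read together, the context and added lines of this hunk give the updated docstring example; the constructor now pins `embedding_dim` explicitly so it matches the 5-dimensional embeddings written below. Reassembled as a runnable snippet (the import lines are assumed, as they sit outside the hunk):

```python
from haystack.dataclasses import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

document_store = QdrantDocumentStore(
    ":memory:",
    recreate_index=True,
    embedding_dim=5,
)
document_store.write_documents([
    Document(content="This is first", embedding=[0.0] * 5),
])
```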
@@ -85,7 +90,7 @@ class QdrantDocumentStore:
     ```
     """
 
-    SIMILARITY: ClassVar[
+    SIMILARITY: ClassVar[dict[str, rest.Distance]] = {
         "cosine": rest.Distance.COSINE,
         "dot_product": rest.Distance.DOT,
         "l2": rest.Distance.EUCLID,
@@ -93,17 +98,17 @@ class QdrantDocumentStore:
 
     def __init__(
         self,
-        location:
-        url:
+        location: str | None = None,
+        url: str | None = None,
         port: int = 6333,
         grpc_port: int = 6334,
         prefer_grpc: bool = False,
-        https:
-        api_key:
-        prefix:
-        timeout:
-        host:
-        path:
+        https: bool | None = None,
+        api_key: Secret | None = None,
+        prefix: str | None = None,
+        timeout: int | None = None,
+        host: str | None = None,
+        path: str | None = None,
         force_disable_check_same_thread: bool = False,
         index: str = "Document",
         embedding_dim: int = 768,
@@ -114,24 +119,25 @@ class QdrantDocumentStore:
         return_embedding: bool = False,
         progress_bar: bool = True,
         recreate_index: bool = False,
-        shard_number:
-        replication_factor:
-        write_consistency_factor:
-        on_disk_payload:
-        hnsw_config:
-        optimizers_config:
-        wal_config:
-        quantization_config:
-        init_from: Optional[dict] = None,
+        shard_number: int | None = None,
+        replication_factor: int | None = None,
+        write_consistency_factor: int | None = None,
+        on_disk_payload: bool | None = None,
+        hnsw_config: dict | None = None,
+        optimizers_config: dict | None = None,
+        wal_config: dict | None = None,
+        quantization_config: dict | None = None,
         wait_result_from_api: bool = True,
-        metadata:
+        metadata: dict | None = None,
         write_batch_size: int = 100,
         scroll_size: int = 10_000,
-        payload_fields_to_index:
-    ):
+        payload_fields_to_index: list[dict] | None = None,
+    ) -> None:
         """
+        Initializes a QdrantDocumentStore.
+
         :param location:
-            If `memory` - use in-memory Qdrant instance.
+            If `":memory:"` - use in-memory Qdrant instance.
            If `str` - use it as a URL parameter.
            If `None` - use default values for host and port.
        :param url:
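One pattern runs through the whole `__init__` rewrite: `Optional[X]` annotations become PEP 604 unions (`X | None`) and `typing` generics give way to built-ins (`dict`, `list[dict]`). The two spellings are equivalent, but the union syntax only works at runtime on Python 3.10+, so the change implies a raised interpreter floor:

```python
from typing import Optional


def old_style(timeout: Optional[int] = None) -> None:  # spelling used in 6.0.0
    ...


def new_style(timeout: int | None = None) -> None:  # spelling used in 10.2.0; Python >= 3.10
    ...
```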
@@ -165,7 +171,7 @@ class QdrantDocumentStore:
             Dimension of the embeddings.
         :param on_disk:
             Whether to store the collection on disk.
-        :param
+        :param use_sparse_embeddings:
             If set to `True`, enables support for sparse embeddings.
         :param sparse_idf:
             If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
@@ -202,8 +208,6 @@ class QdrantDocumentStore:
             Params for Write-Ahead-Log.
         :param quantization_config:
             Params for quantization. If `None`, quantization will be disabled.
-        :param init_from:
-            Use data stored in another collection to initialize this collection.
         :param wait_result_from_api:
             Whether to wait for the result from the API after each request.
         :param metadata:
@@ -216,7 +220,8 @@ class QdrantDocumentStore:
             List of payload fields to index.
         """
 
-        self._client = None
+        self._client: qdrant_client.QdrantClient | None = None
+        self._async_client: qdrant_client.AsyncQdrantClient | None = None
 
         # Store the Qdrant client specific attributes
         self.location = location
@@ -232,7 +237,6 @@ class QdrantDocumentStore:
         self.path = path
         self.force_disable_check_same_thread = force_disable_check_same_thread
         self.metadata = metadata or {}
-        self.api_key = api_key
 
         # Store the Qdrant collection specific attributes
         self.shard_number = shard_number
@@ -243,7 +247,6 @@ class QdrantDocumentStore:
         self.optimizers_config = optimizers_config
         self.wal_config = wal_config
         self.quantization_config = quantization_config
-        self.init_from = init_from
         self.wait_result_from_api = wait_result_from_api
         self.recreate_index = recreate_index
         self.payload_fields_to_index = payload_fields_to_index
@@ -258,24 +261,11 @@ class QdrantDocumentStore:
         self.write_batch_size = write_batch_size
         self.scroll_size = scroll_size
 
-
-
-
-
-
-            url=self.url,
-            port=self.port,
-            grpc_port=self.grpc_port,
-            prefer_grpc=self.prefer_grpc,
-            https=self.https,
-            api_key=self.api_key.resolve_value() if self.api_key else None,
-            prefix=self.prefix,
-            timeout=self.timeout,
-            host=self.host,
-            path=self.path,
-            metadata=self.metadata,
-            force_disable_check_same_thread=self.force_disable_check_same_thread,
-        )
+    def _initialize_client(self) -> None:
+        if self._client is None:
+            client_params = self._prepare_client_params()
+            # This step adds the api-key and User-Agent to metadata
+            self._client = qdrant_client.QdrantClient(**client_params)
             # Make sure the collection is properly set up
             self._set_up_collection(
                 self.index,
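The deleted block shows the client being built eagerly in `__init__`; the replacement defers construction to `_initialize_client()`, which every public method now calls first. `_prepare_client_params` itself is not part of this diff; judging by the kwargs the old constructor passed straight to `qdrant_client.QdrantClient`, it plausibly assembles something like the following (an illustrative reconstruction, not the package's verbatim helper):

```python
from typing import Any


def _prepare_client_params(self) -> dict[str, Any]:
    # Reconstructed from the kwargs visible in the deleted block above;
    # the real helper lives outside this hunk.
    return {
        "location": self.location,
        "url": self.url,
        "port": self.port,
        "grpc_port": self.grpc_port,
        "prefer_grpc": self.prefer_grpc,
        "https": self.https,
        "api_key": self.api_key.resolve_value() if self.api_key else None,
        "prefix": self.prefix,
        "timeout": self.timeout,
        "host": self.host,
        "path": self.path,
        "metadata": self.metadata,
        "force_disable_check_same_thread": self.force_disable_check_same_thread,
    }
```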
@@ -287,14 +277,52 @@ class QdrantDocumentStore:
                 self.on_disk,
                 self.payload_fields_to_index,
             )
-
+
+    async def _initialize_async_client(self) -> None:
+        """
+        Returns the asynchronous Qdrant client, initializing it if necessary.
+        """
+        if self._async_client is None:
+            client_params = self._prepare_client_params()
+            self._async_client = qdrant_client.AsyncQdrantClient(
+                **client_params,
+            )
+            await self._set_up_collection_async(
+                self.index,
+                self.embedding_dim,
+                self.recreate_index,
+                self.similarity,
+                self.use_sparse_embeddings,
+                self.sparse_idf,
+                self.on_disk,
+                self.payload_fields_to_index,
+            )
 
     def count_documents(self) -> int:
         """
         Returns the number of documents present in the Document Store.
         """
+        self._initialize_client()
+        assert self._client is not None
+        try:
+            response = self._client.count(
+                collection_name=self.index,
+            )
+            return response.count
+        except (UnexpectedResponse, ValueError):
+            # Qdrant local raises ValueError if the collection is not found, but
+            # with the remote server UnexpectedResponse is raised. Until that's unified,
+            # we need to catch both.
+            return 0
+
+    async def count_documents_async(self) -> int:
+        """
+        Asynchronously returns the number of documents present in the document store.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
         try:
-            response = self.
+            response = await self._async_client.count(
                 collection_name=self.index,
             )
             return response.count
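Both counters swallow the collection-not-found case and report zero instead of raising; note that the sync and async clients are initialized independently, so with `":memory:"` each owns its own local storage. A minimal async usage sketch:

```python
import asyncio

from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


async def main() -> None:
    store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
    # First call lazily creates the AsyncQdrantClient and sets up the collection.
    print(await store.count_documents_async())  # 0 on a fresh collection


asyncio.run(main())
```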
@@ -306,8 +334,8 @@ class QdrantDocumentStore:
 
     def filter_documents(
         self,
-        filters:
-    ) ->
+        filters: dict[str, Any] | rest.Filter | None = None,
+    ) -> list[Document]:
         """
         Returns the documents that match the provided filters.
 
@@ -317,22 +345,32 @@ class QdrantDocumentStore:
         :param filters: The filters to apply to the document list.
         :returns: A list of documents that match the given filters.
         """
-
-
-            raise ValueError(msg)
+        # No need to initialize client here as _get_documents_generator
+        # will handle client initialization internally
 
-
-            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
-            raise ValueError(msg)
+        QdrantDocumentStore._validate_filters(filters)
         return list(
-            self.
+            self._get_documents_generator(
                 filters,
             )
         )
 
+    async def filter_documents_async(
+        self,
+        filters: dict[str, Any] | rest.Filter | None = None,
+    ) -> list[Document]:
+        """
+        Asynchronously returns the documents that match the provided filters.
+        """
+        # No need to initialize client here as _get_documents_generator_async
+        # will handle client initialization internally
+
+        QdrantDocumentStore._validate_filters(filters)
+        return [doc async for doc in self._get_documents_generator_async(filters)]
+
     def write_documents(
         self,
-        documents:
+        documents: list[Document],
         policy: DuplicatePolicy = DuplicatePolicy.FAIL,
     ) -> int:
         """
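Both filter methods now route validation through a shared `_validate_filters` static method whose body sits outside these hunks. Judging from the inline checks the old code performed (see the deleted `msg` lines), it presumably looks something like this sketch (illustrative only):

```python
@staticmethod
def _validate_filters(filters: dict[str, Any] | rest.Filter | None = None) -> None:
    # Sketch: the real body is not part of this diff.
    if filters and not isinstance(filters, (dict, rest.Filter)):
        msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
        raise ValueError(msg)
    if isinstance(filters, dict) and "operator" not in filters:
        msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
        raise ValueError(msg)
```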
@@ -348,13 +386,14 @@ class QdrantDocumentStore:
 
         :returns: The number of documents written to the document store.
         """
+
+        self._initialize_client()
+        assert self._client is not None
+
         for doc in documents:
             if not isinstance(doc, Document):
                 msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
                 raise ValueError(msg)
-        self._set_up_collection(
-            self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
-        )
 
         if len(documents) == 0:
             logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
@@ -362,7 +401,6 @@ class QdrantDocumentStore:
 
         document_objects = self._handle_duplicate_documents(
             documents=documents,
-            index=self.index,
             policy=policy,
         )
 
@@ -374,7 +412,61 @@ class QdrantDocumentStore:
                     use_sparse_embeddings=self.use_sparse_embeddings,
                 )
 
-                self.
+                self._client.upsert(
+                    collection_name=self.index,
+                    points=batch,
+                    wait=self.wait_result_from_api,
+                )
+
+                progress_bar.update(self.write_batch_size)
+        return len(document_objects)
+
+    async def write_documents_async(
+        self,
+        documents: list[Document],
+        policy: DuplicatePolicy = DuplicatePolicy.FAIL,
+    ) -> int:
+        """
+        Asynchronously writes documents to Qdrant using the specified policy.
+        The QdrantDocumentStore can handle duplicate documents based on the given policy.
+        The available policies are:
+        - `FAIL`: The operation will raise an error if any document already exists.
+        - `OVERWRITE`: Existing documents will be overwritten with the new ones.
+        - `SKIP`: Existing documents will be skipped, and only new documents will be added.
+
+        :param documents: A list of Document objects to write to Qdrant.
+        :param policy: The policy for handling duplicate documents.
+
+        :returns: The number of documents written to the document store.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        for doc in documents:
+            if not isinstance(doc, Document):
+                msg = f"""DocumentStore.write_documents_async() expects a list of
+                Documents but got an element of {type(doc)}."""
+                raise ValueError(msg)
+
+        if len(documents) == 0:
+            logger.warning("Calling QdrantDocumentStore.write_documents_async() with empty list")
+            return 0
+
+        document_objects = await self._handle_duplicate_documents_async(
+            documents=documents,
+            policy=policy,
+        )
+
+        batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
+        with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
+            for document_batch in batched_documents:
+                batch = convert_haystack_documents_to_qdrant_points(
+                    document_batch,
+                    use_sparse_embeddings=self.use_sparse_embeddings,
+                )
+
+                await self._async_client.upsert(
                     collection_name=self.index,
                     points=batch,
                     wait=self.wait_result_from_api,
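`write_documents_async` mirrors the sync path end to end: type check, empty-list guard, async duplicate handling, then batched `upsert` calls sized by `write_batch_size`. Usage sketch:

```python
import asyncio

from haystack.dataclasses import Document
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


async def main() -> None:
    store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
    docs = [Document(content="This is first", embedding=[0.0] * 5)]
    written = await store.write_documents_async(docs, policy=DuplicatePolicy.OVERWRITE)
    print(written)  # 1


asyncio.run(main())
```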
@@ -383,17 +475,20 @@ class QdrantDocumentStore:
                 progress_bar.update(self.write_batch_size)
         return len(document_objects)
 
-    def delete_documents(self, document_ids:
+    def delete_documents(self, document_ids: list[str]) -> None:
         """
         Deletes documents that match the provided `document_ids` from the document store.
 
         :param document_ids: the document ids to delete
         """
-
+
+        self._initialize_client()
+        assert self._client is not None
+
         try:
-            self.
+            self._client.delete(
                 collection_name=self.index,
-                points_selector=
+                points_selector=rest.PointIdsList(points=[convert_id(_id) for _id in document_ids]),
                 wait=self.wait_result_from_api,
             )
         except KeyError:
@@ -401,149 +496,987 @@
                 "Called QdrantDocumentStore.delete_documents() on a non-existing ID",
             )
 
-
-    def from_dict(cls, data: Dict[str, Any]) -> "QdrantDocumentStore":
+    async def delete_documents_async(self, document_ids: list[str]) -> None:
         """
-
+        Asynchronously deletes documents that match the provided `document_ids` from the document store.
 
-        :param
-            The dictionary to deserialize from.
-        :returns:
-            The deserialized component.
+        :param document_ids: the document ids to delete
         """
-        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
-        return default_from_dict(cls, data)
 
-
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        try:
+            await self._async_client.delete(
+                collection_name=self.index,
+                points_selector=rest.PointIdsList(points=[convert_id(_id) for _id in document_ids]),
+                wait=self.wait_result_from_api,
+            )
+        except KeyError:
+            logger.warning(
+                "Called QdrantDocumentStore.delete_documents_async() on a non-existing ID",
+            )
+
+    def delete_by_filter(self, filters: dict[str, Any]) -> int:
         """
-
+        Deletes all documents that match the provided filters.
+
+        :param filters: The filters to apply to select documents for deletion.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
 
         :returns:
-
+            The number of documents deleted.
         """
-
-
-        # Set as init_params without default values
-        init_params = {k: getattr(self, k) for k in params}
-        init_params["api_key"] = self.api_key.to_dict() if self.api_key else None
-        return default_to_dict(
-            self,
-            **init_params,
-        )
+        self._initialize_client()
+        assert self._client is not None
 
-
-
-
-
+        try:
+            qdrant_filter = convert_filters_to_qdrant(filters)
+            if qdrant_filter is None:
+                return 0
+
+            count_response = self._client.count(
+                collection_name=self.index,
+                count_filter=qdrant_filter,
+            )
+            deleted_count = count_response.count
+
+            self._client.delete(
+                collection_name=self.index,
+                points_selector=rest.FilterSelector(filter=qdrant_filter),
+                wait=self.wait_result_from_api,
+            )
+            return deleted_count
+
+        except Exception as e:
+            msg = f"Failed to delete documents by filter from Qdrant: {e!s}"
+            raise QdrantStoreError(msg) from e
+
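`delete_by_filter` counts the matching points and only then deletes them, so the returned number can drift if concurrent writes land between the two calls; like the update methods further down, it is not atomic. A usage sketch (the filter syntax follows Haystack's metadata-filtering convention):

```python
from haystack.dataclasses import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
store.write_documents([
    Document(content="breaking", embedding=[0.1] * 5, meta={"category": "news"}),
    Document(content="analysis", embedding=[0.2] * 5, meta={"category": "opinion"}),
])

deleted = store.delete_by_filter(
    filters={"field": "meta.category", "operator": "==", "value": "news"}
)
print(deleted)  # 1
```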
+    async def delete_by_filter_async(self, filters: dict[str, Any]) -> int:
         """
-
+        Asynchronously deletes all documents that match the provided filters.
 
-        :param filters:
-
+        :param filters: The filters to apply to select documents for deletion.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+
+        :returns:
+            The number of documents deleted.
         """
+        await self._initialize_async_client()
+        assert self._async_client is not None
 
-
-
+        try:
+            qdrant_filter = convert_filters_to_qdrant(filters)
+            if qdrant_filter is None:
+                return 0
 
-
-
-
-            records, next_offset = self.client.scroll(
-                collection_name=index,
-                scroll_filter=qdrant_filters,
-                limit=self.scroll_size,
-                offset=next_offset,
-                with_payload=True,
-                with_vectors=True,
+            count_response = await self._async_client.count(
+                collection_name=self.index,
+                count_filter=qdrant_filter,
             )
-
-
+            deleted_count = count_response.count
+
+            await self._async_client.delete(
+                collection_name=self.index,
+                points_selector=rest.FilterSelector(filter=qdrant_filter),
+                wait=self.wait_result_from_api,
             )
+            return deleted_count
 
-
-
-
-            )
+        except Exception as e:
+            msg = f"Failed to delete documents by filter from Qdrant: {e!s}"
+            raise QdrantStoreError(msg) from e
 
-
-
-        ids: List[str],
-        index: Optional[str] = None,
-    ) -> List[Document]:
+    @staticmethod
+    def _check_stop_scrolling(next_offset: Any) -> bool:
         """
-
+        Checks if scrolling should stop based on the next_offset value.
 
-        :param
-
-        :param index:
-            The name of the index to retrieve documents from.
-        :returns:
-            A list of documents.
+        :param next_offset: The offset returned from the scroll operation.
+        :returns: True if scrolling should stop, False otherwise.
         """
-
+        return next_offset is None or (
+            hasattr(next_offset, "num")
+            and hasattr(next_offset, "uuid")
+            and next_offset.num == 0
+            and next_offset.uuid == ""
+        )
 
-
+    @staticmethod
+    def _metadata_fields_info_from_schema(payload_schema: dict[str, Any]) -> dict[str, str]:
+        """Build field name -> type dict from Qdrant payload_schema. Used by get_metadata_fields_info (sync/async)."""
+        fields_info: dict[str, str] = {}
+        for field_name, field_config in payload_schema.items():
+            if hasattr(field_config, "data_type"):
+                fields_info[field_name] = str(field_config.data_type)
+            else:
+                fields_info[field_name] = "unknown"
+        return fields_info
+
+    @staticmethod
+    def _process_records_min_max(
+        records: list[Any], metadata_field: str, min_value: Any, max_value: Any
+    ) -> tuple[Any, Any]:
+        """Update min/max from a batch of Qdrant records. Used by get_metadata_field_min_max (sync/async)."""
+        for record in records:
+            if record.payload and "meta" in record.payload:
+                meta = record.payload["meta"]
+                if metadata_field in meta:
+                    value = meta[metadata_field]
+                    if value is not None:
+                        if min_value is None or value < min_value:
+                            min_value = value
+                        if max_value is None or value > max_value:
+                            max_value = value
+        return min_value, max_value
+
+    @staticmethod
+    def _process_records_count_unique(
+        records: list[Any], metadata_fields: list[str], unique_values_by_field: dict[str, set[Any]]
+    ) -> None:
+        """
+        Update unique_values_by_field from a batch of Qdrant records.
 
-
-
-
-
-
-
+        Used by count_unique_metadata_by_filter (sync/async).
+        """
+        for record in records:
+            if record.payload and "meta" in record.payload:
+                meta = record.payload["meta"]
+                for field in metadata_fields:
+                    if field in meta:
+                        value = meta[field]
+                        if value is not None:
+                            if isinstance(value, (list, dict)):
+                                unique_values_by_field[field].add(str(value))
+                            else:
+                                unique_values_by_field[field].add(value)
+
+    @staticmethod
+    def _process_records_unique_values(
+        records: list[Any],
+        metadata_field: str,
+        unique_values: list[Any],
+        unique_values_set: set[Any],
+        offset: int,
+        limit: int,
+    ) -> bool:
+        """Collect unique values from a batch of records. Returns True when len(unique_values) >= offset + limit."""
+        for record in records:
+            if record.payload and "meta" in record.payload:
+                meta = record.payload["meta"]
+                if metadata_field in meta:
+                    value = meta[metadata_field]
+                    if value is not None:
+                        hashable_value = str(value) if isinstance(value, (list, dict)) else value
+                        if hashable_value not in unique_values_set:
+                            unique_values_set.add(hashable_value)
+                            unique_values.append(value)
+                            if len(unique_values) >= offset + limit:
+                                return True
+        return False
+
+    @staticmethod
+    def _create_updated_point_from_record(record: Any, meta: dict[str, Any]) -> rest.PointStruct:
+        """
+        Creates an updated PointStruct from a Qdrant record with merged metadata.
+
+        :param record: The Qdrant record to update.
+        :param meta: The metadata fields to merge with existing metadata.
+        :returns: A PointStruct with updated metadata and preserved vectors.
+        """
+        # merge existing payload with new metadata
+        # Metadata is stored under the "meta" key in the payload
+        updated_payload = dict(record.payload or {})
+        if "meta" not in updated_payload:
+            updated_payload["meta"] = {}
+        updated_payload["meta"].update(meta)
+
+        # create updated point preserving vectors
+        # Type cast needed because record.vector type doesn't include all PointStruct vector types
+        vector_value = record.vector if record.vector is not None else {}
+        return rest.PointStruct(
+            id=record.id,
+            vector=cast(Any, vector_value),
+            payload=updated_payload,
         )
 
-
-
-
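`_check_stop_scrolling` centralizes the loop-exit test shared by every scroll-based method below: Qdrant's scroll API hands back the offset of the next page, and either `None` or a zero-valued point id (`num == 0` and `uuid == ""`) means the scan is done. A self-contained sketch of the same pattern against a bare `qdrant_client` (names here are illustrative, not from the package):

```python
from typing import Any

from qdrant_client import QdrantClient


def scroll_all_payloads(client: QdrantClient, collection: str, page_size: int = 10_000) -> list[dict[str, Any]]:
    """Drain a collection page by page, using the same stop test as _check_stop_scrolling."""
    payloads: list[dict[str, Any]] = []
    next_offset = None
    while True:
        records, next_offset = client.scroll(
            collection_name=collection,
            limit=page_size,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )
        payloads.extend(r.payload or {} for r in records)
        stop = next_offset is None or (
            hasattr(next_offset, "num")
            and hasattr(next_offset, "uuid")
            and next_offset.num == 0
            and next_offset.uuid == ""
        )
        if stop:
            break
    return payloads
```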
+    def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
+        """
+        Updates the metadata of all documents that match the provided filters.
+
+        **Note**: This operation is not atomic. Documents matching the filter are fetched first,
+        then updated. If documents are modified between the fetch and update operations,
+        those changes may be lost.
+
+        :param filters: The filters to apply to select documents for updating.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+        :param meta: The metadata fields to update. This will be merged with existing metadata.
+
+        :returns:
+            The number of documents updated.
+        """
+        self._initialize_client()
+        assert self._client is not None
+
+        try:
+            qdrant_filter = convert_filters_to_qdrant(filters)
+            if qdrant_filter is None:
+                return 0
+
+            # get all matching documents using scroll
+            updated_points = []
+            next_offset = None
+
+            while True:
+                records, next_offset = self._client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=qdrant_filter,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=True,
+                )
+
+                # update payload for each record
+                for record in records:
+                    updated_points.append(self._create_updated_point_from_record(record, meta))
+
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            if not updated_points:
+                return 0
+
+            # upsert updated points back in batches
+            for batch in get_batches_from_generator(updated_points, self.write_batch_size):
+                self._client.upsert(
+                    collection_name=self.index,
+                    points=list(batch),
+                    wait=self.wait_result_from_api,
+                )
+
+            logger.info(
+                "Updated {n_docs} documents in collection '{name}' using filters.",
+                n_docs=len(updated_points),
+                name=self.index,
             )
-
+            return len(updated_points)
+        except Exception as e:
+            msg = f"Failed to update documents by filter in Qdrant: {e!s}"
+            raise QdrantStoreError(msg) from e
 
-    def
-        self,
-        query_sparse_embedding: SparseEmbedding,
-        filters: Optional[Union[Dict[str, Any], rest.Filter]] = None,
-        top_k: int = 10,
-        scale_score: bool = False,
-        return_embedding: bool = False,
-        score_threshold: Optional[float] = None,
-        group_by: Optional[str] = None,
-        group_size: Optional[int] = None,
-    ) -> List[Document]:
+    async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
         """
-
+        Asynchronously updates the metadata of all documents that match the provided filters.
 
-
-
-
-            groups to return.
-        :param scale_score: Whether to scale the scores of the retrieved documents.
-        :param return_embedding: Whether to return the embeddings of the retrieved documents.
-        :param score_threshold: A minimal score threshold for the result.
-            Score of the returned result might be higher or smaller than the threshold
-            depending on the Distance function used.
-            E.g. for cosine similarity only higher scores will be returned.
-        :param group_by: Payload field to group by, must be a string or number field. If the field contains more than 1
-            value, all values will be used for grouping. One point can be in multiple groups.
-        :param group_size: Maximum amount of points to return per group. Default is 3.
+        **Note**: This operation is not atomic. Documents matching the filter are fetched first,
+        then updated. If documents are modified between the fetch and update operations,
+        those changes may be lost.
 
-        :
+        :param filters: The filters to apply to select documents for updating.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+        :param meta: The metadata fields to update. This will be merged with existing metadata.
 
-        :
-
+        :returns:
+            The number of documents updated.
         """
+        await self._initialize_async_client()
+        assert self._async_client is not None
 
-
-
-
-
+        try:
+            qdrant_filter = convert_filters_to_qdrant(filters)
+            if qdrant_filter is None:
+                return 0
+
+            updated_points = []
+            next_offset = None
+
+            while True:
+                records, next_offset = await self._async_client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=qdrant_filter,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=True,
+                )
+
+                # update payload for each record
+                for record in records:
+                    updated_points.append(self._create_updated_point_from_record(record, meta))
+
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            if not updated_points:
+                return 0
+
+            # upsert updated points back in batches
+            for batch in get_batches_from_generator(updated_points, self.write_batch_size):
+                await self._async_client.upsert(
+                    collection_name=self.index,
+                    points=list(batch),
+                    wait=self.wait_result_from_api,
+                )
+
+            logger.info(
+                "Updated {n_docs} documents in collection '{name}' using filters.",
+                n_docs=len(updated_points),
+                name=self.index,
             )
-
+            return len(updated_points)
+        except Exception as e:
+            msg = f"Failed to update documents by filter in Qdrant: {e!s}"
+            raise QdrantStoreError(msg) from e
+
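`update_by_filter` merges `meta` into each matching point's existing `meta` payload via `_create_updated_point_from_record`, so keys you don't mention survive; and, as its docstring warns, the scroll-then-upsert sequence is not atomic. Usage sketch:

```python
from haystack.dataclasses import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
store.write_documents([
    Document(content="breaking", embedding=[0.1] * 5, meta={"category": "news"}),
])

updated = store.update_by_filter(
    filters={"field": "meta.category", "operator": "==", "value": "news"},
    meta={"reviewed": True},  # merged into existing meta; "category" is kept
)
print(updated)  # 1
```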
+    def delete_all_documents(self, recreate_index: bool = False) -> None:
+        """
+        Deletes all documents from the document store.
+
+        :param recreate_index: Whether to recreate the index after deleting all documents.
+        """
+
+        self._initialize_client()
+        assert self._client is not None
+
+        if recreate_index:
+            # get current collection config as json
+            collection_info = self._client.get_collection(collection_name=self.index)
+            info_json = collection_info.model_dump()
+
+            # deal with the Optional use_sparse_embeddings
+            sparse_vectors = info_json["config"]["params"]["sparse_vectors"]
+            use_sparse_embeddings = True if sparse_vectors else False
+
+            # deal with the Optional sparse_idf
+            hnsw_config = info_json["config"]["params"]["vectors"].get("config", {}).get("hnsw_config", None)
+            sparse_idf = True if use_sparse_embeddings and hnsw_config else False
+
+            # recreate collection
+            self._set_up_collection(
+                collection_name=self.index,
+                embedding_dim=info_json["config"]["params"]["vectors"]["size"],
+                recreate_collection=True,
+                similarity=info_json["config"]["params"]["vectors"]["distance"].lower(),
+                use_sparse_embeddings=use_sparse_embeddings,
+                sparse_idf=sparse_idf,
+                on_disk=info_json["config"]["hnsw_config"]["on_disk"],
+                payload_fields_to_index=info_json["payload_schema"],
+            )
+
+        else:
+            try:
+                self._client.delete(
+                    collection_name=self.index,
+                    points_selector=rest.FilterSelector(
+                        filter=rest.Filter(
+                            must=[],
+                        )
+                    ),
+                    wait=self.wait_result_from_api,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Error {e} when calling QdrantDocumentStore.delete_all_documents()",
+                )
+
+    async def delete_all_documents_async(self, recreate_index: bool = False) -> None:
+        """
+        Asynchronously deletes all documents from the document store.
+
+        :param recreate_index: Whether to recreate the index after deleting all documents.
+        """
+
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        if recreate_index:
+            # get current collection config as json
+            collection_info = await self._async_client.get_collection(collection_name=self.index)
+            info_json = collection_info.model_dump()
+
+            # deal with the Optional use_sparse_embeddings
+            sparse_vectors = info_json["config"]["params"]["sparse_vectors"]
+            use_sparse_embeddings = True if sparse_vectors else False
+
+            # deal with the Optional sparse_idf
+            hnsw_config = info_json["config"]["params"]["vectors"].get("config", {}).get("hnsw_config", None)
+            sparse_idf = True if use_sparse_embeddings and hnsw_config else False
+
+            # recreate collection
+            await self._set_up_collection_async(
+                collection_name=self.index,
+                embedding_dim=info_json["config"]["params"]["vectors"]["size"],
+                recreate_collection=True,
+                similarity=info_json["config"]["params"]["vectors"]["distance"].lower(),
+                use_sparse_embeddings=use_sparse_embeddings,
+                sparse_idf=sparse_idf,
+                on_disk=info_json["config"]["hnsw_config"]["on_disk"],
+                payload_fields_to_index=info_json["payload_schema"],
+            )
+
+        else:
+            try:
+                await self._async_client.delete(
+                    collection_name=self.index,
+                    points_selector=rest.FilterSelector(
+                        filter=rest.Filter(
+                            must=[],
+                        )
+                    ),
+                    wait=self.wait_result_from_api,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Error {e} when calling QdrantDocumentStore.delete_all_documents_async()",
+                )
+
+    def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
+        """
+        Returns the number of documents that match the provided filters.
+
+        :param filters: The filters to apply to count documents.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+
+        :returns: The number of documents that match the filters.
+        """
+        self._initialize_client()
+        assert self._client is not None
+
+        qdrant_filter = convert_filters_to_qdrant(filters)
+        try:
+            response = self._client.count(
+                collection_name=self.index,
+                count_filter=qdrant_filter,
+            )
+            return response.count
+        except (UnexpectedResponse, ValueError) as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.count_documents_by_filter()")
+            return 0
+
+    async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int:
+        """
+        Asynchronously returns the number of documents that match the provided filters.
+
+        :param filters: The filters to apply to select documents for counting.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+
+        :returns:
+            The number of documents that match the filters.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        qdrant_filter = convert_filters_to_qdrant(filters)
+        try:
+            response = await self._async_client.count(
+                collection_name=self.index,
+                count_filter=qdrant_filter,
+            )
+            return response.count
+        except (UnexpectedResponse, ValueError) as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.count_documents_by_filter_async()")
+            return 0
+
+    def get_metadata_fields_info(self) -> dict[str, str]:
+        """
+        Returns the information about the fields from the collection.
+
+        :returns:
+            A dictionary mapping field names to their types (e.g., {"field_name": "integer"}).
+        """
+        self._initialize_client()
+        assert self._client is not None
+
+        try:
+            collection_info = self._client.get_collection(self.index)
+            payload_schema = collection_info.payload_schema or {}
+            return self._metadata_fields_info_from_schema(payload_schema)
+        except (UnexpectedResponse, ValueError) as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_fields_info()")
+            return {}
+
+    async def get_metadata_fields_info_async(self) -> dict[str, str]:
+        """
+        Asynchronously returns the information about the fields from the collection.
+
+        :returns:
+            A dictionary mapping field names to their types (e.g., {"field_name": "integer"}).
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        try:
+            collection_info = await self._async_client.get_collection(self.index)
+            payload_schema = collection_info.payload_schema or {}
+            return self._metadata_fields_info_from_schema(payload_schema)
+        except (UnexpectedResponse, ValueError) as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_fields_info_async()")
+            return {}
+
+    def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
+        """
+        Returns the minimum and maximum values for the given metadata field.
+
+        :param metadata_field: The metadata field key (inside ``meta``) to get the minimum and maximum values for.
+
+        :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
+            metadata field across all documents. Returns an empty dict if no documents have the field.
+        """
+        self._initialize_client()
+        assert self._client is not None
+
+        try:
+            min_value: Any = None
+            max_value: Any = None
+            next_offset = None
+
+            while True:
+                records, next_offset = self._client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=None,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=False,
+                )
+                min_value, max_value = self._process_records_min_max(records, metadata_field, min_value, max_value)
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            if min_value is not None and max_value is not None:
+                return {"min": min_value, "max": max_value}
+            return {}
+        except Exception as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_min_max()")
+            return {}
+
+    async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, Any]:
+        """
+        Asynchronously returns the minimum and maximum values for the given metadata field.
+
+        :param metadata_field: The metadata field key (inside ``meta``) to get the minimum and maximum values for.
+
+        :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
+            metadata field across all documents. Returns an empty dict if no documents have the field.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        try:
+            min_value: Any = None
+            max_value: Any = None
+            next_offset = None
+
+            while True:
+                records, next_offset = await self._async_client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=None,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=False,
+                )
+                min_value, max_value = self._process_records_min_max(records, metadata_field, min_value, max_value)
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            if min_value is not None and max_value is not None:
+                return {"min": min_value, "max": max_value}
+            return {}
+        except Exception as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_min_max_async()")
+            return {}
+
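These metadata helpers are computed client-side: they scroll every payload in the collection (`with_vectors=False`) rather than asking Qdrant for an aggregate, so their cost grows linearly with collection size. Usage sketch, assuming documents carry a numeric `year` meta field:

```python
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
bounds = store.get_metadata_field_min_max("year")  # {} on an empty collection
print(bounds.get("min"), bounds.get("max"))
```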
+    def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
+        """
+        Returns the number of unique values for each specified metadata field among documents that match the filters.
+
+        :param filters: The filters to restrict the documents considered.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+        :param metadata_fields: List of metadata field keys (inside ``meta``) to count unique values for.
+
+        :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
+            documents.
+        """
+        self._initialize_client()
+        assert self._client is not None
+
+        qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+        unique_values_by_field: dict[str, set[Any]] = {field: set() for field in metadata_fields}
+
+        try:
+            next_offset = None
+            while True:
+                records, next_offset = self._client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=qdrant_filter,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=False,
+                )
+                self._process_records_count_unique(records, metadata_fields, unique_values_by_field)
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            return {field: len(unique_values_by_field[field]) for field in metadata_fields}
+        except Exception as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.count_unique_metadata_by_filter()")
+            return dict.fromkeys(metadata_fields, 0)
+
+    async def count_unique_metadata_by_filter_async(
+        self, filters: dict[str, Any], metadata_fields: list[str]
+    ) -> dict[str, int]:
+        """
+        Asynchronously returns the number of unique values for each specified metadata field among documents that
+        match the filters.
+
+        :param filters: The filters to restrict the documents considered.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+        :param metadata_fields: List of metadata field keys (inside ``meta``) to count unique values for.
+
+        :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
+            documents.
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+        unique_values_by_field: dict[str, set[Any]] = {field: set() for field in metadata_fields}
+
+        try:
+            next_offset = None
+            while True:
+                records, next_offset = await self._async_client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=qdrant_filter,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=False,
+                )
+                self._process_records_count_unique(records, metadata_fields, unique_values_by_field)
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            return {field: len(unique_values_by_field[field]) for field in metadata_fields}
+        except Exception as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.count_unique_metadata_by_filter_async()")
+            return dict.fromkeys(metadata_fields, 0)
+
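Because `_process_records_count_unique` stringifies `list` and `dict` values before adding them to the per-field sets, two structurally equal lists count as a single unique value. Usage sketch:

```python
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(":memory:", recreate_index=True, embedding_dim=5)
counts = store.count_unique_metadata_by_filter(
    filters={"field": "meta.category", "operator": "==", "value": "news"},
    metadata_fields=["author", "year"],
)
print(counts)  # e.g. {"author": 12, "year": 3}
```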
+    def get_metadata_field_unique_values(
+        self, metadata_field: str, filters: dict[str, Any] | None = None, limit: int = 100, offset: int = 0
+    ) -> list[Any]:
+        """
+        Returns unique values for a metadata field, with optional filters and offset/limit pagination.
+
+        Unique values are ordered by first occurrence during scroll. Pagination is offset-based over that order.
+
+        :param metadata_field: The metadata field key (inside ``meta``) to get unique values for.
+        :param filters: Optional filters to restrict the documents considered.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+        :param limit: Maximum number of unique values to return per page. Defaults to 100.
+        :param offset: Number of unique values to skip (for pagination). Defaults to 0.
+
+        :returns: A list of unique values for the field (at most ``limit`` items, starting at ``offset``).
+        """
+        self._initialize_client()
+        assert self._client is not None
+
+        qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+        unique_values: list[Any] = []
+        unique_values_set: set[Any] = set()
+
+        try:
+            next_offset = None
+            while len(unique_values) < offset + limit:
+                records, next_offset = self._client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=qdrant_filter,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=False,
+                )
+                if self._process_records_unique_values(
+                    records, metadata_field, unique_values, unique_values_set, offset, limit
+                ):
+                    break
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            return unique_values[offset : offset + limit]
+        except Exception as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_unique_values()")
+            return []
+
+    async def get_metadata_field_unique_values_async(
+        self, metadata_field: str, filters: dict[str, Any] | None = None, limit: int = 100, offset: int = 0
+    ) -> list[Any]:
+        """
+        Asynchronously returns unique values for a metadata field, with optional filters and offset/limit pagination.
+
+        Unique values are ordered by first occurrence during scroll. Pagination is offset-based over that order.
+
+        :param metadata_field: The metadata field key (inside ``meta``) to get unique values for.
+        :param filters: Optional filters to restrict the documents considered.
+            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
+        :param limit: Maximum number of unique values to return per page. Defaults to 100.
+        :param offset: Number of unique values to skip (for pagination). Defaults to 0.
+
+        :returns: A list of unique values for the field (at most ``limit`` items, starting at ``offset``).
+        """
+        await self._initialize_async_client()
+        assert self._async_client is not None
+
+        qdrant_filter = convert_filters_to_qdrant(filters) if filters else None
+        unique_values: list[Any] = []
+        unique_values_set: set[Any] = set()
+
+        try:
+            next_offset = None
+            while len(unique_values) < offset + limit:
+                records, next_offset = await self._async_client.scroll(
+                    collection_name=self.index,
+                    scroll_filter=qdrant_filter,
+                    limit=self.scroll_size,
+                    offset=next_offset,
+                    with_payload=True,
+                    with_vectors=False,
+                )
+                if self._process_records_unique_values(
+                    records, metadata_field, unique_values, unique_values_set, offset, limit
+                ):
+                    break
+                if self._check_stop_scrolling(next_offset):
+                    break
+
+            return unique_values[offset : offset + limit]
+        except Exception as e:
+            logger.warning(f"Error {e} when calling QdrantDocumentStore.get_metadata_field_unique_values_async()")
+            return []
+
@classmethod
|
|
1261
|
+
def from_dict(cls, data: dict[str, Any]) -> "QdrantDocumentStore":
|
|
1262
|
+
"""
|
|
1263
|
+
Deserializes the component from a dictionary.
|
|
1264
|
+
|
|
1265
|
+
:param data:
|
|
1266
|
+
The dictionary to deserialize from.
|
|
1267
|
+
:returns:
|
|
1268
|
+
The deserialized component.
|
|
1269
|
+
"""
|
|
1270
|
+
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
|
|
1271
|
+
return default_from_dict(cls, data)
|
|
1272
|
+
|
|
1273
|
+
def to_dict(self) -> dict[str, Any]:
|
|
1274
|
+
"""
|
|
1275
|
+
Serializes the component to a dictionary.
|
|
1276
|
+
|
|
1277
|
+
:returns:
|
|
1278
|
+
Dictionary with serialized data.
|
|
1279
|
+
"""
|
|
1280
|
+
params = inspect.signature(self.__init__).parameters # type: ignore
|
|
1281
|
+
# All the __init__ params must be set as attributes
|
|
1282
|
+
# Set as init_params without default values
|
|
1283
|
+
init_params = {k: getattr(self, k) for k in params}
|
|
1284
|
+
init_params["api_key"] = self.api_key.to_dict() if self.api_key else None
|
|
1285
|
+
return default_to_dict(
|
|
1286
|
+
self,
|
|
1287
|
+
**init_params,
|
|
1288
|
+
)
|
|
1289
|
+
|
|
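`to_dict` reads every `__init__` parameter back from a same-named attribute (hence the comment above), and `from_dict` re-wraps the `api_key` secret before delegating to `default_from_dict`. A minimal round-trip sketch, assuming a local in-memory store so no API key is involved:

from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(location=":memory:", index="Document")
data = store.to_dict()
# data == {"type": "haystack_integrations...QdrantDocumentStore", "init_parameters": {...}}
restored = QdrantDocumentStore.from_dict(data)
assert restored.index == store.index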
1290
|
+
def _get_documents_generator(
|
|
1291
|
+
self,
|
|
1292
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1293
|
+
) -> Generator[Document, None, None]:
|
|
1294
|
+
"""
|
|
1295
|
+
Returns a generator that yields documents from Qdrant based on the provided filters.
|
|
1296
|
+
|
|
1297
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1298
|
+
:returns: A generator that yields documents retrieved from Qdrant.
|
|
1299
|
+
"""
|
|
1300
|
+
|
|
1301
|
+
self._initialize_client()
|
|
1302
|
+
assert self._client is not None
|
|
1303
|
+
|
|
1304
|
+
index = self.index
|
|
1305
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
1306
|
+
|
|
1307
|
+
next_offset = None
|
|
1308
|
+
stop_scrolling = False
|
|
1309
|
+
while not stop_scrolling:
|
|
1310
|
+
records, next_offset = self._client.scroll(
|
|
1311
|
+
collection_name=index,
|
|
1312
|
+
scroll_filter=qdrant_filters,
|
|
1313
|
+
limit=self.scroll_size,
|
|
1314
|
+
offset=next_offset,
|
|
1315
|
+
with_payload=True,
|
|
1316
|
+
with_vectors=True,
|
|
1317
|
+
)
|
|
1318
|
+
stop_scrolling = next_offset is None or (
|
|
1319
|
+
hasattr(next_offset, "num")
|
|
1320
|
+
and hasattr(next_offset, "uuid")
|
|
1321
|
+
and next_offset.num == 0
|
|
1322
|
+
and next_offset.uuid == ""
|
|
1323
|
+
) # PointId always has num and uuid
|
|
1324
|
+
|
|
1325
|
+
for record in records:
|
|
1326
|
+
yield convert_qdrant_point_to_haystack_document(
|
|
1327
|
+
record, use_sparse_embeddings=self.use_sparse_embeddings
|
|
1328
|
+
)
|
|
1329
|
+
|
|
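The generator scrolls in pages of `scroll_size` and stops when Qdrant returns no next offset, or when gRPC hands back a zero-valued `PointId` (`num == 0`, `uuid == ""`) instead of `None`. A standalone sketch of the same scroll loop against the raw client, assuming a collection named "Document" already exists:

from qdrant_client import QdrantClient

client = QdrantClient(location=":memory:")
next_offset = None
while True:
    records, next_offset = client.scroll(
        collection_name="Document",
        limit=64,
        offset=next_offset,
        with_payload=True,
        with_vectors=True,
    )
    for record in records:
        ...  # convert each record to a Document here
    if next_offset is None:  # the store additionally treats a zero PointId as "done"
        break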
1330
|
+
async def _get_documents_generator_async(
|
|
1331
|
+
self,
|
|
1332
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1333
|
+
) -> AsyncGenerator[Document, None]:
|
|
1334
|
+
"""
|
|
1335
|
+
Returns an asynchronous generator that yields documents from Qdrant based on the provided filters.
|
|
1336
|
+
|
|
1337
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1338
|
+
:returns: An asynchronous generator that yields documents retrieved from Qdrant.
|
|
1339
|
+
"""
|
|
1340
|
+
|
|
1341
|
+
await self._initialize_async_client()
|
|
1342
|
+
assert self._async_client is not None
|
|
1343
|
+
|
|
1344
|
+
index = self.index
|
|
1345
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
1346
|
+
|
|
1347
|
+
next_offset = None
|
|
1348
|
+
stop_scrolling = False
|
|
1349
|
+
while not stop_scrolling:
|
|
1350
|
+
records, next_offset = await self._async_client.scroll(
|
|
1351
|
+
collection_name=index,
|
|
1352
|
+
scroll_filter=qdrant_filters,
|
|
1353
|
+
limit=self.scroll_size,
|
|
1354
|
+
offset=next_offset,
|
|
1355
|
+
with_payload=True,
|
|
1356
|
+
with_vectors=True,
|
|
1357
|
+
)
|
|
1358
|
+
stop_scrolling = next_offset is None or (
|
|
1359
|
+
hasattr(next_offset, "num")
|
|
1360
|
+
and hasattr(next_offset, "uuid")
|
|
1361
|
+
and next_offset.num == 0
|
|
1362
|
+
and next_offset.uuid == ""
|
|
1363
|
+
) # PointId always has num and uuid
|
|
1364
|
+
|
|
1365
|
+
for record in records:
|
|
1366
|
+
yield convert_qdrant_point_to_haystack_document(
|
|
1367
|
+
record, use_sparse_embeddings=self.use_sparse_embeddings
|
|
1368
|
+
)
|
|
1369
|
+
|
|
1370
|
+
def get_documents_by_id(
|
|
1371
|
+
self,
|
|
1372
|
+
ids: list[str],
|
|
1373
|
+
) -> list[Document]:
|
|
1374
|
+
"""
|
|
1375
|
+
Retrieves documents from Qdrant by their IDs.
|
|
1376
|
+
|
|
1377
|
+
:param ids:
|
|
1378
|
+
A list of document IDs to retrieve.
|
|
1379
|
+
:returns:
|
|
1380
|
+
A list of documents.
|
|
1381
|
+
"""
|
|
1382
|
+
documents: list[Document] = []
|
|
1383
|
+
|
|
1384
|
+
self._initialize_client()
|
|
1385
|
+
assert self._client is not None
|
|
1386
|
+
|
|
1387
|
+
ids = [convert_id(_id) for _id in ids]
|
|
1388
|
+
records = self._client.retrieve(
|
|
1389
|
+
collection_name=self.index,
|
|
1390
|
+
ids=ids,
|
|
1391
|
+
with_payload=True,
|
|
1392
|
+
with_vectors=True,
|
|
1393
|
+
)
|
|
1394
|
+
|
|
1395
|
+
for record in records:
|
|
1396
|
+
documents.append(
|
|
1397
|
+
convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
1398
|
+
)
|
|
1399
|
+
return documents
|
|
1400
|
+
|
|
1401
|
+
async def get_documents_by_id_async(
|
|
1402
|
+
self,
|
|
1403
|
+
ids: list[str],
|
|
1404
|
+
) -> list[Document]:
|
|
1405
|
+
"""
|
|
1406
|
+
Retrieves documents from Qdrant by their IDs.
|
|
1407
|
+
|
|
1408
|
+
:param ids:
|
|
1409
|
+
A list of document IDs to retrieve.
|
|
1410
|
+
:returns:
|
|
1411
|
+
A list of documents.
|
|
1412
|
+
"""
|
|
1413
|
+
documents: list[Document] = []
|
|
1414
|
+
|
|
1415
|
+
await self._initialize_async_client()
|
|
1416
|
+
assert self._async_client is not None
|
|
1417
|
+
|
|
1418
|
+
ids = [convert_id(_id) for _id in ids]
|
|
1419
|
+
records = await self._async_client.retrieve(
|
|
1420
|
+
collection_name=self.index,
|
|
1421
|
+
ids=ids,
|
|
1422
|
+
with_payload=True,
|
|
1423
|
+
with_vectors=True,
|
|
1424
|
+
)
|
|
1425
|
+
|
|
1426
|
+
for record in records:
|
|
1427
|
+
documents.append(
|
|
1428
|
+
convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
1429
|
+
)
|
|
1430
|
+
return documents
|
|
1431
|
+
|
|
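Both ID-lookup variants convert Haystack document IDs into Qdrant point IDs via `convert_id` before calling `retrieve`. A minimal sync sketch (the async method mirrors it on the async client):

from haystack.dataclasses import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(location=":memory:", embedding_dim=4)
doc = Document(content="hello", embedding=[0.1, 0.2, 0.3, 0.4])
store.write_documents([doc])

retrieved = store.get_documents_by_id([doc.id])
assert retrieved[0].content == "hello"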
1432
|
+
def _query_by_sparse(
|
|
1433
|
+
self,
|
|
1434
|
+
query_sparse_embedding: SparseEmbedding,
|
|
1435
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1436
|
+
top_k: int = 10,
|
|
1437
|
+
scale_score: bool = False,
|
|
1438
|
+
return_embedding: bool = False,
|
|
1439
|
+
score_threshold: float | None = None,
|
|
1440
|
+
group_by: str | None = None,
|
|
1441
|
+
group_size: int | None = None,
|
|
1442
|
+
) -> list[Document]:
|
|
1443
|
+
"""
|
|
1444
|
+
Queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
1445
|
+
|
|
1446
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
1447
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1448
|
+
:param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
|
|
1449
|
+
groups to return.
|
|
1450
|
+
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
1451
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
1452
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
1453
|
+
Score of the returned result might be higher or lower than the threshold
|
|
1454
|
+
depending on the Distance function used.
|
|
1455
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
1456
|
+
:param group_by: Payload field to group by; it must be a string or number field. If the field contains more than 1
|
|
1457
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
1458
|
+
:param group_size: Maximum number of points to return per group. Default is 3.
|
|
1459
|
+
|
|
1460
|
+
:returns: List of documents that are most similar to `query_sparse_embedding`.
|
|
1461
|
+
|
|
1462
|
+
:raises QdrantStoreError:
|
|
1463
|
+
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
1464
|
+
"""
|
|
1465
|
+
self._initialize_client()
|
|
1466
|
+
assert self._client is not None
|
|
1467
|
+
|
|
1468
|
+
if not self.use_sparse_embeddings:
|
|
1469
|
+
message = (
|
|
1470
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
1471
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
1472
|
+
)
|
|
1473
|
+
raise QdrantStoreError(message)
|
|
541
1474
|
|
|
542
1475
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
543
1476
|
query_indices = query_sparse_embedding.indices
|
|
544
1477
|
query_values = query_sparse_embedding.values
|
|
545
1478
|
if group_by:
|
|
546
|
-
groups = self.
|
|
1479
|
+
groups = self._client.query_points_groups(
|
|
547
1480
|
collection_name=self.index,
|
|
548
1481
|
query=rest.SparseVector(
|
|
549
1482
|
indices=query_indices,
|
|
@@ -553,21 +1486,13 @@ class QdrantDocumentStore:
|
|
|
553
1486
|
query_filter=qdrant_filters,
|
|
554
1487
|
limit=top_k,
|
|
555
1488
|
group_by=group_by,
|
|
556
|
-
group_size=group_size,
|
|
1489
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
557
1490
|
with_vectors=return_embedding,
|
|
558
1491
|
score_threshold=score_threshold,
|
|
559
1492
|
).groups
|
|
560
|
-
|
|
561
|
-
[
|
|
562
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
563
|
-
for group in groups
|
|
564
|
-
for point in group.hits
|
|
565
|
-
]
|
|
566
|
-
if groups
|
|
567
|
-
else []
|
|
568
|
-
)
|
|
1493
|
+
return self._process_group_results(groups)
|
|
569
1494
|
else:
|
|
570
|
-
points = self.
|
|
1495
|
+
points = self._client.query_points(
|
|
571
1496
|
collection_name=self.index,
|
|
572
1497
|
query=rest.SparseVector(
|
|
573
1498
|
indices=query_indices,
|
|
@@ -579,28 +1504,19 @@ class QdrantDocumentStore:
|
|
|
579
1504
|
with_vectors=return_embedding,
|
|
580
1505
|
score_threshold=score_threshold,
|
|
581
1506
|
).points
|
|
582
|
-
|
|
583
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
584
|
-
for point in points
|
|
585
|
-
]
|
|
586
|
-
if scale_score:
|
|
587
|
-
for document in results:
|
|
588
|
-
score = document.score
|
|
589
|
-
score = float(1 / (1 + np.exp(-score / 100)))
|
|
590
|
-
document.score = score
|
|
591
|
-
return results
|
|
1507
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
592
1508
|
|
|
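`_query_by_sparse` is internal (the sparse-embedding retriever calls it), but a sketch shows the two paths it takes. This assumes a `store` created with `use_sparse_embeddings=True`, documents written with sparse embeddings, and a hypothetical `meta.category` payload field:

from haystack.dataclasses.sparse_embedding import SparseEmbedding

sparse_query = SparseEmbedding(indices=[11, 42, 907], values=[0.6, 0.3, 0.1])

flat = store._query_by_sparse(sparse_query, top_k=5)  # 5 best-matching points
grouped = store._query_by_sparse(                     # 2 groups of up to DEFAULT_GROUP_SIZE hits
    sparse_query, top_k=2, group_by="meta.category"
)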
593
1509
|
def _query_by_embedding(
|
|
594
1510
|
self,
|
|
595
|
-
query_embedding:
|
|
596
|
-
filters:
|
|
1511
|
+
query_embedding: list[float],
|
|
1512
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
597
1513
|
top_k: int = 10,
|
|
598
1514
|
scale_score: bool = False,
|
|
599
1515
|
return_embedding: bool = False,
|
|
600
|
-
score_threshold:
|
|
601
|
-
group_by:
|
|
602
|
-
group_size:
|
|
603
|
-
) ->
|
|
1516
|
+
score_threshold: float | None = None,
|
|
1517
|
+
group_by: str | None = None,
|
|
1518
|
+
group_size: int | None = None,
|
|
1519
|
+
) -> list[Document]:
|
|
604
1520
|
"""
|
|
605
1521
|
Queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
606
1522
|
|
|
@@ -620,30 +1536,26 @@ class QdrantDocumentStore:
|
|
|
620
1536
|
|
|
621
1537
|
:returns: List of documents that are most similar to `query_embedding`.
|
|
622
1538
|
"""
|
|
1539
|
+
self._initialize_client()
|
|
1540
|
+
assert self._client is not None
|
|
1541
|
+
|
|
623
1542
|
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
624
1543
|
if group_by:
|
|
625
|
-
groups = self.
|
|
1544
|
+
groups = self._client.query_points_groups(
|
|
626
1545
|
collection_name=self.index,
|
|
627
1546
|
query=query_embedding,
|
|
628
1547
|
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
629
1548
|
query_filter=qdrant_filters,
|
|
630
1549
|
limit=top_k,
|
|
631
1550
|
group_by=group_by,
|
|
632
|
-
group_size=group_size,
|
|
1551
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
633
1552
|
with_vectors=return_embedding,
|
|
634
1553
|
score_threshold=score_threshold,
|
|
635
1554
|
).groups
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
639
|
-
for group in groups
|
|
640
|
-
for point in group.hits
|
|
641
|
-
]
|
|
642
|
-
if groups
|
|
643
|
-
else []
|
|
644
|
-
)
|
|
1555
|
+
return self._process_group_results(groups)
|
|
1556
|
+
|
|
645
1557
|
else:
|
|
646
|
-
points = self.
|
|
1558
|
+
points = self._client.query_points(
|
|
647
1559
|
collection_name=self.index,
|
|
648
1560
|
query=query_embedding,
|
|
649
1561
|
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
@@ -652,32 +1564,19 @@ class QdrantDocumentStore:
|
|
|
652
1564
|
with_vectors=return_embedding,
|
|
653
1565
|
score_threshold=score_threshold,
|
|
654
1566
|
).points
|
|
655
|
-
|
|
656
|
-
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
657
|
-
for point in points
|
|
658
|
-
]
|
|
659
|
-
|
|
660
|
-
if scale_score:
|
|
661
|
-
for document in results:
|
|
662
|
-
score = document.score
|
|
663
|
-
if self.similarity == "cosine":
|
|
664
|
-
score = (score + 1) / 2
|
|
665
|
-
else:
|
|
666
|
-
score = float(1 / (1 + np.exp(-score / 100)))
|
|
667
|
-
document.score = score
|
|
668
|
-
return results
|
|
1567
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
669
1568
|
|
|
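The dense path mirrors it; `scale_score=True` maps cosine scores from [-1, 1] into [0, 1] inside `_process_query_point_results` (shown further down), while `score_threshold` is applied by Qdrant itself before any scaling. A sketch under the same assumptions as above:

docs = store._query_by_embedding(
    query_embedding=[0.1, 0.2, 0.3, 0.4],  # must match the collection's embedding_dim
    top_k=3,
    scale_score=True,
    score_threshold=0.2,
)
for doc in docs:
    print(doc.id, doc.score)  # cosine scores now lie in [0, 1]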
670
1569
|
def _query_hybrid(
|
|
671
1570
|
self,
|
|
672
|
-
query_embedding:
|
|
1571
|
+
query_embedding: list[float],
|
|
673
1572
|
query_sparse_embedding: SparseEmbedding,
|
|
674
|
-
filters:
|
|
1573
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
675
1574
|
top_k: int = 10,
|
|
676
1575
|
return_embedding: bool = False,
|
|
677
|
-
score_threshold:
|
|
678
|
-
group_by:
|
|
679
|
-
group_size:
|
|
680
|
-
) ->
|
|
1576
|
+
score_threshold: float | None = None,
|
|
1577
|
+
group_by: str | None = None,
|
|
1578
|
+
group_size: int | None = None,
|
|
1579
|
+
) -> list[Document]:
|
|
681
1580
|
"""
|
|
682
1581
|
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
683
1582
|
|
|
@@ -706,6 +1605,10 @@ class QdrantDocumentStore:
|
|
|
706
1605
|
|
|
707
1606
|
# This implementation is based on the code from the Python Qdrant client:
|
|
708
1607
|
# https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
|
|
1608
|
+
|
|
1609
|
+
self._initialize_client()
|
|
1610
|
+
assert self._client is not None
|
|
1611
|
+
|
|
709
1612
|
if not self.use_sparse_embeddings:
|
|
710
1613
|
message = (
|
|
711
1614
|
"You are trying to query using sparse embeddings, but the Document Store "
|
|
@@ -717,7 +1620,7 @@ class QdrantDocumentStore:
|
|
|
717
1620
|
|
|
718
1621
|
try:
|
|
719
1622
|
if group_by:
|
|
720
|
-
groups = self.
|
|
1623
|
+
groups = self._client.query_points_groups(
|
|
721
1624
|
collection_name=self.index,
|
|
722
1625
|
prefetch=[
|
|
723
1626
|
rest.Prefetch(
|
|
@@ -737,13 +1640,13 @@ class QdrantDocumentStore:
|
|
|
737
1640
|
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
738
1641
|
limit=top_k,
|
|
739
1642
|
group_by=group_by,
|
|
740
|
-
group_size=group_size,
|
|
1643
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
741
1644
|
score_threshold=score_threshold,
|
|
742
1645
|
with_payload=True,
|
|
743
1646
|
with_vectors=return_embedding,
|
|
744
1647
|
).groups
|
|
745
1648
|
else:
|
|
746
|
-
points = self.
|
|
1649
|
+
points = self._client.query_points(
|
|
747
1650
|
collection_name=self.index,
|
|
748
1651
|
prefetch=[
|
|
749
1652
|
rest.Prefetch(
|
|
@@ -772,19 +1675,263 @@ class QdrantDocumentStore:
|
|
|
772
1675
|
raise QdrantStoreError(msg) from e
|
|
773
1676
|
|
|
774
1677
|
if group_by:
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
1678
|
+
return self._process_group_results(groups)
|
|
1679
|
+
else:
|
|
1680
|
+
return self._process_query_point_results(points)
|
|
1681
|
+
|
|
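The hybrid path issues a single `query_points` call with two prefetch branches, sparse and dense, fused by Reciprocal Rank Fusion, so a point's final rank comes from its rank in both branches rather than from raw scores. Roughly the same request expressed against the raw client; the vector names are the integration's internal constants and the query values are illustrative:

from qdrant_client import QdrantClient, models

client = QdrantClient(location=":memory:")
response = client.query_points(
    collection_name="Document",  # assumed to exist with named dense and sparse vectors
    prefetch=[
        models.Prefetch(
            query=models.SparseVector(indices=[11, 42], values=[0.6, 0.4]),
            using="text-sparse",  # SPARSE_VECTORS_NAME
        ),
        models.Prefetch(query=[0.1, 0.2, 0.3, 0.4], using="text-dense"),  # DENSE_VECTORS_NAME
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=10,
)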
1682
|
+
async def _query_by_sparse_async(
|
|
1683
|
+
self,
|
|
1684
|
+
query_sparse_embedding: SparseEmbedding,
|
|
1685
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1686
|
+
top_k: int = 10,
|
|
1687
|
+
scale_score: bool = False,
|
|
1688
|
+
return_embedding: bool = False,
|
|
1689
|
+
score_threshold: float | None = None,
|
|
1690
|
+
group_by: str | None = None,
|
|
1691
|
+
group_size: int | None = None,
|
|
1692
|
+
) -> list[Document]:
|
|
1693
|
+
"""
|
|
1694
|
+
Asynchronously queries Qdrant using a sparse embedding and returns the most relevant documents.
|
|
1695
|
+
|
|
1696
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
1697
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1698
|
+
:param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
|
|
1699
|
+
groups to return.
|
|
1700
|
+
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
1701
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
1702
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
1703
|
+
Score of the returned result might be higher or lower than the threshold
|
|
1704
|
+
depending on the Distance function used.
|
|
1705
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
1706
|
+
:param group_by: Payload field to group by; it must be a string or number field. If the field contains more than 1
|
|
1707
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
1708
|
+
:param group_size: Maximum number of points to return per group. Default is 3.
|
|
1709
|
+
|
|
1710
|
+
:returns: List of documents that are most similar to `query_sparse_embedding`.
|
|
1711
|
+
|
|
1712
|
+
:raises QdrantStoreError:
|
|
1713
|
+
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
1714
|
+
"""
|
|
1715
|
+
|
|
1716
|
+
await self._initialize_async_client()
|
|
1717
|
+
assert self._async_client is not None
|
|
1718
|
+
|
|
1719
|
+
if not self.use_sparse_embeddings:
|
|
1720
|
+
message = (
|
|
1721
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
1722
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
1723
|
+
)
|
|
1724
|
+
raise QdrantStoreError(message)
|
|
1725
|
+
|
|
1726
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
1727
|
+
query_indices = query_sparse_embedding.indices
|
|
1728
|
+
query_values = query_sparse_embedding.values
|
|
1729
|
+
if group_by:
|
|
1730
|
+
response = await self._async_client.query_points_groups(
|
|
1731
|
+
collection_name=self.index,
|
|
1732
|
+
query=rest.SparseVector(
|
|
1733
|
+
indices=query_indices,
|
|
1734
|
+
values=query_values,
|
|
1735
|
+
),
|
|
1736
|
+
using=SPARSE_VECTORS_NAME,
|
|
1737
|
+
query_filter=qdrant_filters,
|
|
1738
|
+
limit=top_k,
|
|
1739
|
+
group_by=group_by,
|
|
1740
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
1741
|
+
with_vectors=return_embedding,
|
|
1742
|
+
score_threshold=score_threshold,
|
|
1743
|
+
)
|
|
1744
|
+
groups = response.groups
|
|
1745
|
+
return self._process_group_results(groups)
|
|
1746
|
+
else:
|
|
1747
|
+
query_response = await self._async_client.query_points(
|
|
1748
|
+
collection_name=self.index,
|
|
1749
|
+
query=rest.SparseVector(
|
|
1750
|
+
indices=query_indices,
|
|
1751
|
+
values=query_values,
|
|
1752
|
+
),
|
|
1753
|
+
using=SPARSE_VECTORS_NAME,
|
|
1754
|
+
query_filter=qdrant_filters,
|
|
1755
|
+
limit=top_k,
|
|
1756
|
+
with_vectors=return_embedding,
|
|
1757
|
+
score_threshold=score_threshold,
|
|
1758
|
+
)
|
|
1759
|
+
points = query_response.points
|
|
1760
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
1761
|
+
|
|
1762
|
+
async def _query_by_embedding_async(
|
|
1763
|
+
self,
|
|
1764
|
+
query_embedding: list[float],
|
|
1765
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1766
|
+
top_k: int = 10,
|
|
1767
|
+
scale_score: bool = False,
|
|
1768
|
+
return_embedding: bool = False,
|
|
1769
|
+
score_threshold: float | None = None,
|
|
1770
|
+
group_by: str | None = None,
|
|
1771
|
+
group_size: int | None = None,
|
|
1772
|
+
) -> list[Document]:
|
|
1773
|
+
"""
|
|
1774
|
+
Asynchronously queries Qdrant using a dense embedding and returns the most relevant documents.
|
|
1775
|
+
|
|
1776
|
+
:param query_embedding: Dense embedding of the query.
|
|
1777
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1778
|
+
:param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
|
|
1779
|
+
groups to return.
|
|
1780
|
+
:param scale_score: Whether to scale the scores of the retrieved documents.
|
|
1781
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
1782
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
1783
|
+
Score of the returned result might be higher or lower than the threshold
|
|
1784
|
+
depending on the Distance function used.
|
|
1785
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
1786
|
+
:param group_by: Payload field to group by; it must be a string or number field. If the field contains more than 1
|
|
1787
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
1788
|
+
:param group_size: Maximum number of points to return per group. Default is 3.
|
|
1789
|
+
|
|
1790
|
+
:returns: List of documents that are most similar to `query_embedding`.
|
|
1791
|
+
"""
|
|
1792
|
+
await self._initialize_async_client()
|
|
1793
|
+
assert self._async_client is not None
|
|
1794
|
+
|
|
1795
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
1796
|
+
if group_by:
|
|
1797
|
+
response = await self._async_client.query_points_groups(
|
|
1798
|
+
collection_name=self.index,
|
|
1799
|
+
query=query_embedding,
|
|
1800
|
+
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
1801
|
+
query_filter=qdrant_filters,
|
|
1802
|
+
limit=top_k,
|
|
1803
|
+
group_by=group_by,
|
|
1804
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
1805
|
+
with_vectors=return_embedding,
|
|
1806
|
+
score_threshold=score_threshold,
|
|
783
1807
|
)
|
|
1808
|
+
groups = response.groups
|
|
1809
|
+
return self._process_group_results(groups)
|
|
784
1810
|
else:
|
|
785
|
-
|
|
1811
|
+
query_response = await self._async_client.query_points(
|
|
1812
|
+
collection_name=self.index,
|
|
1813
|
+
query=query_embedding,
|
|
1814
|
+
using=DENSE_VECTORS_NAME if self.use_sparse_embeddings else None,
|
|
1815
|
+
query_filter=qdrant_filters,
|
|
1816
|
+
limit=top_k,
|
|
1817
|
+
with_vectors=return_embedding,
|
|
1818
|
+
score_threshold=score_threshold,
|
|
1819
|
+
)
|
|
1820
|
+
points = query_response.points
|
|
1821
|
+
return self._process_query_point_results(points, scale_score=scale_score)
|
|
1822
|
+
|
|
1823
|
+
async def _query_hybrid_async(
|
|
1824
|
+
self,
|
|
1825
|
+
query_embedding: list[float],
|
|
1826
|
+
query_sparse_embedding: SparseEmbedding,
|
|
1827
|
+
filters: dict[str, Any] | rest.Filter | None = None,
|
|
1828
|
+
top_k: int = 10,
|
|
1829
|
+
return_embedding: bool = False,
|
|
1830
|
+
score_threshold: float | None = None,
|
|
1831
|
+
group_by: str | None = None,
|
|
1832
|
+
group_size: int | None = None,
|
|
1833
|
+
) -> list[Document]:
|
|
1834
|
+
"""
|
|
1835
|
+
Asynchronously retrieves documents based on dense and sparse embeddings and fuses
|
|
1836
|
+
the results using Reciprocal Rank Fusion.
|
|
1837
|
+
|
|
1838
|
+
This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
|
|
1839
|
+
Use the `QdrantHybridRetriever` instead.
|
|
1840
|
+
|
|
1841
|
+
:param query_embedding: Dense embedding of the query.
|
|
1842
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
1843
|
+
:param filters: Filters applied to the retrieved documents.
|
|
1844
|
+
:param top_k: Maximum number of documents to return. If the `group_by` parameter is used, the maximum number of
|
|
1845
|
+
groups to return.
|
|
1846
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
1847
|
+
:param score_threshold: A minimal score threshold for the result.
|
|
1848
|
+
Score of the returned result might be higher or lower than the threshold
|
|
1849
|
+
depending on the Distance function used.
|
|
1850
|
+
E.g. for cosine similarity only higher scores will be returned.
|
|
1851
|
+
:param group_by: Payload field to group by; it must be a string or number field. If the field contains more than 1
|
|
1852
|
+
value, all values will be used for grouping. One point can be in multiple groups.
|
|
1853
|
+
:param group_size: Maximum number of points to return per group. Default is 3.
|
|
1854
|
+
|
|
1855
|
+
:returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
|
|
1856
|
+
|
|
1857
|
+
:raises QdrantStoreError:
|
|
1858
|
+
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
1859
|
+
"""
|
|
1860
|
+
|
|
1861
|
+
await self._initialize_async_client()
|
|
1862
|
+
assert self._async_client is not None
|
|
1863
|
+
|
|
1864
|
+
if not self.use_sparse_embeddings:
|
|
1865
|
+
message = (
|
|
1866
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
1867
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
1868
|
+
)
|
|
1869
|
+
raise QdrantStoreError(message)
|
|
1870
|
+
|
|
1871
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
786
1872
|
|
|
787
|
-
|
|
1873
|
+
try:
|
|
1874
|
+
if group_by:
|
|
1875
|
+
response = await self._async_client.query_points_groups(
|
|
1876
|
+
collection_name=self.index,
|
|
1877
|
+
prefetch=[
|
|
1878
|
+
rest.Prefetch(
|
|
1879
|
+
query=rest.SparseVector(
|
|
1880
|
+
indices=query_sparse_embedding.indices,
|
|
1881
|
+
values=query_sparse_embedding.values,
|
|
1882
|
+
),
|
|
1883
|
+
using=SPARSE_VECTORS_NAME,
|
|
1884
|
+
filter=qdrant_filters,
|
|
1885
|
+
),
|
|
1886
|
+
rest.Prefetch(
|
|
1887
|
+
query=query_embedding,
|
|
1888
|
+
using=DENSE_VECTORS_NAME,
|
|
1889
|
+
filter=qdrant_filters,
|
|
1890
|
+
),
|
|
1891
|
+
],
|
|
1892
|
+
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
1893
|
+
limit=top_k,
|
|
1894
|
+
group_by=group_by,
|
|
1895
|
+
group_size=group_size or DEFAULT_GROUP_SIZE,
|
|
1896
|
+
score_threshold=score_threshold,
|
|
1897
|
+
with_payload=True,
|
|
1898
|
+
with_vectors=return_embedding,
|
|
1899
|
+
)
|
|
1900
|
+
groups = response.groups
|
|
1901
|
+
else:
|
|
1902
|
+
query_response = await self._async_client.query_points(
|
|
1903
|
+
collection_name=self.index,
|
|
1904
|
+
prefetch=[
|
|
1905
|
+
rest.Prefetch(
|
|
1906
|
+
query=rest.SparseVector(
|
|
1907
|
+
indices=query_sparse_embedding.indices,
|
|
1908
|
+
values=query_sparse_embedding.values,
|
|
1909
|
+
),
|
|
1910
|
+
using=SPARSE_VECTORS_NAME,
|
|
1911
|
+
filter=qdrant_filters,
|
|
1912
|
+
),
|
|
1913
|
+
rest.Prefetch(
|
|
1914
|
+
query=query_embedding,
|
|
1915
|
+
using=DENSE_VECTORS_NAME,
|
|
1916
|
+
filter=qdrant_filters,
|
|
1917
|
+
),
|
|
1918
|
+
],
|
|
1919
|
+
query=rest.FusionQuery(fusion=rest.Fusion.RRF),
|
|
1920
|
+
limit=top_k,
|
|
1921
|
+
score_threshold=score_threshold,
|
|
1922
|
+
with_payload=True,
|
|
1923
|
+
with_vectors=return_embedding,
|
|
1924
|
+
)
|
|
1925
|
+
points = query_response.points
|
|
1926
|
+
|
|
1927
|
+
except Exception as e:
|
|
1928
|
+
msg = "Error during hybrid search"
|
|
1929
|
+
raise QdrantStoreError(msg) from e
|
|
1930
|
+
|
|
1931
|
+
if group_by:
|
|
1932
|
+
return self._process_group_results(groups)
|
|
1933
|
+
else:
|
|
1934
|
+
return self._process_query_point_results(points)
|
|
788
1935
|
|
|
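Driving the async variants only requires an event loop. A sketch, assuming a `store` whose collection was set up with sparse support; as the docstring notes, `_query_hybrid_async` is internal and `QdrantHybridRetriever` is the public entry point:

import asyncio

from haystack.dataclasses.sparse_embedding import SparseEmbedding

async def hybrid_search():
    return await store._query_hybrid_async(
        query_embedding=[0.1, 0.2, 0.3, 0.4],
        query_sparse_embedding=SparseEmbedding(indices=[11, 42], values=[0.6, 0.4]),
        top_k=5,
    )

docs = asyncio.run(hybrid_search())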
789
1936
|
def get_distance(self, similarity: str) -> rest.Distance:
|
|
790
1937
|
"""
|
|
@@ -807,14 +1954,39 @@ class QdrantDocumentStore:
|
|
|
807
1954
|
)
|
|
808
1955
|
raise QdrantStoreError(msg) from ke
|
|
809
1956
|
|
|
810
|
-
def _create_payload_index(self, collection_name: str, payload_fields_to_index:
|
|
1957
|
+
def _create_payload_index(self, collection_name: str, payload_fields_to_index: list[dict] | None = None) -> None:
|
|
1958
|
+
"""
|
|
1959
|
+
Create payload index for the collection if payload_fields_to_index is provided.
|
|
1960
|
+
|
|
1961
|
+
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
|
|
1962
|
+
"""
|
|
1963
|
+
if payload_fields_to_index is not None:
|
|
1964
|
+
for payload_index in payload_fields_to_index:
|
|
1965
|
+
# self._client is initialized at this point
|
|
1966
|
+
# since _initialize_client() is called before this method is executed
|
|
1967
|
+
|
|
1968
|
+
assert self._client is not None
|
|
1969
|
+
self._client.create_payload_index(
|
|
1970
|
+
collection_name=collection_name,
|
|
1971
|
+
field_name=payload_index["field_name"],
|
|
1972
|
+
field_schema=payload_index["field_schema"],
|
|
1973
|
+
)
|
|
1974
|
+
|
|
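Each entry in `payload_fields_to_index` is a plain dict with `field_name` and `field_schema` keys, forwarded verbatim to `create_payload_index`. A sketch of requesting indexes at construction time; the field names are illustrative:

from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(
    location=":memory:",
    embedding_dim=4,
    payload_fields_to_index=[
        {"field_name": "meta.category", "field_schema": "keyword"},
        {"field_name": "meta.year", "field_schema": "integer"},
    ],
)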
1975
|
+
async def _create_payload_index_async(
|
|
1976
|
+
self, collection_name: str, payload_fields_to_index: list[dict] | None = None
|
|
1977
|
+
) -> None:
|
|
811
1978
|
"""
|
|
812
|
-
|
|
1979
|
+
Asynchronously create payload index for the collection if payload_fields_to_index is provided.
|
|
1980
|
+
|
|
813
1981
|
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
|
|
814
1982
|
"""
|
|
815
1983
|
if payload_fields_to_index is not None:
|
|
816
1984
|
for payload_index in payload_fields_to_index:
|
|
817
|
-
self.
|
|
1985
|
+
# self._async_client is initialized at this point
|
|
1986
|
+
# since _initialize_async_client() is called before this method is executed
|
|
1987
|
+
assert self._async_client is not None
|
|
1988
|
+
|
|
1989
|
+
await self._async_client.create_payload_index(
|
|
818
1990
|
collection_name=collection_name,
|
|
819
1991
|
field_name=payload_index["field_name"],
|
|
820
1992
|
field_schema=payload_index["field_schema"],
|
|
@@ -829,10 +2001,11 @@ class QdrantDocumentStore:
|
|
|
829
2001
|
use_sparse_embeddings: bool,
|
|
830
2002
|
sparse_idf: bool,
|
|
831
2003
|
on_disk: bool = False,
|
|
832
|
-
payload_fields_to_index:
|
|
833
|
-
):
|
|
2004
|
+
payload_fields_to_index: list[dict] | None = None,
|
|
2005
|
+
) -> None:
|
|
834
2006
|
"""
|
|
835
2007
|
Sets up the Qdrant collection with the specified parameters.
|
|
2008
|
+
|
|
836
2009
|
:param collection_name:
|
|
837
2010
|
The name of the collection to set up.
|
|
838
2011
|
:param embedding_dim:
|
|
@@ -856,9 +2029,13 @@ class QdrantDocumentStore:
|
|
|
856
2029
|
If the collection exists with a different similarity measure or embedding dimension.
|
|
857
2030
|
|
|
858
2031
|
"""
|
|
2032
|
+
|
|
2033
|
+
self._initialize_client()
|
|
2034
|
+
assert self._client is not None
|
|
2035
|
+
|
|
859
2036
|
distance = self.get_distance(similarity)
|
|
860
2037
|
|
|
861
|
-
if recreate_collection or not self.
|
|
2038
|
+
if recreate_collection or not self._client.collection_exists(collection_name):
|
|
862
2039
|
# There is no need to verify the current configuration of that
|
|
863
2040
|
# collection. It might be just recreated again or does not exist yet.
|
|
864
2041
|
self.recreate_collection(
|
|
@@ -868,66 +2045,76 @@ class QdrantDocumentStore:
|
|
|
868
2045
|
self._create_payload_index(collection_name, payload_fields_to_index)
|
|
869
2046
|
return
|
|
870
2047
|
|
|
871
|
-
collection_info = self.
|
|
2048
|
+
collection_info = self._client.get_collection(collection_name)
|
|
872
2049
|
|
|
873
|
-
|
|
874
|
-
isinstance(collection_info.config.params.vectors, dict)
|
|
875
|
-
and DENSE_VECTORS_NAME in collection_info.config.params.vectors
|
|
876
|
-
)
|
|
2050
|
+
self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
|
|
877
2051
|
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
2052
|
+
async def _set_up_collection_async(
|
|
2053
|
+
self,
|
|
2054
|
+
collection_name: str,
|
|
2055
|
+
embedding_dim: int,
|
|
2056
|
+
recreate_collection: bool,
|
|
2057
|
+
similarity: str,
|
|
2058
|
+
use_sparse_embeddings: bool,
|
|
2059
|
+
sparse_idf: bool,
|
|
2060
|
+
on_disk: bool = False,
|
|
2061
|
+
payload_fields_to_index: list[dict] | None = None,
|
|
2062
|
+
) -> None:
|
|
2063
|
+
"""
|
|
2064
|
+
Asynchronously sets up the Qdrant collection with the specified parameters.
|
|
888
2065
|
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
2066
|
+
:param collection_name:
|
|
2067
|
+
The name of the collection to set up.
|
|
2068
|
+
:param embedding_dim:
|
|
2069
|
+
The dimension of the embeddings.
|
|
2070
|
+
:param recreate_collection:
|
|
2071
|
+
Whether to recreate the collection if it already exists.
|
|
2072
|
+
:param similarity:
|
|
2073
|
+
The similarity measure to use.
|
|
2074
|
+
:param use_sparse_embeddings:
|
|
2075
|
+
Whether to use sparse embeddings.
|
|
2076
|
+
:param sparse_idf:
|
|
2077
|
+
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
2078
|
+
:param on_disk:
|
|
2079
|
+
Whether to store the collection on disk.
|
|
2080
|
+
:param payload_fields_to_index:
|
|
2081
|
+
List of payload fields to index.
|
|
896
2082
|
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
current_distance = collection_info.config.params.vectors.distance
|
|
902
|
-
current_vector_size = collection_info.config.params.vectors.size
|
|
2083
|
+
:raises QdrantStoreError:
|
|
2084
|
+
If the collection exists with incompatible settings.
|
|
2085
|
+
:raises ValueError:
|
|
2086
|
+
If the collection exists with a different similarity measure or embedding dimension.
|
|
903
2087
|
|
|
904
|
-
|
|
905
|
-
msg = (
|
|
906
|
-
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
907
|
-
f"but it is configured with a similarity '{current_distance.name}'. "
|
|
908
|
-
f"If you want to use that collection, but with a different "
|
|
909
|
-
f"similarity, please set `recreate_collection=True` argument."
|
|
910
|
-
)
|
|
911
|
-
raise ValueError(msg)
|
|
2088
|
+
"""
|
|
912
2089
|
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
2090
|
+
await self._initialize_async_client()
|
|
2091
|
+
assert self._async_client is not None
|
|
2092
|
+
|
|
2093
|
+
distance = self.get_distance(similarity)
|
|
2094
|
+
|
|
2095
|
+
if recreate_collection or not await self._async_client.collection_exists(collection_name):
|
|
2096
|
+
# There is no need to verify the current configuration of that
|
|
2097
|
+
# collection. It might be just recreated again or does not exist yet.
|
|
2098
|
+
await self.recreate_collection_async(
|
|
2099
|
+
collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
|
|
919
2100
|
)
|
|
920
|
-
|
|
2101
|
+
# Create Payload index if payload_fields_to_index is provided
|
|
2102
|
+
await self._create_payload_index_async(collection_name, payload_fields_to_index)
|
|
2103
|
+
return
|
|
2104
|
+
|
|
2105
|
+
collection_info = await self._async_client.get_collection(collection_name)
|
|
2106
|
+
|
|
2107
|
+
self._validate_collection_compatibility(collection_name, collection_info, distance, embedding_dim)
|
|
921
2108
|
|
|
922
2109
|
def recreate_collection(
|
|
923
2110
|
self,
|
|
924
2111
|
collection_name: str,
|
|
925
|
-
distance,
|
|
2112
|
+
distance: rest.Distance,
|
|
926
2113
|
embedding_dim: int,
|
|
927
|
-
on_disk:
|
|
928
|
-
use_sparse_embeddings:
|
|
2114
|
+
on_disk: bool | None = None,
|
|
2115
|
+
use_sparse_embeddings: bool | None = None,
|
|
929
2116
|
sparse_idf: bool = False,
|
|
930
|
-
):
|
|
2117
|
+
) -> None:
|
|
931
2118
|
"""
|
|
932
2119
|
Recreates the Qdrant collection with the specified parameters.
|
|
933
2120
|
|
|
@@ -944,96 +2131,356 @@ class QdrantDocumentStore:
|
|
|
944
2131
|
:param sparse_idf:
|
|
945
2132
|
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
946
2133
|
"""
|
|
947
|
-
|
|
948
|
-
on_disk
|
|
2134
|
+
vectors_config, sparse_vectors_config = self._prepare_collection_config(
|
|
2135
|
+
embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
|
|
2136
|
+
)
|
|
2137
|
+
collection_params = self._prepare_collection_params()
|
|
949
2138
|
|
|
950
|
-
|
|
951
|
-
|
|
2139
|
+
self._initialize_client()
|
|
2140
|
+
assert self._client is not None
|
|
952
2141
|
|
|
953
|
-
|
|
954
|
-
|
|
2142
|
+
if self._client.collection_exists(collection_name):
|
|
2143
|
+
self._client.delete_collection(collection_name)
|
|
955
2144
|
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
vectors_config
|
|
2145
|
+
self._client.create_collection(
|
|
2146
|
+
collection_name=collection_name,
|
|
2147
|
+
vectors_config=vectors_config,
|
|
2148
|
+
sparse_vectors_config=sparse_vectors_config,
|
|
2149
|
+
**collection_params,
|
|
2150
|
+
)
|
|
959
2151
|
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
2152
|
+
async def recreate_collection_async(
|
|
2153
|
+
self,
|
|
2154
|
+
collection_name: str,
|
|
2155
|
+
distance: rest.Distance,
|
|
2156
|
+
embedding_dim: int,
|
|
2157
|
+
on_disk: bool | None = None,
|
|
2158
|
+
use_sparse_embeddings: bool | None = None,
|
|
2159
|
+
sparse_idf: bool = False,
|
|
2160
|
+
) -> None:
|
|
2161
|
+
"""
|
|
2162
|
+
Asynchronously recreates the Qdrant collection with the specified parameters.
|
|
2163
|
+
|
|
2164
|
+
:param collection_name:
|
|
2165
|
+
The name of the collection to recreate.
|
|
2166
|
+
:param distance:
|
|
2167
|
+
The distance metric to use for the collection.
|
|
2168
|
+
:param embedding_dim:
|
|
2169
|
+
The dimension of the embeddings.
|
|
2170
|
+
:param on_disk:
|
|
2171
|
+
Whether to store the collection on disk.
|
|
2172
|
+
:param use_sparse_embeddings:
|
|
2173
|
+
Whether to use sparse embeddings.
|
|
2174
|
+
:param sparse_idf:
|
|
2175
|
+
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
|
|
2176
|
+
"""
|
|
2177
|
+
vectors_config, sparse_vectors_config = self._prepare_collection_config(
|
|
2178
|
+
embedding_dim, distance, on_disk, use_sparse_embeddings, sparse_idf
|
|
2179
|
+
)
|
|
2180
|
+
collection_params = self._prepare_collection_params()
|
|
2181
|
+
|
|
2182
|
+
await self._initialize_async_client()
|
|
2183
|
+
assert self._async_client is not None
|
|
968
2184
|
|
|
969
|
-
if self.
|
|
970
|
-
self.
|
|
2185
|
+
if await self._async_client.collection_exists(collection_name):
|
|
2186
|
+
await self._async_client.delete_collection(collection_name)
|
|
971
2187
|
|
|
972
|
-
self.
|
|
2188
|
+
await self._async_client.create_collection(
|
|
973
2189
|
collection_name=collection_name,
|
|
974
2190
|
vectors_config=vectors_config,
|
|
975
|
-
sparse_vectors_config=sparse_vectors_config
|
|
976
|
-
|
|
977
|
-
replication_factor=self.replication_factor,
|
|
978
|
-
write_consistency_factor=self.write_consistency_factor,
|
|
979
|
-
on_disk_payload=self.on_disk_payload,
|
|
980
|
-
hnsw_config=self.hnsw_config,
|
|
981
|
-
optimizers_config=self.optimizers_config,
|
|
982
|
-
wal_config=self.wal_config,
|
|
983
|
-
quantization_config=self.quantization_config,
|
|
984
|
-
init_from=self.init_from,
|
|
2191
|
+
sparse_vectors_config=sparse_vectors_config,
|
|
2192
|
+
**collection_params,
|
|
985
2193
|
)
|
|
986
2194
|
|
|
987
2195
|
def _handle_duplicate_documents(
|
|
988
2196
|
self,
|
|
989
|
-
documents:
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
):
|
|
2197
|
+
documents: list[Document],
|
|
2198
|
+
policy: DuplicatePolicy | None = None,
|
|
2199
|
+
) -> list[Document]:
|
|
993
2200
|
"""
|
|
994
2201
|
Checks whether any of the passed documents already exists in the chosen index and returns a list of
|
|
995
2202
|
documents that are not in the index yet.
|
|
996
2203
|
|
|
997
2204
|
:param documents: A list of Haystack Document objects.
|
|
998
|
-
:param index: name of the index
|
|
999
2205
|
:param policy: The duplicate policy to use when writing documents.
|
|
1000
2206
|
:returns: A list of Haystack Document objects.
|
|
1001
2207
|
"""
|
|
1002
2208
|
|
|
1003
|
-
index = index or self.index
|
|
1004
2209
|
if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
|
|
1005
|
-
documents = self._drop_duplicate_documents(documents
|
|
1006
|
-
documents_found = self.get_documents_by_id(ids=[doc.id for doc in documents]
|
|
1007
|
-
ids_exist_in_db:
|
|
2210
|
+
documents = self._drop_duplicate_documents(documents)
|
|
2211
|
+
documents_found = self.get_documents_by_id(ids=[doc.id for doc in documents])
|
|
2212
|
+
ids_exist_in_db: list[str] = [doc.id for doc in documents_found]
|
|
1008
2213
|
|
|
1009
2214
|
if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
|
|
1010
|
-
msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{index}'."
|
|
2215
|
+
msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
|
|
1011
2216
|
raise DuplicateDocumentError(msg)
|
|
1012
2217
|
|
|
1013
2218
|
documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
|
|
1014
2219
|
|
|
1015
2220
|
return documents
|
|
1016
2221
|
|
|
1017
|
-
def
|
|
2222
|
+
async def _handle_duplicate_documents_async(
|
|
2223
|
+
self,
|
|
2224
|
+
documents: list[Document],
|
|
2225
|
+
policy: DuplicatePolicy | None = None,
|
|
2226
|
+
) -> list[Document]:
|
|
1018
2227
|
"""
|
|
1019
|
-
|
|
2228
|
+
Asynchronously checks whether any of the passed documents already exists
|
|
2229
|
+
in the chosen index and returns a list of
|
|
2230
|
+
documents that are not in the index yet.
|
|
1020
2231
|
|
|
1021
2232
|
:param documents: A list of Haystack Document objects.
|
|
1022
|
-
:param
|
|
2233
|
+
:param policy: The duplicate policy to use when writing documents.
|
|
1023
2234
|
:returns: A list of Haystack Document objects.
|
|
1024
2235
|
"""
|
|
1025
|
-
|
|
1026
|
-
|
|
2236
|
+
|
|
2237
|
+
if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.FAIL):
|
|
2238
|
+
documents = self._drop_duplicate_documents(documents)
|
|
2239
|
+
documents_found = await self.get_documents_by_id_async(ids=[doc.id for doc in documents])
|
|
2240
|
+
ids_exist_in_db: list[str] = [doc.id for doc in documents_found]
|
|
2241
|
+
|
|
2242
|
+
if len(ids_exist_in_db) > 0 and policy == DuplicatePolicy.FAIL:
|
|
2243
|
+
msg = f"Document with ids '{', '.join(ids_exist_in_db)} already exists in index = '{self.index}'."
|
|
2244
|
+
raise DuplicateDocumentError(msg)
|
|
2245
|
+
|
|
2246
|
+
documents = list(filter(lambda doc: doc.id not in ids_exist_in_db, documents))
|
|
2247
|
+
|
|
2248
|
+
return documents
|
|
2249
|
+
|
|
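Net effect of the two duplicate-handling steps: in-batch duplicates are always dropped first, then `SKIP` silently filters out documents already stored while `FAIL` raises `DuplicateDocumentError`. A minimal sketch:

from haystack.dataclasses import Document
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

store = QdrantDocumentStore(location=":memory:", embedding_dim=4)
doc = Document(content="hello", embedding=[0.1, 0.2, 0.3, 0.4])

store.write_documents([doc], policy=DuplicatePolicy.SKIP)
store.write_documents([doc], policy=DuplicatePolicy.SKIP)  # silently writes nothing
store.write_documents([doc], policy=DuplicatePolicy.FAIL)  # raises DuplicateDocumentError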
2250
|
+
def _drop_duplicate_documents(self, documents: list[Document]) -> list[Document]:
|
|
2251
|
+
"""
|
|
2252
|
+
Drop duplicate documents based on the same hash ID.
|
|
2253
|
+
|
|
2254
|
+
"""
|
|
2255
|
+
_hash_ids: set = set()
|
|
2256
|
+
_documents: list[Document] = []
|
|
1027
2257
|
|
|
1028
2258
|
for document in documents:
|
|
1029
2259
|
if document.id in _hash_ids:
|
|
1030
2260
|
logger.info(
|
|
1031
|
-
"Duplicate Documents: Document with id '
|
|
1032
|
-
document.id,
|
|
1033
|
-
index
|
|
2261
|
+
"Duplicate Documents: Document with id '{document_id}' already exists in index '{index}'",
|
|
2262
|
+
document_id=document.id,
|
|
2263
|
+
index=self.index,
|
|
1034
2264
|
)
|
|
1035
2265
|
continue
|
|
1036
2266
|
_documents.append(document)
|
|
1037
2267
|
_hash_ids.add(document.id)
|
|
1038
2268
|
|
|
1039
2269
|
return _documents
|
|
2270
|
+
|
|
2271
|
+
def _prepare_collection_params(self) -> dict[str, Any]:
|
|
2272
|
+
"""
|
|
2273
|
+
Prepares the common parameters for collection creation.
|
|
2274
|
+
"""
|
|
2275
|
+
return {
|
|
2276
|
+
"shard_number": self.shard_number,
|
|
2277
|
+
"replication_factor": self.replication_factor,
|
|
2278
|
+
"write_consistency_factor": self.write_consistency_factor,
|
|
2279
|
+
"on_disk_payload": self.on_disk_payload,
|
|
2280
|
+
"hnsw_config": self.hnsw_config,
|
|
2281
|
+
"optimizers_config": self.optimizers_config,
|
|
2282
|
+
"wal_config": self.wal_config,
|
|
2283
|
+
"quantization_config": self.quantization_config,
|
|
2284
|
+
}
|
|
2285
|
+
|
|
2286
|
+
def _prepare_client_params(self) -> dict[str, Any]:
|
|
2287
|
+
"""
|
|
2288
|
+
Prepares the common parameters for client initialization.
|
|
2289
|
+
|
|
2290
|
+
"""
|
|
2291
|
+
return {
|
|
2292
|
+
"location": self.location,
|
|
2293
|
+
"url": self.url,
|
|
2294
|
+
"port": self.port,
|
|
2295
|
+
"grpc_port": self.grpc_port,
|
|
2296
|
+
"prefer_grpc": self.prefer_grpc,
|
|
2297
|
+
"https": self.https,
|
|
2298
|
+
"api_key": self.api_key.resolve_value() if self.api_key else None,
|
|
2299
|
+
"prefix": self.prefix,
|
|
2300
|
+
"timeout": self.timeout,
|
|
2301
|
+
"host": self.host,
|
|
2302
|
+
"path": self.path,
|
|
2303
|
+
# NOTE: We purposefully expand the fields of self.metadata to avoid modifying the original self.metadata
|
|
2304
|
+
# class attribute. For example, the resolved api key is added to metadata by the QdrantClient class
|
|
2305
|
+
# when using a hosted Qdrant service, which means running to_dict() exposes the api key.
|
|
2306
|
+
"metadata": {**self.metadata},
|
|
2307
|
+
"force_disable_check_same_thread": self.force_disable_check_same_thread,
|
|
2308
|
+
}
|
|
2309
|
+
|
|
2310
|
+
def _prepare_collection_config(
|
|
2311
|
+
self,
|
|
2312
|
+
embedding_dim: int,
|
|
2313
|
+
distance: rest.Distance,
|
|
2314
|
+
on_disk: bool | None = None,
|
|
2315
|
+
use_sparse_embeddings: bool | None = None,
|
|
2316
|
+
sparse_idf: bool = False,
|
|
2317
|
+
) -> tuple[dict[str, rest.VectorParams] | rest.VectorParams, dict[str, rest.SparseVectorParams] | None]:
|
|
2318
|
+
"""
|
|
2319
|
+
Prepares the configuration for creating or recreating a Qdrant collection.
|
|
2320
|
+
|
|
2321
|
+
"""
|
|
2322
|
+
if on_disk is None:
|
|
2323
|
+
on_disk = self.on_disk
|
|
2324
|
+
|
|
2325
|
+
if use_sparse_embeddings is None:
|
|
2326
|
+
use_sparse_embeddings = self.use_sparse_embeddings
|
|
2327
|
+
|
|
2328
|
+
# dense vectors configuration
|
|
2329
|
+
base_vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
|
|
2330
|
+
vectors_config: rest.VectorParams | dict[str, rest.VectorParams] = base_vectors_config
|
|
2331
|
+
|
|
2332
|
+
sparse_vectors_config: dict[str, rest.SparseVectorParams] | None = None
|
|
2333
|
+
|
|
2334
|
+
if use_sparse_embeddings:
|
|
2335
|
+
# in this case, we need to define named vectors
|
|
2336
|
+
vectors_config = {DENSE_VECTORS_NAME: base_vectors_config}
|
|
2337
|
+
|
|
2338
|
+
sparse_vectors_config = {
|
|
2339
|
+
SPARSE_VECTORS_NAME: rest.SparseVectorParams(
|
|
2340
|
+
index=rest.SparseIndexParams(
|
|
2341
|
+
on_disk=on_disk,
|
|
2342
|
+
),
|
|
2343
|
+
modifier=rest.Modifier.IDF if sparse_idf else None,
|
|
2344
|
+
),
|
|
2345
|
+
}
|
|
2346
|
+
|
|
2347
|
+
return vectors_config, sparse_vectors_config
|
|
2348
|
+
|
|
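The returned configuration takes two shapes: a bare `VectorParams` for dense-only collections, or named-vector dicts once sparse support is enabled. Roughly what the method yields with `use_sparse_embeddings=True` and `sparse_idf=True`, assuming 384-dim cosine vectors:

from qdrant_client.http import models as rest

vectors_config = {
    "text-dense": rest.VectorParams(size=384, on_disk=False, distance=rest.Distance.COSINE),
}
sparse_vectors_config = {
    "text-sparse": rest.SparseVectorParams(
        index=rest.SparseIndexParams(on_disk=False),
        modifier=rest.Modifier.IDF,  # only set when sparse_idf=True
    ),
}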
2349
|
+
@staticmethod
|
|
2350
|
+
def _validate_filters(filters: dict[str, Any] | rest.Filter | None = None) -> None:
|
|
2351
|
+
"""
|
|
2352
|
+
Validates the filters provided for querying.
|
|
2353
|
+
|
|
2354
|
+
:param filters: Filters to validate. Can be a dictionary or an instance of `qdrant_client.http.models.Filter`.
|
|
2355
|
+
:raises ValueError: If the filters are not in the correct format or syntax.
|
|
2356
|
+
"""
|
|
2357
|
+
if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
|
|
2358
|
+
msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
|
|
2359
|
+
raise ValueError(msg)
|
|
2360
|
+
|
|
2361
|
+
if filters and not isinstance(filters, rest.Filter) and "operator" not in filters:
|
|
2362
|
+
msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
|
2363
|
+
raise ValueError(msg)
|
|
2364
|
+
|
|
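Filters must therefore be either native `rest.Filter` objects (passed through untouched) or Haystack-style dicts with a top-level "operator"; anything else is rejected before the query is sent. Both accepted forms, with illustrative field names:

from qdrant_client.http import models as rest

# Haystack-style dict: a top-level "operator" key is required.
haystack_filter = {
    "operator": "AND",
    "conditions": [
        {"field": "meta.category", "operator": "==", "value": "news"},
    ],
}

# Native Qdrant filter.
native_filter = rest.Filter(
    must=[rest.FieldCondition(key="meta.category", match=rest.MatchValue(value="news"))]
)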
2365
|
+
def _process_query_point_results(
|
|
2366
|
+
self, results: list[rest.ScoredPoint], scale_score: bool = False
|
|
2367
|
+
) -> list[Document]:
|
|
2368
|
+
"""
|
|
2369
|
+
Processes query results from Qdrant.
|
|
2370
|
+
"""
|
|
2371
|
+
documents = [
|
|
2372
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
2373
|
+
for point in results
|
|
2374
|
+
]
|
|
2375
|
+
|
|
2376
|
+
if scale_score:
|
|
2377
|
+
for document in documents:
|
|
2378
|
+
score = document.score
|
|
2379
|
+
if score is None:
|
|
2380
|
+
continue
|
|
2381
|
+
if self.similarity == "cosine":
|
|
2382
|
+
score = (score + 1) / 2
|
|
2383
|
+
else:
|
|
2384
|
+
score = float(1 / (1 + exp(-score / 100)))
|
|
2385
|
+
document.score = score
|
|
2386
|
+
|
|
2387
|
+
return documents
|
|
2388
|
+
|
|
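The two scaling branches in `_process_query_point_results`: cosine scores in [-1, 1] are mapped linearly onto [0, 1], and every other metric is squashed with a logistic on score/100. A worked check of both formulas:

from numpy import exp

cosine_score = 0.8
print((cosine_score + 1) / 2)  # 0.9

dot_product_score = 120.0
print(float(1 / (1 + exp(-dot_product_score / 100))))  # ~0.769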
2389
|
+
def _process_group_results(self, groups: list[rest.PointGroup]) -> list[Document]:
|
|
2390
|
+
"""
|
|
2391
|
+
Processes grouped query results from Qdrant.
|
|
2392
|
+
|
|
2393
|
+
"""
|
|
2394
|
+
if not groups:
|
|
2395
|
+
return []
|
|
2396
|
+
|
|
2397
|
+
return [
|
|
2398
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
2399
|
+
for group in groups
|
|
2400
|
+
for point in group.hits
|
|
2401
|
+
]
|
|
2402
|
+
|
|
2403
|
+
def _validate_collection_compatibility(
|
|
2404
|
+
self,
|
|
2405
|
+
collection_name: str,
|
|
2406
|
+
collection_info: rest.CollectionInfo,
|
|
2407
|
+
distance: rest.Distance,
|
|
2408
|
+
embedding_dim: int,
|
|
2409
|
+
) -> None:
|
|
2410
|
+
"""
|
|
2411
|
+
Validates that an existing collection is compatible with the current configuration.
|
|
2412
|
+
"""
|
|
2413
|
+
vectors_config = collection_info.config.params.vectors
|
|
2414
|
+
|
|
2415
|
+
if vectors_config is None:
|
|
2416
|
+
msg = f"Collection '{collection_name}' has no vector configuration."
|
|
2417
|
+
raise QdrantStoreError(msg)
|
|
2418
|
+
|
|
2419
|
+
has_named_vectors = isinstance(vectors_config, dict)
|
|
2420
|
+
|
|
2421
|
+
if has_named_vectors and DENSE_VECTORS_NAME not in vectors_config:
|
|
2422
|
+
msg = (
|
|
2423
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
2424
|
+
f"but it has been originally created outside of Haystack and is not supported. "
|
|
2425
|
+
f"If possible, you should create a new Document Store with Haystack. "
|
|
2426
|
+
f"In case you want to migrate the existing collection, see an example script in "
|
|
2427
|
+
f"https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/src/"
|
|
2428
|
+
f"haystack_integrations/document_stores/qdrant/migrate_to_sparse.py."
|
|
2429
|
+
)
|
|
2430
|
+
raise QdrantStoreError(msg)
|
|
2431
|
+
|
|
2432
|
+
if self.use_sparse_embeddings and not has_named_vectors:
|
|
2433
|
+
msg = (
|
|
2434
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
2435
|
+
f"but it has been originally created without sparse embedding vectors. "
|
|
2436
|
+
f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
|
|
2437
|
+
f"To use sparse embeddings, you need to recreate the collection or migrate the existing one. "
|
|
2438
|
+
f"See `migrate_to_sparse_embeddings_support` function in "
|
|
2439
|
+
f"`haystack_integrations.document_stores.qdrant`."
|
|
2440
|
+
)
|
|
2441
|
+
raise QdrantStoreError(msg)
|
|
2442
|
+
|
|
2443
|
+
if not self.use_sparse_embeddings and has_named_vectors:
|
|
2444
|
+
msg = (
|
|
2445
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
2446
|
+
f"but it has been originally created with sparse embedding vectors."
|
|
2447
|
+
f"If you want to use that collection, please set `use_sparse_embeddings=True`."
|
|
2448
|
+
)
|
|
2449
|
+
raise QdrantStoreError(msg)
|
|
2450
|
+
|
|
2451
|
+
# Get current distance and vector size based on collection configuration
|
|
2452
|
+
if self.use_sparse_embeddings:
|
|
2453
|
+
if not isinstance(vectors_config, dict):
|
|
2454
|
+
msg = f"Collection '{collection_name}' has invalid vector configuration for sparse embeddings."
|
|
2455
|
+
raise QdrantStoreError(msg)
|
|
2456
|
+
|
|
2457
|
+
dense_vector_config = vectors_config[DENSE_VECTORS_NAME]
|
|
2458
|
+
current_distance = dense_vector_config.distance
|
|
2459
|
+
current_vector_size = dense_vector_config.size
|
|
2460
|
+
else:
|
|
2461
|
+
if isinstance(vectors_config, dict):
|
|
2462
|
+
msg = f"Collection '{collection_name}' has invalid vector configuration for dense embeddings only."
|
|
2463
|
+
raise QdrantStoreError(msg)
|
|
2464
|
+
|
|
2465
|
+
current_distance = vectors_config.distance
|
|
2466
|
+
current_vector_size = vectors_config.size
|
|
2467
|
+
|
|
2468
|
+
# Validate distance metric
|
|
2469
|
+
if current_distance != distance:
|
|
2470
|
+
msg = (
|
|
2471
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
2472
|
+
f"but it is configured with a similarity '{current_distance.name}'. "
|
|
2473
|
+
f"If you want to use that collection, but with a different "
|
|
2474
|
+
f"similarity, please set `recreate_collection=True` argument."
|
|
2475
|
+
)
|
|
2476
|
+
raise ValueError(msg)
|
|
2477
|
+
|
|
2478
|
+
# Validate embedding dimension
|
|
2479
|
+
if current_vector_size != embedding_dim:
|
|
2480
|
+
msg = (
|
|
2481
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
2482
|
+
f"but it is configured with a vector size '{current_vector_size}'. "
|
|
2483
|
+
f"If you want to use that collection, but with a different "
|
|
2484
|
+
f"vector size, please set `recreate_collection=True` argument."
|
|
2485
|
+
)
|
|
2486
|
+
raise ValueError(msg)
|