qdrant-haystack 3.4.0__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/PKG-INFO +2 -1
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/pyproject.toml +1 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/components/retrievers/qdrant/__init__.py +2 -2
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/components/retrievers/qdrant/retriever.py +132 -5
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/document_store.py +83 -2
- qdrant_haystack-3.5.0/tests/conftest.py +18 -0
- qdrant_haystack-3.5.0/tests/test_document_store.py +104 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/tests/test_retriever.py +113 -14
- qdrant_haystack-3.4.0/tests/test_document_store.py +0 -41
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/.gitignore +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/LICENSE.txt +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/README.md +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/examples/embedding_retrieval.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/pydoc/config.yml +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/converters.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/filters.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/tests/__init__.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/tests/test_converters.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/tests/test_dict_converters.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/tests/test_filters.py +0 -0
- {qdrant_haystack-3.4.0 → qdrant_haystack-3.5.0}/tests/test_legacy_filters.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 3.4.0
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -9,6 +9,7 @@ Author-email: Kacper Łukawski <kacper.lukawski@qdrant.com>, Anush Shetty <anush
|
|
|
9
9
|
License-Expression: Apache-2.0
|
|
10
10
|
License-File: LICENSE.txt
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
13
|
Classifier: Programming Language :: Python
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.8
|
|
14
15
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -15,6 +15,7 @@ authors = [
|
|
|
15
15
|
{ name = "Anush Shetty", email = "anush.shetty@qdrant.com" },
|
|
16
16
|
]
|
|
17
17
|
classifiers = [
|
|
18
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
19
|
"Development Status :: 4 - Beta",
|
|
19
20
|
"Programming Language :: Python",
|
|
20
21
|
"Programming Language :: Python :: 3.8",
|
|
@@ -2,6 +2,6 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
from .retriever import QdrantEmbeddingRetriever, QdrantSparseEmbeddingRetriever
|
|
5
|
+
from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseEmbeddingRetriever
|
|
6
6
|
|
|
7
|
-
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseEmbeddingRetriever")
|
|
7
|
+
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseEmbeddingRetriever", "QdrantHybridRetriever")
|
|
@@ -19,8 +19,10 @@ class QdrantEmbeddingRetriever:
|
|
|
19
19
|
":memory:",
|
|
20
20
|
recreate_index=True,
|
|
21
21
|
return_embedding=True,
|
|
22
|
-
wait_result_from_api=True,
|
|
23
22
|
)
|
|
23
|
+
|
|
24
|
+
document_store.write_documents([Document(content="test", embedding=[0.5]*768)])
|
|
25
|
+
|
|
24
26
|
retriever = QdrantEmbeddingRetriever(document_store=document_store)
|
|
25
27
|
|
|
26
28
|
# using a fake vector to keep the example simple
|
|
@@ -112,7 +114,7 @@ class QdrantEmbeddingRetriever:
|
|
|
112
114
|
The retrieved documents.
|
|
113
115
|
|
|
114
116
|
"""
|
|
115
|
-
docs = self._document_store.
|
|
117
|
+
docs = self._document_store._query_by_embedding(
|
|
116
118
|
query_embedding=query_embedding,
|
|
117
119
|
filters=filters or self._filters,
|
|
118
120
|
top_k=top_k or self._top_k,
|
|
@@ -136,10 +138,14 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
136
138
|
|
|
137
139
|
document_store = QdrantDocumentStore(
|
|
138
140
|
":memory:",
|
|
141
|
+
use_sparse_embeddings=True,
|
|
139
142
|
recreate_index=True,
|
|
140
143
|
return_embedding=True,
|
|
141
|
-
wait_result_from_api=True,
|
|
142
144
|
)
|
|
145
|
+
|
|
146
|
+
doc = Document(content="test", sparse_embedding=SparseEmbedding(indices=[0, 3, 5], values=[0.1, 0.5, 0.12]))
|
|
147
|
+
document_store.write_documents([doc])
|
|
148
|
+
|
|
143
149
|
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
144
150
|
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
145
151
|
retriever.run(query_sparse_embedding=sparse_embedding)
|
|
@@ -196,7 +202,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
196
202
|
return d
|
|
197
203
|
|
|
198
204
|
@classmethod
|
|
199
|
-
def from_dict(cls, data: Dict[str, Any]) -> "
|
|
205
|
+
def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseEmbeddingRetriever":
|
|
200
206
|
"""
|
|
201
207
|
Deserializes the component from a dictionary.
|
|
202
208
|
|
|
@@ -230,7 +236,7 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
230
236
|
The retrieved documents.
|
|
231
237
|
|
|
232
238
|
"""
|
|
233
|
-
docs = self._document_store.
|
|
239
|
+
docs = self._document_store._query_by_sparse(
|
|
234
240
|
query_sparse_embedding=query_sparse_embedding,
|
|
235
241
|
filters=filters or self._filters,
|
|
236
242
|
top_k=top_k or self._top_k,
|
|
@@ -239,3 +245,124 @@ class QdrantSparseEmbeddingRetriever:
|
|
|
239
245
|
)
|
|
240
246
|
|
|
241
247
|
return {"documents": docs}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
@component
|
|
251
|
+
class QdrantHybridRetriever:
|
|
252
|
+
"""
|
|
253
|
+
A component for retrieving documents from a QdrantDocumentStore using both dense and sparse vectors
|
|
254
|
+
and fusing the results using Reciprocal Rank Fusion.
|
|
255
|
+
|
|
256
|
+
Usage example:
|
|
257
|
+
```python
|
|
258
|
+
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
|
|
259
|
+
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
260
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
261
|
+
|
|
262
|
+
document_store = QdrantDocumentStore(
|
|
263
|
+
":memory:",
|
|
264
|
+
use_sparse_embeddings=True,
|
|
265
|
+
recreate_index=True,
|
|
266
|
+
return_embedding=True,
|
|
267
|
+
wait_result_from_api=True,
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
doc = Document(content="test",
|
|
271
|
+
embedding=[0.5]*768,
|
|
272
|
+
sparse_embedding=SparseEmbedding(indices=[0, 3, 5], values=[0.1, 0.5, 0.12]))
|
|
273
|
+
|
|
274
|
+
document_store.write_documents([doc])
|
|
275
|
+
|
|
276
|
+
retriever = QdrantHybridRetriever(document_store=document_store)
|
|
277
|
+
embedding = [0.1]*768
|
|
278
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
279
|
+
retriever.run(query_embedding=embedding, query_sparse_embedding=sparse_embedding)
|
|
280
|
+
```
|
|
281
|
+
"""
|
|
282
|
+
|
|
283
|
+
def __init__(
|
|
284
|
+
self,
|
|
285
|
+
document_store: QdrantDocumentStore,
|
|
286
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
287
|
+
top_k: int = 10,
|
|
288
|
+
return_embedding: bool = False,
|
|
289
|
+
):
|
|
290
|
+
"""
|
|
291
|
+
Create a QdrantHybridRetriever component.
|
|
292
|
+
|
|
293
|
+
:param document_store: An instance of QdrantDocumentStore.
|
|
294
|
+
:param filters: A dictionary with filters to narrow down the search space.
|
|
295
|
+
:param top_k: The maximum number of documents to retrieve.
|
|
296
|
+
:param return_embedding: Whether to return the embeddings of the retrieved Documents.
|
|
297
|
+
|
|
298
|
+
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
|
|
299
|
+
"""
|
|
300
|
+
|
|
301
|
+
if not isinstance(document_store, QdrantDocumentStore):
|
|
302
|
+
msg = "document_store must be an instance of QdrantDocumentStore"
|
|
303
|
+
raise ValueError(msg)
|
|
304
|
+
|
|
305
|
+
self._document_store = document_store
|
|
306
|
+
self._filters = filters
|
|
307
|
+
self._top_k = top_k
|
|
308
|
+
self._return_embedding = return_embedding
|
|
309
|
+
|
|
310
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
311
|
+
"""
|
|
312
|
+
Serializes the component to a dictionary.
|
|
313
|
+
|
|
314
|
+
:returns:
|
|
315
|
+
Dictionary with serialized data.
|
|
316
|
+
"""
|
|
317
|
+
return default_to_dict(
|
|
318
|
+
self,
|
|
319
|
+
document_store=self._document_store.to_dict(),
|
|
320
|
+
filters=self._filters,
|
|
321
|
+
top_k=self._top_k,
|
|
322
|
+
return_embedding=self._return_embedding,
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
@classmethod
|
|
326
|
+
def from_dict(cls, data: Dict[str, Any]) -> "QdrantHybridRetriever":
|
|
327
|
+
"""
|
|
328
|
+
Deserializes the component from a dictionary.
|
|
329
|
+
|
|
330
|
+
:param data:
|
|
331
|
+
Dictionary to deserialize from.
|
|
332
|
+
:returns:
|
|
333
|
+
Deserialized component.
|
|
334
|
+
"""
|
|
335
|
+
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
|
|
336
|
+
data["init_parameters"]["document_store"] = document_store
|
|
337
|
+
return default_from_dict(cls, data)
|
|
338
|
+
|
|
339
|
+
@component.output_types(documents=List[Document])
|
|
340
|
+
def run(
|
|
341
|
+
self,
|
|
342
|
+
query_embedding: List[float],
|
|
343
|
+
query_sparse_embedding: SparseEmbedding,
|
|
344
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
345
|
+
top_k: Optional[int] = None,
|
|
346
|
+
return_embedding: Optional[bool] = None,
|
|
347
|
+
):
|
|
348
|
+
"""
|
|
349
|
+
Run the Hybrid Retriever on the given input data.
|
|
350
|
+
|
|
351
|
+
:param query_embedding: Dense embedding of the query.
|
|
352
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
353
|
+
:param filters: A dictionary with filters to narrow down the search space.
|
|
354
|
+
:param top_k: The maximum number of documents to return.
|
|
355
|
+
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
356
|
+
:returns:
|
|
357
|
+
The retrieved documents.
|
|
358
|
+
|
|
359
|
+
"""
|
|
360
|
+
docs = self._document_store._query_hybrid(
|
|
361
|
+
query_embedding=query_embedding,
|
|
362
|
+
query_sparse_embedding=query_sparse_embedding,
|
|
363
|
+
filters=filters or self._filters,
|
|
364
|
+
top_k=top_k or self._top_k,
|
|
365
|
+
return_embedding=return_embedding or self._return_embedding,
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
return {"documents": docs}
|
|
@@ -16,6 +16,7 @@ from haystack.utils.filters import convert as convert_legacy_filters
|
|
|
16
16
|
from qdrant_client import grpc
|
|
17
17
|
from qdrant_client.http import models as rest
|
|
18
18
|
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
19
|
+
from qdrant_client.hybrid.fusion import reciprocal_rank_fusion
|
|
19
20
|
from tqdm import tqdm
|
|
20
21
|
|
|
21
22
|
from .converters import (
|
|
@@ -307,7 +308,7 @@ class QdrantDocumentStore:
|
|
|
307
308
|
)
|
|
308
309
|
return documents
|
|
309
310
|
|
|
310
|
-
def
|
|
311
|
+
def _query_by_sparse(
|
|
311
312
|
self,
|
|
312
313
|
query_sparse_embedding: SparseEmbedding,
|
|
313
314
|
filters: Optional[Dict[str, Any]] = None,
|
|
@@ -349,7 +350,7 @@ class QdrantDocumentStore:
|
|
|
349
350
|
document.score = score
|
|
350
351
|
return results
|
|
351
352
|
|
|
352
|
-
def
|
|
353
|
+
def _query_by_embedding(
|
|
353
354
|
self,
|
|
354
355
|
query_embedding: List[float],
|
|
355
356
|
filters: Optional[Dict[str, Any]] = None,
|
|
@@ -383,6 +384,86 @@ class QdrantDocumentStore:
|
|
|
383
384
|
document.score = score
|
|
384
385
|
return results
|
|
385
386
|
|
|
387
|
+
def _query_hybrid(
|
|
388
|
+
self,
|
|
389
|
+
query_embedding: List[float],
|
|
390
|
+
query_sparse_embedding: SparseEmbedding,
|
|
391
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
392
|
+
top_k: int = 10,
|
|
393
|
+
return_embedding: bool = False,
|
|
394
|
+
) -> List[Document]:
|
|
395
|
+
"""
|
|
396
|
+
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
397
|
+
|
|
398
|
+
This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
|
|
399
|
+
Use the `QdrantHybridRetriever` instead.
|
|
400
|
+
|
|
401
|
+
:param query_embedding: Dense embedding of the query.
|
|
402
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
403
|
+
:param filters: Filters applied to the retrieved Documents.
|
|
404
|
+
:param top_k: Maximum number of Documents to return.
|
|
405
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
406
|
+
|
|
407
|
+
:returns: List of Documents that are most similar to `query_embedding` and `query_sparse_embedding`.
|
|
408
|
+
|
|
409
|
+
:raises QdrantStoreError:
|
|
410
|
+
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
411
|
+
"""
|
|
412
|
+
|
|
413
|
+
# This implementation is based on the code from the Python Qdrant client:
|
|
414
|
+
# https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
|
|
415
|
+
if not self.use_sparse_embeddings:
|
|
416
|
+
message = (
|
|
417
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
418
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
419
|
+
)
|
|
420
|
+
raise QdrantStoreError(message)
|
|
421
|
+
|
|
422
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
423
|
+
|
|
424
|
+
sparse_request = rest.SearchRequest(
|
|
425
|
+
vector=rest.NamedSparseVector(
|
|
426
|
+
name=SPARSE_VECTORS_NAME,
|
|
427
|
+
vector=rest.SparseVector(
|
|
428
|
+
indices=query_sparse_embedding.indices,
|
|
429
|
+
values=query_sparse_embedding.values,
|
|
430
|
+
),
|
|
431
|
+
),
|
|
432
|
+
filter=qdrant_filters,
|
|
433
|
+
limit=top_k,
|
|
434
|
+
with_payload=True,
|
|
435
|
+
with_vector=return_embedding,
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
dense_request = rest.SearchRequest(
|
|
439
|
+
vector=rest.NamedVector(
|
|
440
|
+
name=DENSE_VECTORS_NAME,
|
|
441
|
+
vector=query_embedding,
|
|
442
|
+
),
|
|
443
|
+
filter=qdrant_filters,
|
|
444
|
+
limit=top_k,
|
|
445
|
+
with_payload=True,
|
|
446
|
+
with_vector=return_embedding,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
try:
|
|
450
|
+
dense_request_response, sparse_request_response = self.client.search_batch(
|
|
451
|
+
collection_name=self.index, requests=[dense_request, sparse_request]
|
|
452
|
+
)
|
|
453
|
+
except Exception as e:
|
|
454
|
+
msg = "Error during hybrid search"
|
|
455
|
+
raise QdrantStoreError(msg) from e
|
|
456
|
+
|
|
457
|
+
try:
|
|
458
|
+
points = reciprocal_rank_fusion(responses=[dense_request_response, sparse_request_response], limit=top_k)
|
|
459
|
+
except Exception as e:
|
|
460
|
+
msg = "Error while applying Reciprocal Rank Fusion"
|
|
461
|
+
raise QdrantStoreError(msg) from e
|
|
462
|
+
|
|
463
|
+
results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
|
|
464
|
+
|
|
465
|
+
return results
|
|
466
|
+
|
|
386
467
|
def _get_distance(self, similarity: str) -> rest.Distance:
|
|
387
468
|
try:
|
|
388
469
|
return self.SIMILARITY[similarity]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
from haystack.dataclasses import SparseEmbedding
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.fixture(scope="session")
|
|
7
|
+
def generate_sparse_embedding():
|
|
8
|
+
"""
|
|
9
|
+
This fixture returns a function that generates a random SparseEmbedding each time it is called.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
def _generate_random_sparse_embedding():
|
|
13
|
+
random_indice_length = np.random.randint(3, 15)
|
|
14
|
+
indices = list(range(random_indice_length))
|
|
15
|
+
values = [np.random.random_sample() for _ in range(random_indice_length)]
|
|
16
|
+
return SparseEmbedding(indices=indices, values=values)
|
|
17
|
+
|
|
18
|
+
return _generate_random_sparse_embedding
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from unittest.mock import patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
from haystack import Document
|
|
6
|
+
from haystack.dataclasses import SparseEmbedding
|
|
7
|
+
from haystack.document_stores.errors import DuplicateDocumentError
|
|
8
|
+
from haystack.document_stores.types import DuplicatePolicy
|
|
9
|
+
from haystack.testing.document_store import (
|
|
10
|
+
CountDocumentsTest,
|
|
11
|
+
DeleteDocumentsTest,
|
|
12
|
+
WriteDocumentsTest,
|
|
13
|
+
_random_embeddings,
|
|
14
|
+
)
|
|
15
|
+
from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
|
|
19
|
+
@pytest.fixture
|
|
20
|
+
def document_store(self) -> QdrantDocumentStore:
|
|
21
|
+
return QdrantDocumentStore(
|
|
22
|
+
":memory:",
|
|
23
|
+
recreate_index=True,
|
|
24
|
+
return_embedding=True,
|
|
25
|
+
wait_result_from_api=True,
|
|
26
|
+
use_sparse_embeddings=False,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
|
|
30
|
+
"""
|
|
31
|
+
Assert that two lists of Documents are equal.
|
|
32
|
+
This is used in every test.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
# Check that the lengths of the lists are the same
|
|
36
|
+
assert len(received) == len(expected)
|
|
37
|
+
|
|
38
|
+
# Check that the sets are equal, meaning the content and IDs match regardless of order
|
|
39
|
+
assert {doc.id for doc in received} == {doc.id for doc in expected}
|
|
40
|
+
|
|
41
|
+
def test_write_documents(self, document_store: QdrantDocumentStore):
|
|
42
|
+
docs = [Document(id="1")]
|
|
43
|
+
assert document_store.write_documents(docs) == 1
|
|
44
|
+
with pytest.raises(DuplicateDocumentError):
|
|
45
|
+
document_store.write_documents(docs, DuplicatePolicy.FAIL)
|
|
46
|
+
|
|
47
|
+
def test_query_hybrid(self, generate_sparse_embedding):
|
|
48
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
49
|
+
|
|
50
|
+
docs = []
|
|
51
|
+
for i in range(20):
|
|
52
|
+
docs.append(
|
|
53
|
+
Document(
|
|
54
|
+
content=f"doc {i}", sparse_embedding=generate_sparse_embedding(), embedding=_random_embeddings(768)
|
|
55
|
+
)
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
document_store.write_documents(docs)
|
|
59
|
+
|
|
60
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
61
|
+
embedding = [0.1] * 768
|
|
62
|
+
|
|
63
|
+
results: List[Document] = document_store._query_hybrid(
|
|
64
|
+
query_sparse_embedding=sparse_embedding, query_embedding=embedding, top_k=10, return_embedding=True
|
|
65
|
+
)
|
|
66
|
+
assert len(results) == 10
|
|
67
|
+
|
|
68
|
+
for document in results:
|
|
69
|
+
assert document.sparse_embedding
|
|
70
|
+
assert document.embedding
|
|
71
|
+
|
|
72
|
+
def test_query_hybrid_fail_without_sparse_embedding(self, document_store):
|
|
73
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
74
|
+
embedding = [0.1] * 768
|
|
75
|
+
|
|
76
|
+
with pytest.raises(QdrantStoreError):
|
|
77
|
+
|
|
78
|
+
document_store._query_hybrid(
|
|
79
|
+
query_sparse_embedding=sparse_embedding,
|
|
80
|
+
query_embedding=embedding,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
def test_query_hybrid_search_batch_failure(self):
|
|
84
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
85
|
+
|
|
86
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
87
|
+
embedding = [0.1] * 768
|
|
88
|
+
|
|
89
|
+
with patch.object(document_store.client, "search_batch", side_effect=Exception("search_batch error")):
|
|
90
|
+
|
|
91
|
+
with pytest.raises(QdrantStoreError):
|
|
92
|
+
document_store._query_hybrid(query_sparse_embedding=sparse_embedding, query_embedding=embedding)
|
|
93
|
+
|
|
94
|
+
@patch("haystack_integrations.document_stores.qdrant.document_store.reciprocal_rank_fusion")
|
|
95
|
+
def test_query_hybrid_reciprocal_rank_fusion_failure(self, mocked_fusion):
|
|
96
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
97
|
+
|
|
98
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
99
|
+
embedding = [0.1] * 768
|
|
100
|
+
|
|
101
|
+
mocked_fusion.side_effect = Exception("reciprocal_rank_fusion error")
|
|
102
|
+
|
|
103
|
+
with pytest.raises(QdrantStoreError):
|
|
104
|
+
document_store._query_hybrid(query_sparse_embedding=sparse_embedding, query_embedding=embedding)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import List
|
|
2
|
+
from unittest.mock import Mock
|
|
2
3
|
|
|
3
|
-
import numpy as np
|
|
4
4
|
from haystack.dataclasses import Document, SparseEmbedding
|
|
5
5
|
from haystack.testing.document_store import (
|
|
6
6
|
FilterableDocsFixtureMixin,
|
|
@@ -8,6 +8,7 @@ from haystack.testing.document_store import (
|
|
|
8
8
|
)
|
|
9
9
|
from haystack_integrations.components.retrievers.qdrant import (
|
|
10
10
|
QdrantEmbeddingRetriever,
|
|
11
|
+
QdrantHybridRetriever,
|
|
11
12
|
QdrantSparseEmbeddingRetriever,
|
|
12
13
|
)
|
|
13
14
|
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
@@ -222,23 +223,12 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
222
223
|
assert retriever._scale_score is False
|
|
223
224
|
assert retriever._return_embedding is True
|
|
224
225
|
|
|
225
|
-
def
|
|
226
|
-
list_of_sparse_vectors = []
|
|
227
|
-
for _ in range(n):
|
|
228
|
-
random_indice_length = np.random.randint(3, 15)
|
|
229
|
-
data = {
|
|
230
|
-
"indices": list(range(random_indice_length)),
|
|
231
|
-
"values": [np.random.random_sample() for _ in range(random_indice_length)],
|
|
232
|
-
}
|
|
233
|
-
list_of_sparse_vectors.append(data)
|
|
234
|
-
return list_of_sparse_vectors
|
|
235
|
-
|
|
236
|
-
def test_run(self, filterable_docs: List[Document]):
|
|
226
|
+
def test_run(self, filterable_docs: List[Document], generate_sparse_embedding):
|
|
237
227
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
238
228
|
|
|
239
229
|
# Add fake sparse embedding to documents
|
|
240
230
|
for doc in filterable_docs:
|
|
241
|
-
doc.sparse_embedding =
|
|
231
|
+
doc.sparse_embedding = generate_sparse_embedding()
|
|
242
232
|
|
|
243
233
|
document_store.write_documents(filterable_docs)
|
|
244
234
|
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
@@ -252,3 +242,112 @@ class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
|
252
242
|
|
|
253
243
|
for document in results:
|
|
254
244
|
assert document.sparse_embedding
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class TestQdrantHybridRetriever:
|
|
248
|
+
def test_init_default(self):
|
|
249
|
+
document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True)
|
|
250
|
+
retriever = QdrantHybridRetriever(document_store=document_store)
|
|
251
|
+
|
|
252
|
+
assert retriever._document_store == document_store
|
|
253
|
+
assert retriever._filters is None
|
|
254
|
+
assert retriever._top_k == 10
|
|
255
|
+
assert retriever._return_embedding is False
|
|
256
|
+
|
|
257
|
+
def test_to_dict(self):
|
|
258
|
+
document_store = QdrantDocumentStore(location=":memory:", index="test")
|
|
259
|
+
retriever = QdrantHybridRetriever(document_store=document_store, top_k=5, return_embedding=True)
|
|
260
|
+
res = retriever.to_dict()
|
|
261
|
+
assert res == {
|
|
262
|
+
"type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever",
|
|
263
|
+
"init_parameters": {
|
|
264
|
+
"document_store": {
|
|
265
|
+
"type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
|
|
266
|
+
"init_parameters": {
|
|
267
|
+
"location": ":memory:",
|
|
268
|
+
"url": None,
|
|
269
|
+
"port": 6333,
|
|
270
|
+
"grpc_port": 6334,
|
|
271
|
+
"prefer_grpc": False,
|
|
272
|
+
"https": None,
|
|
273
|
+
"api_key": None,
|
|
274
|
+
"prefix": None,
|
|
275
|
+
"timeout": None,
|
|
276
|
+
"host": None,
|
|
277
|
+
"path": None,
|
|
278
|
+
"index": "test",
|
|
279
|
+
"embedding_dim": 768,
|
|
280
|
+
"on_disk": False,
|
|
281
|
+
"content_field": "content",
|
|
282
|
+
"name_field": "name",
|
|
283
|
+
"embedding_field": "embedding",
|
|
284
|
+
"use_sparse_embeddings": False,
|
|
285
|
+
"similarity": "cosine",
|
|
286
|
+
"return_embedding": False,
|
|
287
|
+
"progress_bar": True,
|
|
288
|
+
"duplicate_documents": "overwrite",
|
|
289
|
+
"recreate_index": False,
|
|
290
|
+
"shard_number": None,
|
|
291
|
+
"replication_factor": None,
|
|
292
|
+
"write_consistency_factor": None,
|
|
293
|
+
"on_disk_payload": None,
|
|
294
|
+
"hnsw_config": None,
|
|
295
|
+
"optimizers_config": None,
|
|
296
|
+
"wal_config": None,
|
|
297
|
+
"quantization_config": None,
|
|
298
|
+
"init_from": None,
|
|
299
|
+
"wait_result_from_api": True,
|
|
300
|
+
"metadata": {},
|
|
301
|
+
"write_batch_size": 100,
|
|
302
|
+
"scroll_size": 10000,
|
|
303
|
+
"payload_fields_to_index": None,
|
|
304
|
+
},
|
|
305
|
+
},
|
|
306
|
+
"filters": None,
|
|
307
|
+
"top_k": 5,
|
|
308
|
+
"return_embedding": True,
|
|
309
|
+
},
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
def test_from_dict(self):
|
|
313
|
+
data = {
|
|
314
|
+
"type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever",
|
|
315
|
+
"init_parameters": {
|
|
316
|
+
"document_store": {
|
|
317
|
+
"init_parameters": {"location": ":memory:", "index": "test"},
|
|
318
|
+
"type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
|
|
319
|
+
},
|
|
320
|
+
"filters": None,
|
|
321
|
+
"top_k": 5,
|
|
322
|
+
"return_embedding": True,
|
|
323
|
+
},
|
|
324
|
+
}
|
|
325
|
+
retriever = QdrantHybridRetriever.from_dict(data)
|
|
326
|
+
assert isinstance(retriever._document_store, QdrantDocumentStore)
|
|
327
|
+
assert retriever._document_store.index == "test"
|
|
328
|
+
assert retriever._filters is None
|
|
329
|
+
assert retriever._top_k == 5
|
|
330
|
+
assert retriever._return_embedding
|
|
331
|
+
|
|
332
|
+
def test_run(self):
|
|
333
|
+
mock_store = Mock(spec=QdrantDocumentStore)
|
|
334
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
335
|
+
mock_store._query_hybrid.return_value = [
|
|
336
|
+
Document(content="Test doc", embedding=[0.1, 0.2], sparse_embedding=sparse_embedding)
|
|
337
|
+
]
|
|
338
|
+
|
|
339
|
+
retriever = QdrantHybridRetriever(document_store=mock_store)
|
|
340
|
+
res = retriever.run(
|
|
341
|
+
query_embedding=[0.5, 0.7], query_sparse_embedding=SparseEmbedding(indices=[0, 5], values=[0.1, 0.7])
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
call_args = mock_store._query_hybrid.call_args
|
|
345
|
+
assert call_args[1]["query_embedding"] == [0.5, 0.7]
|
|
346
|
+
assert call_args[1]["query_sparse_embedding"].indices == [0, 5]
|
|
347
|
+
assert call_args[1]["query_sparse_embedding"].values == [0.1, 0.7]
|
|
348
|
+
assert call_args[1]["top_k"] == 10
|
|
349
|
+
assert call_args[1]["return_embedding"] is False
|
|
350
|
+
|
|
351
|
+
assert res["documents"][0].content == "Test doc"
|
|
352
|
+
assert res["documents"][0].embedding == [0.1, 0.2]
|
|
353
|
+
assert res["documents"][0].sparse_embedding == sparse_embedding
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
from haystack import Document
|
|
5
|
-
from haystack.document_stores.errors import DuplicateDocumentError
|
|
6
|
-
from haystack.document_stores.types import DuplicatePolicy
|
|
7
|
-
from haystack.testing.document_store import (
|
|
8
|
-
CountDocumentsTest,
|
|
9
|
-
DeleteDocumentsTest,
|
|
10
|
-
WriteDocumentsTest,
|
|
11
|
-
)
|
|
12
|
-
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class TestQdrantStoreBaseTests(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
|
|
16
|
-
@pytest.fixture
|
|
17
|
-
def document_store(self) -> QdrantDocumentStore:
|
|
18
|
-
return QdrantDocumentStore(
|
|
19
|
-
":memory:",
|
|
20
|
-
recreate_index=True,
|
|
21
|
-
return_embedding=True,
|
|
22
|
-
wait_result_from_api=True,
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
|
|
26
|
-
"""
|
|
27
|
-
Assert that two lists of Documents are equal.
|
|
28
|
-
This is used in every test.
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
# Check that the lengths of the lists are the same
|
|
32
|
-
assert len(received) == len(expected)
|
|
33
|
-
|
|
34
|
-
# Check that the sets are equal, meaning the content and IDs match regardless of order
|
|
35
|
-
assert {doc.id for doc in received} == {doc.id for doc in expected}
|
|
36
|
-
|
|
37
|
-
def test_write_documents(self, document_store: QdrantDocumentStore):
|
|
38
|
-
docs = [Document(id="1")]
|
|
39
|
-
assert document_store.write_documents(docs) == 1
|
|
40
|
-
with pytest.raises(DuplicateDocumentError):
|
|
41
|
-
document_store.write_documents(docs, DuplicatePolicy.FAIL)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|