qdrant-haystack 3.3.1__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/PKG-INFO +2 -1
- qdrant_haystack-3.5.0/examples/embedding_retrieval.py +52 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/pyproject.toml +3 -0
- qdrant_haystack-3.5.0/src/haystack_integrations/components/retrievers/qdrant/__init__.py +7 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/src/haystack_integrations/components/retrievers/qdrant/retriever.py +136 -9
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/document_store.py +83 -2
- qdrant_haystack-3.5.0/tests/conftest.py +18 -0
- qdrant_haystack-3.5.0/tests/test_document_store.py +104 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/tests/test_retriever.py +121 -22
- qdrant_haystack-3.3.1/src/haystack_integrations/components/retrievers/qdrant/__init__.py +0 -7
- qdrant_haystack-3.3.1/tests/test_document_store.py +0 -41
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/.gitignore +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/LICENSE.txt +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/README.md +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/pydoc/config.yml +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/converters.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/filters.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/tests/__init__.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/tests/test_converters.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/tests/test_dict_converters.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/tests/test_filters.py +0 -0
- {qdrant_haystack-3.3.1 → qdrant_haystack-3.5.0}/tests/test_legacy_filters.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 3.3.1
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -9,6 +9,7 @@ Author-email: Kacper Łukawski <kacper.lukawski@qdrant.com>, Anush Shetty <anush
|
|
|
9
9
|
License-Expression: Apache-2.0
|
|
10
10
|
License-File: LICENSE.txt
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
13
|
Classifier: Programming Language :: Python
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.8
|
|
14
15
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Install required packages for this example, including qdrant-haystack and other libraries needed
|
|
2
|
+
# for Markdown conversion and embeddings generation. Use the following command:
|
|
3
|
+
# pip install qdrant-haystack markdown-it-py mdit_plain sentence-transformers
|
|
4
|
+
|
|
5
|
+
# Download some Markdown files to index.
|
|
6
|
+
# git clone https://github.com/anakin87/neural-search-pills
|
|
7
|
+
|
|
8
|
+
import glob
|
|
9
|
+
|
|
10
|
+
from haystack import Pipeline
|
|
11
|
+
from haystack.components.converters import MarkdownToDocument
|
|
12
|
+
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
|
|
13
|
+
from haystack.components.preprocessors import DocumentSplitter
|
|
14
|
+
from haystack.components.writers import DocumentWriter
|
|
15
|
+
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
|
|
16
|
+
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
17
|
+
|
|
18
|
+
# Initialize QdrantDocumentStore: for simplicity, we use an in-memory store here.
|
|
19
|
+
# You can also run a Qdrant instance using Docker or use Qdrant Cloud.
|
|
20
|
+
document_store = QdrantDocumentStore(
|
|
21
|
+
":memory:",
|
|
22
|
+
index="Document",
|
|
23
|
+
embedding_dim=768,
|
|
24
|
+
recreate_index=True,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Create the indexing Pipeline and index some documents
|
|
28
|
+
file_paths = glob.glob("neural-search-pills/pills/*.md")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
indexing = Pipeline()
|
|
32
|
+
indexing.add_component("converter", MarkdownToDocument())
|
|
33
|
+
indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
|
|
34
|
+
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
|
|
35
|
+
indexing.add_component("writer", DocumentWriter(document_store))
|
|
36
|
+
indexing.connect("converter", "splitter")
|
|
37
|
+
indexing.connect("splitter", "embedder")
|
|
38
|
+
indexing.connect("embedder", "writer")
|
|
39
|
+
|
|
40
|
+
indexing.run({"converter": {"sources": file_paths}})
|
|
41
|
+
|
|
42
|
+
# Create the querying Pipeline and try a query
|
|
43
|
+
querying = Pipeline()
|
|
44
|
+
querying.add_component("embedder", SentenceTransformersTextEmbedder())
|
|
45
|
+
querying.add_component("retriever", QdrantEmbeddingRetriever(document_store=document_store, top_k=3))
|
|
46
|
+
querying.connect("embedder", "retriever")
|
|
47
|
+
|
|
48
|
+
results = querying.run({"embedder": {"text": "What is a cross-encoder?"}})
|
|
49
|
+
|
|
50
|
+
for doc in results["retriever"]["documents"]:
|
|
51
|
+
print(doc)
|
|
52
|
+
print("-" * 10)
|
|
@@ -15,6 +15,7 @@ authors = [
|
|
|
15
15
|
{ name = "Anush Shetty", email = "anush.shetty@qdrant.com" },
|
|
16
16
|
]
|
|
17
17
|
classifiers = [
|
|
18
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
19
|
"Development Status :: 4 - Beta",
|
|
19
20
|
"Programming Language :: Python",
|
|
20
21
|
"Programming Language :: Python :: 3.8",
|
|
@@ -127,6 +128,8 @@ ban-relative-imports = "parents"
|
|
|
127
128
|
[tool.ruff.per-file-ignores]
|
|
128
129
|
# Tests can use magic values, assertions, and relative imports
|
|
129
130
|
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
|
131
|
+
# examples can contain "print" commands
|
|
132
|
+
"examples/**/*" = ["T201"]
|
|
130
133
|
|
|
131
134
|
|
|
132
135
|
[tool.coverage.run]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseEmbeddingRetriever
|
|
6
|
+
|
|
7
|
+
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseEmbeddingRetriever", "QdrantHybridRetriever")
|
|
@@ -19,8 +19,10 @@ class QdrantEmbeddingRetriever:
|
|
|
19
19
|
":memory:",
|
|
20
20
|
recreate_index=True,
|
|
21
21
|
return_embedding=True,
|
|
22
|
-
wait_result_from_api=True,
|
|
23
22
|
)
|
|
23
|
+
|
|
24
|
+
document_store.write_documents([Document(content="test", embedding=[0.5]*768)])
|
|
25
|
+
|
|
24
26
|
retriever = QdrantEmbeddingRetriever(document_store=document_store)
|
|
25
27
|
|
|
26
28
|
# using a fake vector to keep the example simple
|
|
@@ -112,7 +114,7 @@ class QdrantEmbeddingRetriever:
|
|
|
112
114
|
The retrieved documents.
|
|
113
115
|
|
|
114
116
|
"""
|
|
115
|
-
docs = self._document_store.
|
|
117
|
+
docs = self._document_store._query_by_embedding(
|
|
116
118
|
query_embedding=query_embedding,
|
|
117
119
|
filters=filters or self._filters,
|
|
118
120
|
top_k=top_k or self._top_k,
|
|
@@ -124,23 +126,27 @@ class QdrantEmbeddingRetriever:
|
|
|
124
126
|
|
|
125
127
|
|
|
126
128
|
@component
|
|
127
|
-
class QdrantSparseRetriever:
|
|
129
|
+
class QdrantSparseEmbeddingRetriever:
|
|
128
130
|
"""
|
|
129
131
|
A component for retrieving documents from an QdrantDocumentStore using sparse vectors.
|
|
130
132
|
|
|
131
133
|
Usage example:
|
|
132
134
|
```python
|
|
133
|
-
from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
|
|
135
|
+
from haystack_integrations.components.retrievers.qdrant import QdrantSparseEmbeddingRetriever
|
|
134
136
|
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
135
137
|
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
136
138
|
|
|
137
139
|
document_store = QdrantDocumentStore(
|
|
138
140
|
":memory:",
|
|
141
|
+
use_sparse_embeddings=True,
|
|
139
142
|
recreate_index=True,
|
|
140
143
|
return_embedding=True,
|
|
141
|
-
wait_result_from_api=True,
|
|
142
144
|
)
|
|
143
|
-
|
|
145
|
+
|
|
146
|
+
doc = Document(content="test", sparse_embedding=SparseEmbedding(indices=[0, 3, 5], values=[0.1, 0.5, 0.12]))
|
|
147
|
+
document_store.write_documents([doc])
|
|
148
|
+
|
|
149
|
+
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
144
150
|
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
145
151
|
retriever.run(query_sparse_embedding=sparse_embedding)
|
|
146
152
|
```
|
|
@@ -155,7 +161,7 @@ class QdrantSparseRetriever:
|
|
|
155
161
|
return_embedding: bool = False,
|
|
156
162
|
):
|
|
157
163
|
"""
|
|
158
|
-
Create a QdrantSparseRetriever component.
|
|
164
|
+
Create a QdrantSparseEmbeddingRetriever component.
|
|
159
165
|
|
|
160
166
|
:param document_store: An instance of QdrantDocumentStore.
|
|
161
167
|
:param filters: A dictionary with filters to narrow down the search space. Default is None.
|
|
@@ -196,7 +202,7 @@ class QdrantSparseRetriever:
|
|
|
196
202
|
return d
|
|
197
203
|
|
|
198
204
|
@classmethod
|
|
199
|
-
def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseRetriever":
|
|
205
|
+
def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseEmbeddingRetriever":
|
|
200
206
|
"""
|
|
201
207
|
Deserializes the component from a dictionary.
|
|
202
208
|
|
|
@@ -230,7 +236,7 @@ class QdrantSparseRetriever:
|
|
|
230
236
|
The retrieved documents.
|
|
231
237
|
|
|
232
238
|
"""
|
|
233
|
-
docs = self._document_store.
|
|
239
|
+
docs = self._document_store._query_by_sparse(
|
|
234
240
|
query_sparse_embedding=query_sparse_embedding,
|
|
235
241
|
filters=filters or self._filters,
|
|
236
242
|
top_k=top_k or self._top_k,
|
|
@@ -239,3 +245,124 @@ class QdrantSparseRetriever:
|
|
|
239
245
|
)
|
|
240
246
|
|
|
241
247
|
return {"documents": docs}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
@component
|
|
251
|
+
class QdrantHybridRetriever:
|
|
252
|
+
"""
|
|
253
|
+
A component for retrieving documents from an QdrantDocumentStore using both dense and sparse vectors
|
|
254
|
+
and fusing the results using Reciprocal Rank Fusion.
|
|
255
|
+
|
|
256
|
+
Usage example:
|
|
257
|
+
```python
|
|
258
|
+
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
|
|
259
|
+
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
260
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
261
|
+
|
|
262
|
+
document_store = QdrantDocumentStore(
|
|
263
|
+
":memory:",
|
|
264
|
+
use_sparse_embeddings=True,
|
|
265
|
+
recreate_index=True,
|
|
266
|
+
return_embedding=True,
|
|
267
|
+
wait_result_from_api=True,
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
doc = Document(content="test",
|
|
271
|
+
embedding=[0.5]*768,
|
|
272
|
+
sparse_embedding=SparseEmbedding(indices=[0, 3, 5], values=[0.1, 0.5, 0.12]))
|
|
273
|
+
|
|
274
|
+
document_store.write_documents([doc])
|
|
275
|
+
|
|
276
|
+
retriever = QdrantHybridRetriever(document_store=document_store)
|
|
277
|
+
embedding = [0.1]*768
|
|
278
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
279
|
+
retriever.run(query_embedding=embedding, query_sparse_embedding=sparse_embedding)
|
|
280
|
+
```
|
|
281
|
+
"""
|
|
282
|
+
|
|
283
|
+
def __init__(
|
|
284
|
+
self,
|
|
285
|
+
document_store: QdrantDocumentStore,
|
|
286
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
287
|
+
top_k: int = 10,
|
|
288
|
+
return_embedding: bool = False,
|
|
289
|
+
):
|
|
290
|
+
"""
|
|
291
|
+
Create a QdrantHybridRetriever component.
|
|
292
|
+
|
|
293
|
+
:param document_store: An instance of QdrantDocumentStore.
|
|
294
|
+
:param filters: A dictionary with filters to narrow down the search space.
|
|
295
|
+
:param top_k: The maximum number of documents to retrieve.
|
|
296
|
+
:param return_embedding: Whether to return the embeddings of the retrieved Documents.
|
|
297
|
+
|
|
298
|
+
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
|
|
299
|
+
"""
|
|
300
|
+
|
|
301
|
+
if not isinstance(document_store, QdrantDocumentStore):
|
|
302
|
+
msg = "document_store must be an instance of QdrantDocumentStore"
|
|
303
|
+
raise ValueError(msg)
|
|
304
|
+
|
|
305
|
+
self._document_store = document_store
|
|
306
|
+
self._filters = filters
|
|
307
|
+
self._top_k = top_k
|
|
308
|
+
self._return_embedding = return_embedding
|
|
309
|
+
|
|
310
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
311
|
+
"""
|
|
312
|
+
Serializes the component to a dictionary.
|
|
313
|
+
|
|
314
|
+
:returns:
|
|
315
|
+
Dictionary with serialized data.
|
|
316
|
+
"""
|
|
317
|
+
return default_to_dict(
|
|
318
|
+
self,
|
|
319
|
+
document_store=self._document_store.to_dict(),
|
|
320
|
+
filters=self._filters,
|
|
321
|
+
top_k=self._top_k,
|
|
322
|
+
return_embedding=self._return_embedding,
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
@classmethod
|
|
326
|
+
def from_dict(cls, data: Dict[str, Any]) -> "QdrantHybridRetriever":
|
|
327
|
+
"""
|
|
328
|
+
Deserializes the component from a dictionary.
|
|
329
|
+
|
|
330
|
+
:param data:
|
|
331
|
+
Dictionary to deserialize from.
|
|
332
|
+
:returns:
|
|
333
|
+
Deserialized component.
|
|
334
|
+
"""
|
|
335
|
+
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
|
|
336
|
+
data["init_parameters"]["document_store"] = document_store
|
|
337
|
+
return default_from_dict(cls, data)
|
|
338
|
+
|
|
339
|
+
@component.output_types(documents=List[Document])
|
|
340
|
+
def run(
|
|
341
|
+
self,
|
|
342
|
+
query_embedding: List[float],
|
|
343
|
+
query_sparse_embedding: SparseEmbedding,
|
|
344
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
345
|
+
top_k: Optional[int] = None,
|
|
346
|
+
return_embedding: Optional[bool] = None,
|
|
347
|
+
):
|
|
348
|
+
"""
|
|
349
|
+
Run the Sparse Embedding Retriever on the given input data.
|
|
350
|
+
|
|
351
|
+
:param query_embedding: Dense embedding of the query.
|
|
352
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
353
|
+
:param filters: A dictionary with filters to narrow down the search space.
|
|
354
|
+
:param top_k: The maximum number of documents to return.
|
|
355
|
+
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
356
|
+
:returns:
|
|
357
|
+
The retrieved documents.
|
|
358
|
+
|
|
359
|
+
"""
|
|
360
|
+
docs = self._document_store._query_hybrid(
|
|
361
|
+
query_embedding=query_embedding,
|
|
362
|
+
query_sparse_embedding=query_sparse_embedding,
|
|
363
|
+
filters=filters or self._filters,
|
|
364
|
+
top_k=top_k or self._top_k,
|
|
365
|
+
return_embedding=return_embedding or self._return_embedding,
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
return {"documents": docs}
|
|
@@ -16,6 +16,7 @@ from haystack.utils.filters import convert as convert_legacy_filters
|
|
|
16
16
|
from qdrant_client import grpc
|
|
17
17
|
from qdrant_client.http import models as rest
|
|
18
18
|
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
19
|
+
from qdrant_client.hybrid.fusion import reciprocal_rank_fusion
|
|
19
20
|
from tqdm import tqdm
|
|
20
21
|
|
|
21
22
|
from .converters import (
|
|
@@ -307,7 +308,7 @@ class QdrantDocumentStore:
|
|
|
307
308
|
)
|
|
308
309
|
return documents
|
|
309
310
|
|
|
310
|
-
def
|
|
311
|
+
def _query_by_sparse(
|
|
311
312
|
self,
|
|
312
313
|
query_sparse_embedding: SparseEmbedding,
|
|
313
314
|
filters: Optional[Dict[str, Any]] = None,
|
|
@@ -349,7 +350,7 @@ class QdrantDocumentStore:
|
|
|
349
350
|
document.score = score
|
|
350
351
|
return results
|
|
351
352
|
|
|
352
|
-
def
|
|
353
|
+
def _query_by_embedding(
|
|
353
354
|
self,
|
|
354
355
|
query_embedding: List[float],
|
|
355
356
|
filters: Optional[Dict[str, Any]] = None,
|
|
@@ -383,6 +384,86 @@ class QdrantDocumentStore:
|
|
|
383
384
|
document.score = score
|
|
384
385
|
return results
|
|
385
386
|
|
|
387
|
+
def _query_hybrid(
|
|
388
|
+
self,
|
|
389
|
+
query_embedding: List[float],
|
|
390
|
+
query_sparse_embedding: SparseEmbedding,
|
|
391
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
392
|
+
top_k: int = 10,
|
|
393
|
+
return_embedding: bool = False,
|
|
394
|
+
) -> List[Document]:
|
|
395
|
+
"""
|
|
396
|
+
Retrieves documents based on dense and sparse embeddings and fuses the results using Reciprocal Rank Fusion.
|
|
397
|
+
|
|
398
|
+
This method is not part of the public interface of `QdrantDocumentStore` and shouldn't be used directly.
|
|
399
|
+
Use the `QdrantHybridRetriever` instead.
|
|
400
|
+
|
|
401
|
+
:param query_embedding: Dense embedding of the query.
|
|
402
|
+
:param query_sparse_embedding: Sparse embedding of the query.
|
|
403
|
+
:param filters: Filters applied to the retrieved Documents.
|
|
404
|
+
:param top_k: Maximum number of Documents to return.
|
|
405
|
+
:param return_embedding: Whether to return the embeddings of the retrieved documents.
|
|
406
|
+
|
|
407
|
+
:returns: List of Document that are most similar to `query_embedding` and `query_sparse_embedding`.
|
|
408
|
+
|
|
409
|
+
:raises QdrantStoreError:
|
|
410
|
+
If the Document Store was initialized with `use_sparse_embeddings=False`.
|
|
411
|
+
"""
|
|
412
|
+
|
|
413
|
+
# This implementation is based on the code from the Python Qdrant client:
|
|
414
|
+
# https://github.com/qdrant/qdrant-client/blob/8e3ea58f781e4110d11c0a6985b5e6bb66b85d33/qdrant_client/qdrant_fastembed.py#L519
|
|
415
|
+
if not self.use_sparse_embeddings:
|
|
416
|
+
message = (
|
|
417
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
418
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
419
|
+
)
|
|
420
|
+
raise QdrantStoreError(message)
|
|
421
|
+
|
|
422
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
423
|
+
|
|
424
|
+
sparse_request = rest.SearchRequest(
|
|
425
|
+
vector=rest.NamedSparseVector(
|
|
426
|
+
name=SPARSE_VECTORS_NAME,
|
|
427
|
+
vector=rest.SparseVector(
|
|
428
|
+
indices=query_sparse_embedding.indices,
|
|
429
|
+
values=query_sparse_embedding.values,
|
|
430
|
+
),
|
|
431
|
+
),
|
|
432
|
+
filter=qdrant_filters,
|
|
433
|
+
limit=top_k,
|
|
434
|
+
with_payload=True,
|
|
435
|
+
with_vector=return_embedding,
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
dense_request = rest.SearchRequest(
|
|
439
|
+
vector=rest.NamedVector(
|
|
440
|
+
name=DENSE_VECTORS_NAME,
|
|
441
|
+
vector=query_embedding,
|
|
442
|
+
),
|
|
443
|
+
filter=qdrant_filters,
|
|
444
|
+
limit=top_k,
|
|
445
|
+
with_payload=True,
|
|
446
|
+
with_vector=return_embedding,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
try:
|
|
450
|
+
dense_request_response, sparse_request_response = self.client.search_batch(
|
|
451
|
+
collection_name=self.index, requests=[dense_request, sparse_request]
|
|
452
|
+
)
|
|
453
|
+
except Exception as e:
|
|
454
|
+
msg = "Error during hybrid search"
|
|
455
|
+
raise QdrantStoreError(msg) from e
|
|
456
|
+
|
|
457
|
+
try:
|
|
458
|
+
points = reciprocal_rank_fusion(responses=[dense_request_response, sparse_request_response], limit=top_k)
|
|
459
|
+
except Exception as e:
|
|
460
|
+
msg = "Error while applying Reciprocal Rank Fusion"
|
|
461
|
+
raise QdrantStoreError(msg) from e
|
|
462
|
+
|
|
463
|
+
results = [convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) for point in points]
|
|
464
|
+
|
|
465
|
+
return results
|
|
466
|
+
|
|
386
467
|
def _get_distance(self, similarity: str) -> rest.Distance:
|
|
387
468
|
try:
|
|
388
469
|
return self.SIMILARITY[similarity]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pytest
|
|
3
|
+
from haystack.dataclasses import SparseEmbedding
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.fixture(scope="session")
|
|
7
|
+
def generate_sparse_embedding():
|
|
8
|
+
"""
|
|
9
|
+
This fixture returns a function that generates a random SparseEmbedding each time it is called.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
def _generate_random_sparse_embedding():
|
|
13
|
+
random_indice_length = np.random.randint(3, 15)
|
|
14
|
+
indices = list(range(random_indice_length))
|
|
15
|
+
values = [np.random.random_sample() for _ in range(random_indice_length)]
|
|
16
|
+
return SparseEmbedding(indices=indices, values=values)
|
|
17
|
+
|
|
18
|
+
return _generate_random_sparse_embedding
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from unittest.mock import patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
from haystack import Document
|
|
6
|
+
from haystack.dataclasses import SparseEmbedding
|
|
7
|
+
from haystack.document_stores.errors import DuplicateDocumentError
|
|
8
|
+
from haystack.document_stores.types import DuplicatePolicy
|
|
9
|
+
from haystack.testing.document_store import (
|
|
10
|
+
CountDocumentsTest,
|
|
11
|
+
DeleteDocumentsTest,
|
|
12
|
+
WriteDocumentsTest,
|
|
13
|
+
_random_embeddings,
|
|
14
|
+
)
|
|
15
|
+
from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
|
|
19
|
+
@pytest.fixture
|
|
20
|
+
def document_store(self) -> QdrantDocumentStore:
|
|
21
|
+
return QdrantDocumentStore(
|
|
22
|
+
":memory:",
|
|
23
|
+
recreate_index=True,
|
|
24
|
+
return_embedding=True,
|
|
25
|
+
wait_result_from_api=True,
|
|
26
|
+
use_sparse_embeddings=False,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
|
|
30
|
+
"""
|
|
31
|
+
Assert that two lists of Documents are equal.
|
|
32
|
+
This is used in every test.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
# Check that the lengths of the lists are the same
|
|
36
|
+
assert len(received) == len(expected)
|
|
37
|
+
|
|
38
|
+
# Check that the sets are equal, meaning the content and IDs match regardless of order
|
|
39
|
+
assert {doc.id for doc in received} == {doc.id for doc in expected}
|
|
40
|
+
|
|
41
|
+
def test_write_documents(self, document_store: QdrantDocumentStore):
|
|
42
|
+
docs = [Document(id="1")]
|
|
43
|
+
assert document_store.write_documents(docs) == 1
|
|
44
|
+
with pytest.raises(DuplicateDocumentError):
|
|
45
|
+
document_store.write_documents(docs, DuplicatePolicy.FAIL)
|
|
46
|
+
|
|
47
|
+
def test_query_hybrid(self, generate_sparse_embedding):
|
|
48
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
49
|
+
|
|
50
|
+
docs = []
|
|
51
|
+
for i in range(20):
|
|
52
|
+
docs.append(
|
|
53
|
+
Document(
|
|
54
|
+
content=f"doc {i}", sparse_embedding=generate_sparse_embedding(), embedding=_random_embeddings(768)
|
|
55
|
+
)
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
document_store.write_documents(docs)
|
|
59
|
+
|
|
60
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
61
|
+
embedding = [0.1] * 768
|
|
62
|
+
|
|
63
|
+
results: List[Document] = document_store._query_hybrid(
|
|
64
|
+
query_sparse_embedding=sparse_embedding, query_embedding=embedding, top_k=10, return_embedding=True
|
|
65
|
+
)
|
|
66
|
+
assert len(results) == 10
|
|
67
|
+
|
|
68
|
+
for document in results:
|
|
69
|
+
assert document.sparse_embedding
|
|
70
|
+
assert document.embedding
|
|
71
|
+
|
|
72
|
+
def test_query_hybrid_fail_without_sparse_embedding(self, document_store):
|
|
73
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
74
|
+
embedding = [0.1] * 768
|
|
75
|
+
|
|
76
|
+
with pytest.raises(QdrantStoreError):
|
|
77
|
+
|
|
78
|
+
document_store._query_hybrid(
|
|
79
|
+
query_sparse_embedding=sparse_embedding,
|
|
80
|
+
query_embedding=embedding,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
def test_query_hybrid_search_batch_failure(self):
|
|
84
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
85
|
+
|
|
86
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
87
|
+
embedding = [0.1] * 768
|
|
88
|
+
|
|
89
|
+
with patch.object(document_store.client, "search_batch", side_effect=Exception("search_batch error")):
|
|
90
|
+
|
|
91
|
+
with pytest.raises(QdrantStoreError):
|
|
92
|
+
document_store._query_hybrid(query_sparse_embedding=sparse_embedding, query_embedding=embedding)
|
|
93
|
+
|
|
94
|
+
@patch("haystack_integrations.document_stores.qdrant.document_store.reciprocal_rank_fusion")
|
|
95
|
+
def test_query_hybrid_reciprocal_rank_fusion_failure(self, mocked_fusion):
|
|
96
|
+
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
|
|
97
|
+
|
|
98
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
99
|
+
embedding = [0.1] * 768
|
|
100
|
+
|
|
101
|
+
mocked_fusion.side_effect = Exception("reciprocal_rank_fusion error")
|
|
102
|
+
|
|
103
|
+
with pytest.raises(QdrantStoreError):
|
|
104
|
+
document_store._query_hybrid(query_sparse_embedding=sparse_embedding, query_embedding=embedding)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import List
|
|
2
|
+
from unittest.mock import Mock
|
|
2
3
|
|
|
3
|
-
import numpy as np
|
|
4
4
|
from haystack.dataclasses import Document, SparseEmbedding
|
|
5
5
|
from haystack.testing.document_store import (
|
|
6
6
|
FilterableDocsFixtureMixin,
|
|
@@ -8,7 +8,8 @@ from haystack.testing.document_store import (
|
|
|
8
8
|
)
|
|
9
9
|
from haystack_integrations.components.retrievers.qdrant import (
|
|
10
10
|
QdrantEmbeddingRetriever,
|
|
11
|
-
|
|
11
|
+
QdrantHybridRetriever,
|
|
12
|
+
QdrantSparseEmbeddingRetriever,
|
|
12
13
|
)
|
|
13
14
|
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
14
15
|
|
|
@@ -135,10 +136,10 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin):
|
|
|
135
136
|
assert document.embedding is None
|
|
136
137
|
|
|
137
138
|
|
|
138
|
-
class TestQdrantSparseRetriever(FilterableDocsFixtureMixin):
|
|
139
|
+
class TestQdrantSparseEmbeddingRetriever(FilterableDocsFixtureMixin):
|
|
139
140
|
def test_init_default(self):
|
|
140
141
|
document_store = QdrantDocumentStore(location=":memory:", index="test")
|
|
141
|
-
retriever = QdrantSparseRetriever(document_store=document_store)
|
|
142
|
+
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
142
143
|
assert retriever._document_store == document_store
|
|
143
144
|
assert retriever._filters is None
|
|
144
145
|
assert retriever._top_k == 10
|
|
@@ -146,10 +147,10 @@ class TestQdrantSparseRetriever(FilterableDocsFixtureMixin):
|
|
|
146
147
|
|
|
147
148
|
def test_to_dict(self):
|
|
148
149
|
document_store = QdrantDocumentStore(location=":memory:", index="test")
|
|
149
|
-
retriever = QdrantSparseRetriever(document_store=document_store)
|
|
150
|
+
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
150
151
|
res = retriever.to_dict()
|
|
151
152
|
assert res == {
|
|
152
|
-
"type": "haystack_integrations.components.retrievers.qdrant.retriever.
|
|
153
|
+
"type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantSparseEmbeddingRetriever",
|
|
153
154
|
"init_parameters": {
|
|
154
155
|
"document_store": {
|
|
155
156
|
"type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
|
|
@@ -202,7 +203,7 @@ class TestQdrantSparseRetriever(FilterableDocsFixtureMixin):
|
|
|
202
203
|
|
|
203
204
|
def test_from_dict(self):
|
|
204
205
|
data = {
|
|
205
|
-
"type": "haystack_integrations.components.retrievers.qdrant.retriever.
|
|
206
|
+
"type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantSparseEmbeddingRetriever",
|
|
206
207
|
"init_parameters": {
|
|
207
208
|
"document_store": {
|
|
208
209
|
"init_parameters": {"location": ":memory:", "index": "test"},
|
|
@@ -214,7 +215,7 @@ class TestQdrantSparseRetriever(FilterableDocsFixtureMixin):
|
|
|
214
215
|
"return_embedding": True,
|
|
215
216
|
},
|
|
216
217
|
}
|
|
217
|
-
retriever = QdrantSparseRetriever.from_dict(data)
|
|
218
|
+
retriever = QdrantSparseEmbeddingRetriever.from_dict(data)
|
|
218
219
|
assert isinstance(retriever._document_store, QdrantDocumentStore)
|
|
219
220
|
assert retriever._document_store.index == "test"
|
|
220
221
|
assert retriever._filters is None
|
|
@@ -222,26 +223,15 @@ class TestQdrantSparseRetriever(FilterableDocsFixtureMixin):
|
|
|
222
223
|
assert retriever._scale_score is False
|
|
223
224
|
assert retriever._return_embedding is True
|
|
224
225
|
|
|
225
|
-
def
|
|
226
|
-
list_of_sparse_vectors = []
|
|
227
|
-
for _ in range(n):
|
|
228
|
-
random_indice_length = np.random.randint(3, 15)
|
|
229
|
-
data = {
|
|
230
|
-
"indices": list(range(random_indice_length)),
|
|
231
|
-
"values": [np.random.random_sample() for _ in range(random_indice_length)],
|
|
232
|
-
}
|
|
233
|
-
list_of_sparse_vectors.append(data)
|
|
234
|
-
return list_of_sparse_vectors
|
|
235
|
-
|
|
236
|
-
def test_run(self, filterable_docs: List[Document]):
|
|
226
|
+
def test_run(self, filterable_docs: List[Document], generate_sparse_embedding):
|
|
237
227
|
document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True)
|
|
238
228
|
|
|
239
229
|
# Add fake sparse embedding to documents
|
|
240
230
|
for doc in filterable_docs:
|
|
241
|
-
doc.sparse_embedding =
|
|
231
|
+
doc.sparse_embedding = generate_sparse_embedding()
|
|
242
232
|
|
|
243
233
|
document_store.write_documents(filterable_docs)
|
|
244
|
-
retriever = QdrantSparseRetriever(document_store=document_store)
|
|
234
|
+
retriever = QdrantSparseEmbeddingRetriever(document_store=document_store)
|
|
245
235
|
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
246
236
|
|
|
247
237
|
results: List[Document] = retriever.run(query_sparse_embedding=sparse_embedding)["documents"]
|
|
@@ -252,3 +242,112 @@ class TestQdrantSparseRetriever(FilterableDocsFixtureMixin):
|
|
|
252
242
|
|
|
253
243
|
for document in results:
|
|
254
244
|
assert document.sparse_embedding
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class TestQdrantHybridRetriever:
|
|
248
|
+
def test_init_default(self):
|
|
249
|
+
document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True)
|
|
250
|
+
retriever = QdrantHybridRetriever(document_store=document_store)
|
|
251
|
+
|
|
252
|
+
assert retriever._document_store == document_store
|
|
253
|
+
assert retriever._filters is None
|
|
254
|
+
assert retriever._top_k == 10
|
|
255
|
+
assert retriever._return_embedding is False
|
|
256
|
+
|
|
257
|
+
def test_to_dict(self):
|
|
258
|
+
document_store = QdrantDocumentStore(location=":memory:", index="test")
|
|
259
|
+
retriever = QdrantHybridRetriever(document_store=document_store, top_k=5, return_embedding=True)
|
|
260
|
+
res = retriever.to_dict()
|
|
261
|
+
assert res == {
|
|
262
|
+
"type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever",
|
|
263
|
+
"init_parameters": {
|
|
264
|
+
"document_store": {
|
|
265
|
+
"type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
|
|
266
|
+
"init_parameters": {
|
|
267
|
+
"location": ":memory:",
|
|
268
|
+
"url": None,
|
|
269
|
+
"port": 6333,
|
|
270
|
+
"grpc_port": 6334,
|
|
271
|
+
"prefer_grpc": False,
|
|
272
|
+
"https": None,
|
|
273
|
+
"api_key": None,
|
|
274
|
+
"prefix": None,
|
|
275
|
+
"timeout": None,
|
|
276
|
+
"host": None,
|
|
277
|
+
"path": None,
|
|
278
|
+
"index": "test",
|
|
279
|
+
"embedding_dim": 768,
|
|
280
|
+
"on_disk": False,
|
|
281
|
+
"content_field": "content",
|
|
282
|
+
"name_field": "name",
|
|
283
|
+
"embedding_field": "embedding",
|
|
284
|
+
"use_sparse_embeddings": False,
|
|
285
|
+
"similarity": "cosine",
|
|
286
|
+
"return_embedding": False,
|
|
287
|
+
"progress_bar": True,
|
|
288
|
+
"duplicate_documents": "overwrite",
|
|
289
|
+
"recreate_index": False,
|
|
290
|
+
"shard_number": None,
|
|
291
|
+
"replication_factor": None,
|
|
292
|
+
"write_consistency_factor": None,
|
|
293
|
+
"on_disk_payload": None,
|
|
294
|
+
"hnsw_config": None,
|
|
295
|
+
"optimizers_config": None,
|
|
296
|
+
"wal_config": None,
|
|
297
|
+
"quantization_config": None,
|
|
298
|
+
"init_from": None,
|
|
299
|
+
"wait_result_from_api": True,
|
|
300
|
+
"metadata": {},
|
|
301
|
+
"write_batch_size": 100,
|
|
302
|
+
"scroll_size": 10000,
|
|
303
|
+
"payload_fields_to_index": None,
|
|
304
|
+
},
|
|
305
|
+
},
|
|
306
|
+
"filters": None,
|
|
307
|
+
"top_k": 5,
|
|
308
|
+
"return_embedding": True,
|
|
309
|
+
},
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
def test_from_dict(self):
|
|
313
|
+
data = {
|
|
314
|
+
"type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever",
|
|
315
|
+
"init_parameters": {
|
|
316
|
+
"document_store": {
|
|
317
|
+
"init_parameters": {"location": ":memory:", "index": "test"},
|
|
318
|
+
"type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore",
|
|
319
|
+
},
|
|
320
|
+
"filters": None,
|
|
321
|
+
"top_k": 5,
|
|
322
|
+
"return_embedding": True,
|
|
323
|
+
},
|
|
324
|
+
}
|
|
325
|
+
retriever = QdrantHybridRetriever.from_dict(data)
|
|
326
|
+
assert isinstance(retriever._document_store, QdrantDocumentStore)
|
|
327
|
+
assert retriever._document_store.index == "test"
|
|
328
|
+
assert retriever._filters is None
|
|
329
|
+
assert retriever._top_k == 5
|
|
330
|
+
assert retriever._return_embedding
|
|
331
|
+
|
|
332
|
+
def test_run(self):
|
|
333
|
+
mock_store = Mock(spec=QdrantDocumentStore)
|
|
334
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
335
|
+
mock_store._query_hybrid.return_value = [
|
|
336
|
+
Document(content="Test doc", embedding=[0.1, 0.2], sparse_embedding=sparse_embedding)
|
|
337
|
+
]
|
|
338
|
+
|
|
339
|
+
retriever = QdrantHybridRetriever(document_store=mock_store)
|
|
340
|
+
res = retriever.run(
|
|
341
|
+
query_embedding=[0.5, 0.7], query_sparse_embedding=SparseEmbedding(indices=[0, 5], values=[0.1, 0.7])
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
call_args = mock_store._query_hybrid.call_args
|
|
345
|
+
assert call_args[1]["query_embedding"] == [0.5, 0.7]
|
|
346
|
+
assert call_args[1]["query_sparse_embedding"].indices == [0, 5]
|
|
347
|
+
assert call_args[1]["query_sparse_embedding"].values == [0.1, 0.7]
|
|
348
|
+
assert call_args[1]["top_k"] == 10
|
|
349
|
+
assert call_args[1]["return_embedding"] is False
|
|
350
|
+
|
|
351
|
+
assert res["documents"][0].content == "Test doc"
|
|
352
|
+
assert res["documents"][0].embedding == [0.1, 0.2]
|
|
353
|
+
assert res["documents"][0].sparse_embedding == sparse_embedding
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
from haystack import Document
|
|
5
|
-
from haystack.document_stores.errors import DuplicateDocumentError
|
|
6
|
-
from haystack.document_stores.types import DuplicatePolicy
|
|
7
|
-
from haystack.testing.document_store import (
|
|
8
|
-
CountDocumentsTest,
|
|
9
|
-
DeleteDocumentsTest,
|
|
10
|
-
WriteDocumentsTest,
|
|
11
|
-
)
|
|
12
|
-
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class TestQdrantStoreBaseTests(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
|
|
16
|
-
@pytest.fixture
|
|
17
|
-
def document_store(self) -> QdrantDocumentStore:
|
|
18
|
-
return QdrantDocumentStore(
|
|
19
|
-
":memory:",
|
|
20
|
-
recreate_index=True,
|
|
21
|
-
return_embedding=True,
|
|
22
|
-
wait_result_from_api=True,
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
|
|
26
|
-
"""
|
|
27
|
-
Assert that two lists of Documents are equal.
|
|
28
|
-
This is used in every test.
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
# Check that the lengths of the lists are the same
|
|
32
|
-
assert len(received) == len(expected)
|
|
33
|
-
|
|
34
|
-
# Check that the sets are equal, meaning the content and IDs match regardless of order
|
|
35
|
-
assert {doc.id for doc in received} == {doc.id for doc in expected}
|
|
36
|
-
|
|
37
|
-
def test_write_documents(self, document_store: QdrantDocumentStore):
|
|
38
|
-
docs = [Document(id="1")]
|
|
39
|
-
assert document_store.write_documents(docs) == 1
|
|
40
|
-
with pytest.raises(DuplicateDocumentError):
|
|
41
|
-
document_store.write_documents(docs, DuplicatePolicy.FAIL)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|