qdrant-haystack 3.2.1__tar.gz → 3.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/PKG-INFO +2 -2
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/pyproject.toml +3 -1
- qdrant_haystack-3.3.0/src/haystack_integrations/components/retrievers/qdrant/__init__.py +7 -0
- qdrant_haystack-3.3.0/src/haystack_integrations/components/retrievers/qdrant/retriever.py +241 -0
- qdrant_haystack-3.3.0/src/haystack_integrations/document_stores/qdrant/converters.py +80 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/src/haystack_integrations/document_stores/qdrant/document_store.py +145 -43
- qdrant_haystack-3.3.0/src/haystack_integrations/document_stores/qdrant/filters.py +237 -0
- qdrant_haystack-3.3.0/tests/test_converters.py +63 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/tests/test_dict_converters.py +3 -0
- qdrant_haystack-3.3.0/tests/test_retriever.py +254 -0
- qdrant_haystack-3.2.1/src/haystack_integrations/components/retrievers/qdrant/__init__.py +0 -7
- qdrant_haystack-3.2.1/src/haystack_integrations/components/retrievers/qdrant/retriever.py +0 -122
- qdrant_haystack-3.2.1/src/haystack_integrations/document_stores/qdrant/converters.py +0 -70
- qdrant_haystack-3.2.1/src/haystack_integrations/document_stores/qdrant/filters.py +0 -233
- qdrant_haystack-3.2.1/tests/test_converters.py +0 -52
- qdrant_haystack-3.2.1/tests/test_retriever.py +0 -114
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/.gitignore +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/LICENSE.txt +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/README.md +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/pydoc/config.yml +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/src/haystack_integrations/document_stores/qdrant/__init__.py +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/tests/__init__.py +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/tests/test_document_store.py +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/tests/test_filters.py +0 -0
- {qdrant_haystack-3.2.1 → qdrant_haystack-3.3.0}/tests/test_legacy_filters.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 3.2.1
|
|
3
|
+
Version: 3.3.0
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
17
17
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
18
18
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
19
19
|
Requires-Python: >=3.8
|
|
20
|
-
Requires-Dist: haystack-ai
|
|
20
|
+
Requires-Dist: haystack-ai>=2.0.1
|
|
21
21
|
Requires-Dist: qdrant-client
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
23
|
|
|
@@ -24,7 +24,7 @@ classifiers = [
|
|
|
24
24
|
"Programming Language :: Python :: Implementation :: CPython",
|
|
25
25
|
"Programming Language :: Python :: Implementation :: PyPy",
|
|
26
26
|
]
|
|
27
|
-
dependencies = ["haystack-ai", "qdrant-client"]
|
|
27
|
+
dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]
|
|
28
28
|
|
|
29
29
|
[project.urls]
|
|
30
30
|
Source = "https://github.com/deepset-ai/haystack-core-integrations"
|
|
@@ -103,6 +103,8 @@ ignore = [
|
|
|
103
103
|
"B027",
|
|
104
104
|
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
|
105
105
|
"FBT003",
|
|
106
|
+
# Allow boolean arguments in function definition
|
|
107
|
+
"FBT001", "FBT002",
|
|
106
108
|
# Ignore checks for possible passwords
|
|
107
109
|
"S105",
|
|
108
110
|
"S106",
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
4
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
5
|
+
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@component
class QdrantEmbeddingRetriever:
    """
    A component for retrieving documents from a QdrantDocumentStore using dense vectors.

    Usage example:
    ```python
    from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
    from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

    document_store = QdrantDocumentStore(
        ":memory:",
        recreate_index=True,
        return_embedding=True,
        wait_result_from_api=True,
    )
    retriever = QdrantEmbeddingRetriever(document_store=document_store)

    # using a fake vector to keep the example simple
    retriever.run(query_embedding=[0.1]*768)
    ```
    """

    def __init__(
        self,
        document_store: QdrantDocumentStore,
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        scale_score: bool = True,
        return_embedding: bool = False,
    ):
        """
        Create a QdrantEmbeddingRetriever component.

        :param document_store: An instance of QdrantDocumentStore.
        :param filters: A dictionary with filters to narrow down the search space. Default is None.
        :param top_k: The maximum number of documents to retrieve. Default is 10.
        :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
        :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is False.

        :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
        """

        if not isinstance(document_store, QdrantDocumentStore):
            msg = "document_store must be an instance of QdrantDocumentStore"
            raise ValueError(msg)

        self._document_store = document_store
        self._filters = filters
        self._top_k = top_k
        self._scale_score = scale_score
        self._return_embedding = return_embedding

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        d = default_to_dict(
            self,
            document_store=self._document_store,
            filters=self._filters,
            top_k=self._top_k,
            scale_score=self._scale_score,
            return_embedding=self._return_embedding,
        )
        # Replace the document store instance with its own serialized form.
        d["init_parameters"]["document_store"] = self._document_store.to_dict()

        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever":
        """
        Deserializes the component from a dictionary.

        Note that `data["init_parameters"]["document_store"]` is replaced in place
        by the deserialized QdrantDocumentStore instance.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
        data["init_parameters"]["document_store"] = document_store
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        query_embedding: List[float],
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
        scale_score: Optional[bool] = None,
        return_embedding: Optional[bool] = None,
    ):
        """
        Run the Embedding Retriever on the given input data.

        Every optional argument left as None falls back to the value given at
        initialization.

        :param query_embedding: Embedding of the query.
        :param filters: A dictionary with filters to narrow down the search space.
        :param top_k: The maximum number of documents to return.
        :param scale_score: Whether to scale the scores of the retrieved documents or not.
        :param return_embedding: Whether to return the embedding of the retrieved Documents.
        :returns:
            A dictionary with the retrieved documents under the `documents` key.

        """
        # Boolean overrides must be compared against None: `value or default`
        # would discard an explicit False whenever the init-time value is True.
        effective_scale_score = self._scale_score if scale_score is None else scale_score
        effective_return_embedding = self._return_embedding if return_embedding is None else return_embedding

        docs = self._document_store.query_by_embedding(
            query_embedding=query_embedding,
            # Falsy filters / top_k keep falling back to the init-time values.
            filters=filters or self._filters,
            top_k=top_k or self._top_k,
            scale_score=effective_scale_score,
            return_embedding=effective_return_embedding,
        )

        return {"documents": docs}
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@component
class QdrantSparseRetriever:
    """
    A component for retrieving documents from a QdrantDocumentStore using sparse vectors.

    Usage example:
    ```python
    from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
    from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
    from haystack.dataclasses.sparse_embedding import SparseEmbedding

    document_store = QdrantDocumentStore(
        ":memory:",
        recreate_index=True,
        return_embedding=True,
        wait_result_from_api=True,
    )
    retriever = QdrantSparseRetriever(document_store=document_store)
    sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
    retriever.run(query_sparse_embedding=sparse_embedding)
    ```
    """

    def __init__(
        self,
        document_store: QdrantDocumentStore,
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        scale_score: bool = True,
        return_embedding: bool = False,
    ):
        """
        Create a QdrantSparseRetriever component.

        :param document_store: An instance of QdrantDocumentStore.
        :param filters: A dictionary with filters to narrow down the search space. Default is None.
        :param top_k: The maximum number of documents to retrieve. Default is 10.
        :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
        :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.

        :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
        """

        if not isinstance(document_store, QdrantDocumentStore):
            msg = "document_store must be an instance of QdrantDocumentStore"
            raise ValueError(msg)

        self._document_store = document_store
        self._filters = filters
        self._top_k = top_k
        self._scale_score = scale_score
        self._return_embedding = return_embedding

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        d = default_to_dict(
            self,
            document_store=self._document_store,
            filters=self._filters,
            top_k=self._top_k,
            scale_score=self._scale_score,
            return_embedding=self._return_embedding,
        )
        # Replace the document store instance with its own serialized form.
        d["init_parameters"]["document_store"] = self._document_store.to_dict()

        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseRetriever":
        """
        Deserializes the component from a dictionary.

        Note that `data["init_parameters"]["document_store"]` is replaced in place
        by the deserialized QdrantDocumentStore instance.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
        data["init_parameters"]["document_store"] = document_store
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        query_sparse_embedding: SparseEmbedding,
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
        scale_score: Optional[bool] = None,
        return_embedding: Optional[bool] = None,
    ):
        """
        Run the Sparse Embedding Retriever on the given input data.

        Every optional argument left as None falls back to the value given at
        initialization.

        :param query_sparse_embedding: Sparse Embedding of the query.
        :param filters: A dictionary with filters to narrow down the search space.
        :param top_k: The maximum number of documents to return.
        :param scale_score: Whether to scale the scores of the retrieved documents or not.
        :param return_embedding: Whether to return the embedding of the retrieved Documents.
        :returns:
            A dictionary with the retrieved documents under the `documents` key.

        """
        # Boolean overrides must be compared against None: `value or default`
        # would discard an explicit False whenever the init-time value is True.
        effective_scale_score = self._scale_score if scale_score is None else scale_score
        effective_return_embedding = self._return_embedding if return_embedding is None else return_embedding

        docs = self._document_store.query_by_sparse(
            query_sparse_embedding=query_sparse_embedding,
            # Falsy filters / top_k keep falling back to the init-time values.
            filters=filters or self._filters,
            top_k=top_k or self._top_k,
            scale_score=effective_scale_score,
            return_embedding=effective_return_embedding,
        )

        return {"documents": docs}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import uuid
|
|
3
|
+
from typing import List, Union
|
|
4
|
+
|
|
5
|
+
from haystack.dataclasses import Document
|
|
6
|
+
from qdrant_client.http import models as rest
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# Key under which a document's dense embedding is stored in a point's named-vector dict.
DENSE_VECTORS_NAME = "text-dense"
|
|
11
|
+
# Key under which a document's sparse embedding is stored in a point's named-vector dict.
SPARSE_VECTORS_NAME = "text-sparse"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Fixed namespace passed to uuid.uuid5() by convert_id() to derive point ids.
UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def convert_haystack_documents_to_qdrant_points(
    documents: List[Document],
    *,
    embedding_field: str,
    use_sparse_embeddings: bool,
) -> List[rest.PointStruct]:
    """
    Build one Qdrant point per Haystack document.

    Each document is serialized with `to_dict(flatten=False)`; the embedding
    entries are removed from that dictionary and used as the point's vector,
    while the remaining entries become the point's payload.

    :param documents: Documents to convert.
    :param embedding_field: Payload key holding the dense embedding.
    :param use_sparse_embeddings: When True, the point carries a dict of named
        vectors (dense and/or sparse, whichever are present); otherwise the
        dense embedding itself (or an empty dict when it is falsy) is used.
    :returns:
        The list of points, in the same order as `documents`.
    """
    converted = []
    for doc in documents:
        doc_payload = doc.to_dict(flatten=False)

        if not use_sparse_embeddings:
            point_vector = doc_payload.pop(embedding_field) or {}
        else:
            point_vector = {}

            dense = doc_payload.pop(embedding_field, None)
            if dense is not None:
                point_vector[DENSE_VECTORS_NAME] = dense

            sparse = doc_payload.pop("sparse_embedding", None)
            if sparse is not None:
                point_vector[SPARSE_VECTORS_NAME] = rest.SparseVector(**sparse)

        # The point id is derived from the document id.
        point_id = convert_id(doc_payload.get("id"))
        converted.append(rest.PointStruct(payload=doc_payload, vector=point_vector, id=point_id))
    return converted
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def convert_id(_id: str) -> str:
    """
    Deterministically map an arbitrary string to a UUID hex string.

    Qdrant does not accept arbitrary strings as point ids, so an internal id
    is derived for each point with `uuid.uuid5` and the module's
    `UUID_NAMESPACE`; the same input always yields the same id.

    :param _id: The original identifier.
    :returns:
        The hexadecimal form of the derived UUID.
    """
    derived = uuid.uuid5(UUID_NAMESPACE, _id)
    return derived.hex
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Point types accepted by convert_qdrant_point_to_haystack_document().
QdrantPoint = Union[rest.ScoredPoint, rest.Record]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def convert_qdrant_point_to_haystack_document(point: QdrantPoint, use_sparse_embeddings: bool) -> Document:
    """
    Rebuild a Haystack Document from a Qdrant point.

    The point's payload is copied and extended with the point's score (None
    when the point has no `score` attribute) and its embedding(s).

    :param point: The Qdrant point to convert.
    :param use_sparse_embeddings: When True, the point's vector is read as a
        dict of named vectors; otherwise it is used directly as the embedding.
    :returns:
        The Document created from the assembled dictionary.
    """
    doc_fields = {**point.payload}
    doc_fields["score"] = getattr(point, "score", None)

    vectors = getattr(point, "vector", None)
    if not use_sparse_embeddings:
        doc_fields["embedding"] = vectors
    elif vectors is not None:
        doc_fields["embedding"] = vectors.get(DENSE_VECTORS_NAME)

        if SPARSE_VECTORS_NAME in vectors:
            sparse = vectors[SPARSE_VECTORS_NAME]
            doc_fields["sparse_embedding"] = {
                "indices": sparse.indices,
                "values": sparse.values,
            }

    return Document.from_dict(doc_fields)
|