qdrant-haystack 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of qdrant-haystack might be problematic. Click here for more details.
- haystack_integrations/components/retrievers/qdrant/__init__.py +2 -2
- haystack_integrations/components/retrievers/qdrant/retriever.py +122 -3
- haystack_integrations/document_stores/qdrant/converters.py +60 -50
- haystack_integrations/document_stores/qdrant/document_store.py +145 -43
- haystack_integrations/document_stores/qdrant/filters.py +185 -181
- {qdrant_haystack-3.2.0.dist-info → qdrant_haystack-3.3.0.dist-info}/METADATA +2 -2
- qdrant_haystack-3.3.0.dist-info/RECORD +10 -0
- {qdrant_haystack-3.2.0.dist-info → qdrant_haystack-3.3.0.dist-info}/WHEEL +1 -1
- qdrant_haystack-3.2.0.dist-info/RECORD +0 -10
- {qdrant_haystack-3.2.0.dist-info → qdrant_haystack-3.3.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -2,6 +2,6 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
from .retriever import QdrantEmbeddingRetriever
|
|
5
|
+
from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever
|
|
6
6
|
|
|
7
|
-
__all__ = ("QdrantEmbeddingRetriever",)
|
|
7
|
+
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever")
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
from typing import Any, Dict, List, Optional
|
|
2
2
|
|
|
3
3
|
from haystack import Document, component, default_from_dict, default_to_dict
|
|
4
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
4
5
|
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
@component
|
|
8
9
|
class QdrantEmbeddingRetriever:
|
|
9
10
|
"""
|
|
10
|
-
A component for retrieving documents from an QdrantDocumentStore.
|
|
11
|
+
A component for retrieving documents from an QdrantDocumentStore using dense vectors.
|
|
11
12
|
|
|
12
13
|
Usage example:
|
|
13
14
|
```python
|
|
@@ -32,8 +33,8 @@ class QdrantEmbeddingRetriever:
|
|
|
32
33
|
document_store: QdrantDocumentStore,
|
|
33
34
|
filters: Optional[Dict[str, Any]] = None,
|
|
34
35
|
top_k: int = 10,
|
|
35
|
-
scale_score: bool = True,
|
|
36
|
-
return_embedding: bool = False,
|
|
36
|
+
scale_score: bool = True,
|
|
37
|
+
return_embedding: bool = False,
|
|
37
38
|
):
|
|
38
39
|
"""
|
|
39
40
|
Create a QdrantEmbeddingRetriever component.
|
|
@@ -120,3 +121,121 @@ class QdrantEmbeddingRetriever:
|
|
|
120
121
|
)
|
|
121
122
|
|
|
122
123
|
return {"documents": docs}
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@component
|
|
127
|
+
class QdrantSparseRetriever:
|
|
128
|
+
"""
|
|
129
|
+
A component for retrieving documents from an QdrantDocumentStore using sparse vectors.
|
|
130
|
+
|
|
131
|
+
Usage example:
|
|
132
|
+
```python
|
|
133
|
+
from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
|
|
134
|
+
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
|
|
135
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
136
|
+
|
|
137
|
+
document_store = QdrantDocumentStore(
|
|
138
|
+
":memory:",
|
|
139
|
+
recreate_index=True,
|
|
140
|
+
return_embedding=True,
|
|
141
|
+
wait_result_from_api=True,
|
|
142
|
+
)
|
|
143
|
+
retriever = QdrantSparseRetriever(document_store=document_store)
|
|
144
|
+
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
|
|
145
|
+
retriever.run(query_sparse_embedding=sparse_embedding)
|
|
146
|
+
```
|
|
147
|
+
"""
|
|
148
|
+
|
|
149
|
+
def __init__(
|
|
150
|
+
self,
|
|
151
|
+
document_store: QdrantDocumentStore,
|
|
152
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
153
|
+
top_k: int = 10,
|
|
154
|
+
scale_score: bool = True,
|
|
155
|
+
return_embedding: bool = False,
|
|
156
|
+
):
|
|
157
|
+
"""
|
|
158
|
+
Create a QdrantSparseRetriever component.
|
|
159
|
+
|
|
160
|
+
:param document_store: An instance of QdrantDocumentStore.
|
|
161
|
+
:param filters: A dictionary with filters to narrow down the search space. Default is None.
|
|
162
|
+
:param top_k: The maximum number of documents to retrieve. Default is 10.
|
|
163
|
+
:param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
|
|
164
|
+
:param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.
|
|
165
|
+
|
|
166
|
+
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
|
|
167
|
+
"""
|
|
168
|
+
|
|
169
|
+
if not isinstance(document_store, QdrantDocumentStore):
|
|
170
|
+
msg = "document_store must be an instance of QdrantDocumentStore"
|
|
171
|
+
raise ValueError(msg)
|
|
172
|
+
|
|
173
|
+
self._document_store = document_store
|
|
174
|
+
self._filters = filters
|
|
175
|
+
self._top_k = top_k
|
|
176
|
+
self._scale_score = scale_score
|
|
177
|
+
self._return_embedding = return_embedding
|
|
178
|
+
|
|
179
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
180
|
+
"""
|
|
181
|
+
Serializes the component to a dictionary.
|
|
182
|
+
|
|
183
|
+
:returns:
|
|
184
|
+
Dictionary with serialized data.
|
|
185
|
+
"""
|
|
186
|
+
d = default_to_dict(
|
|
187
|
+
self,
|
|
188
|
+
document_store=self._document_store,
|
|
189
|
+
filters=self._filters,
|
|
190
|
+
top_k=self._top_k,
|
|
191
|
+
scale_score=self._scale_score,
|
|
192
|
+
return_embedding=self._return_embedding,
|
|
193
|
+
)
|
|
194
|
+
d["init_parameters"]["document_store"] = self._document_store.to_dict()
|
|
195
|
+
|
|
196
|
+
return d
|
|
197
|
+
|
|
198
|
+
@classmethod
|
|
199
|
+
def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever":
|
|
200
|
+
"""
|
|
201
|
+
Deserializes the component from a dictionary.
|
|
202
|
+
|
|
203
|
+
:param data:
|
|
204
|
+
Dictionary to deserialize from.
|
|
205
|
+
:returns:
|
|
206
|
+
Deserialized component.
|
|
207
|
+
"""
|
|
208
|
+
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
|
|
209
|
+
data["init_parameters"]["document_store"] = document_store
|
|
210
|
+
return default_from_dict(cls, data)
|
|
211
|
+
|
|
212
|
+
@component.output_types(documents=List[Document])
|
|
213
|
+
def run(
|
|
214
|
+
self,
|
|
215
|
+
query_sparse_embedding: SparseEmbedding,
|
|
216
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
217
|
+
top_k: Optional[int] = None,
|
|
218
|
+
scale_score: Optional[bool] = None,
|
|
219
|
+
return_embedding: Optional[bool] = None,
|
|
220
|
+
):
|
|
221
|
+
"""
|
|
222
|
+
Run the Sparse Embedding Retriever on the given input data.
|
|
223
|
+
|
|
224
|
+
:param query_sparse_embedding: Sparse Embedding of the query.
|
|
225
|
+
:param filters: A dictionary with filters to narrow down the search space.
|
|
226
|
+
:param top_k: The maximum number of documents to return.
|
|
227
|
+
:param scale_score: Whether to scale the scores of the retrieved documents or not.
|
|
228
|
+
:param return_embedding: Whether to return the embedding of the retrieved Documents.
|
|
229
|
+
:returns:
|
|
230
|
+
The retrieved documents.
|
|
231
|
+
|
|
232
|
+
"""
|
|
233
|
+
docs = self._document_store.query_by_sparse(
|
|
234
|
+
query_sparse_embedding=query_sparse_embedding,
|
|
235
|
+
filters=filters or self._filters,
|
|
236
|
+
top_k=top_k or self._top_k,
|
|
237
|
+
scale_score=scale_score or self._scale_score,
|
|
238
|
+
return_embedding=return_embedding or self._return_embedding,
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
return {"documents": docs}
|
|
@@ -7,64 +7,74 @@ from qdrant_client.http import models as rest
|
|
|
7
7
|
|
|
8
8
|
logger = logging.getLogger(__name__)
|
|
9
9
|
|
|
10
|
+
DENSE_VECTORS_NAME = "text-dense"
|
|
11
|
+
SPARSE_VECTORS_NAME = "text-sparse"
|
|
10
12
|
|
|
11
|
-
class HaystackToQdrant:
|
|
12
|
-
"""A converter from Haystack to Qdrant types."""
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
16
|
+
|
|
17
|
+
def convert_haystack_documents_to_qdrant_points(
|
|
18
|
+
documents: List[Document],
|
|
19
|
+
*,
|
|
20
|
+
embedding_field: str,
|
|
21
|
+
use_sparse_embeddings: bool,
|
|
22
|
+
) -> List[rest.PointStruct]:
|
|
23
|
+
points = []
|
|
24
|
+
for document in documents:
|
|
25
|
+
payload = document.to_dict(flatten=False)
|
|
26
|
+
if use_sparse_embeddings:
|
|
27
|
+
vector = {}
|
|
28
|
+
|
|
29
|
+
dense_vector = payload.pop(embedding_field, None)
|
|
30
|
+
if dense_vector is not None:
|
|
31
|
+
vector[DENSE_VECTORS_NAME] = dense_vector
|
|
32
|
+
|
|
33
|
+
sparse_vector = payload.pop("sparse_embedding", None)
|
|
34
|
+
if sparse_vector is not None:
|
|
35
|
+
sparse_vector_instance = rest.SparseVector(**sparse_vector)
|
|
36
|
+
vector[SPARSE_VECTORS_NAME] = sparse_vector_instance
|
|
37
|
+
|
|
38
|
+
else:
|
|
25
39
|
vector = payload.pop(embedding_field) or {}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
return points
|
|
46
|
-
|
|
47
|
-
def convert_id(self, _id: str) -> str:
|
|
48
|
-
"""
|
|
49
|
-
Converts any string into a UUID-like format in a deterministic way.
|
|
50
|
-
|
|
51
|
-
Qdrant does not accept any string as an id, so an internal id has to be
|
|
52
|
-
generated for each point. This is a deterministic way of doing so.
|
|
53
|
-
"""
|
|
54
|
-
return uuid.uuid5(self.UUID_NAMESPACE, _id).hex
|
|
40
|
+
_id = convert_id(payload.get("id"))
|
|
41
|
+
|
|
42
|
+
point = rest.PointStruct(
|
|
43
|
+
payload=payload,
|
|
44
|
+
vector=vector,
|
|
45
|
+
id=_id,
|
|
46
|
+
)
|
|
47
|
+
points.append(point)
|
|
48
|
+
return points
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def convert_id(_id: str) -> str:
|
|
52
|
+
"""
|
|
53
|
+
Converts any string into a UUID-like format in a deterministic way.
|
|
54
|
+
|
|
55
|
+
Qdrant does not accept any string as an id, so an internal id has to be
|
|
56
|
+
generated for each point. This is a deterministic way of doing so.
|
|
57
|
+
"""
|
|
58
|
+
return uuid.uuid5(UUID_NAMESPACE, _id).hex
|
|
55
59
|
|
|
56
60
|
|
|
57
61
|
QdrantPoint = Union[rest.ScoredPoint, rest.Record]
|
|
58
62
|
|
|
59
63
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
self.name_field = name_field
|
|
64
|
-
self.embedding_field = embedding_field
|
|
64
|
+
def convert_qdrant_point_to_haystack_document(point: QdrantPoint, use_sparse_embeddings: bool) -> Document:
|
|
65
|
+
payload = {**point.payload}
|
|
66
|
+
payload["score"] = point.score if hasattr(point, "score") else None
|
|
65
67
|
|
|
66
|
-
|
|
67
|
-
payload = {**point.payload}
|
|
68
|
+
if not use_sparse_embeddings:
|
|
68
69
|
payload["embedding"] = point.vector if hasattr(point, "vector") else None
|
|
69
|
-
|
|
70
|
-
|
|
70
|
+
elif hasattr(point, "vector") and point.vector is not None:
|
|
71
|
+
payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME)
|
|
72
|
+
|
|
73
|
+
if SPARSE_VECTORS_NAME in point.vector:
|
|
74
|
+
parse_vector_dict = {
|
|
75
|
+
"indices": point.vector[SPARSE_VECTORS_NAME].indices,
|
|
76
|
+
"values": point.vector[SPARSE_VECTORS_NAME].values,
|
|
77
|
+
}
|
|
78
|
+
payload["sparse_embedding"] = parse_vector_dict
|
|
79
|
+
|
|
80
|
+
return Document.from_dict(payload)
|
|
@@ -8,17 +8,24 @@ import qdrant_client
|
|
|
8
8
|
from grpc import RpcError
|
|
9
9
|
from haystack import default_from_dict, default_to_dict
|
|
10
10
|
from haystack.dataclasses import Document
|
|
11
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
11
12
|
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
12
13
|
from haystack.document_stores.types import DuplicatePolicy
|
|
13
14
|
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
14
|
-
from haystack.utils.filters import convert
|
|
15
|
+
from haystack.utils.filters import convert as convert_legacy_filters
|
|
15
16
|
from qdrant_client import grpc
|
|
16
17
|
from qdrant_client.http import models as rest
|
|
17
18
|
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
18
19
|
from tqdm import tqdm
|
|
19
20
|
|
|
20
|
-
from .converters import
|
|
21
|
-
|
|
21
|
+
from .converters import (
|
|
22
|
+
DENSE_VECTORS_NAME,
|
|
23
|
+
SPARSE_VECTORS_NAME,
|
|
24
|
+
convert_haystack_documents_to_qdrant_points,
|
|
25
|
+
convert_id,
|
|
26
|
+
convert_qdrant_point_to_haystack_document,
|
|
27
|
+
)
|
|
28
|
+
from .filters import convert_filters_to_qdrant
|
|
22
29
|
|
|
23
30
|
logger = logging.getLogger(__name__)
|
|
24
31
|
|
|
@@ -54,7 +61,7 @@ class QdrantDocumentStore:
|
|
|
54
61
|
url: Optional[str] = None,
|
|
55
62
|
port: int = 6333,
|
|
56
63
|
grpc_port: int = 6334,
|
|
57
|
-
prefer_grpc: bool = False,
|
|
64
|
+
prefer_grpc: bool = False,
|
|
58
65
|
https: Optional[bool] = None,
|
|
59
66
|
api_key: Optional[Secret] = None,
|
|
60
67
|
prefix: Optional[str] = None,
|
|
@@ -63,15 +70,16 @@ class QdrantDocumentStore:
|
|
|
63
70
|
path: Optional[str] = None,
|
|
64
71
|
index: str = "Document",
|
|
65
72
|
embedding_dim: int = 768,
|
|
66
|
-
on_disk: bool = False,
|
|
73
|
+
on_disk: bool = False,
|
|
67
74
|
content_field: str = "content",
|
|
68
75
|
name_field: str = "name",
|
|
69
76
|
embedding_field: str = "embedding",
|
|
77
|
+
use_sparse_embeddings: bool = False,
|
|
70
78
|
similarity: str = "cosine",
|
|
71
|
-
return_embedding: bool = False,
|
|
72
|
-
progress_bar: bool = True,
|
|
79
|
+
return_embedding: bool = False,
|
|
80
|
+
progress_bar: bool = True,
|
|
73
81
|
duplicate_documents: str = "overwrite",
|
|
74
|
-
recreate_index: bool = False,
|
|
82
|
+
recreate_index: bool = False,
|
|
75
83
|
shard_number: Optional[int] = None,
|
|
76
84
|
replication_factor: Optional[int] = None,
|
|
77
85
|
write_consistency_factor: Optional[int] = None,
|
|
@@ -81,7 +89,7 @@ class QdrantDocumentStore:
|
|
|
81
89
|
wal_config: Optional[dict] = None,
|
|
82
90
|
quantization_config: Optional[dict] = None,
|
|
83
91
|
init_from: Optional[dict] = None,
|
|
84
|
-
wait_result_from_api: bool = True,
|
|
92
|
+
wait_result_from_api: bool = True,
|
|
85
93
|
metadata: Optional[dict] = None,
|
|
86
94
|
write_batch_size: int = 100,
|
|
87
95
|
scroll_size: int = 10_000,
|
|
@@ -133,9 +141,12 @@ class QdrantDocumentStore:
|
|
|
133
141
|
self.wait_result_from_api = wait_result_from_api
|
|
134
142
|
self.recreate_index = recreate_index
|
|
135
143
|
self.payload_fields_to_index = payload_fields_to_index
|
|
144
|
+
self.use_sparse_embeddings = use_sparse_embeddings
|
|
136
145
|
|
|
137
146
|
# Make sure the collection is properly set up
|
|
138
|
-
self._set_up_collection(
|
|
147
|
+
self._set_up_collection(
|
|
148
|
+
index, embedding_dim, recreate_index, similarity, use_sparse_embeddings, on_disk, payload_fields_to_index
|
|
149
|
+
)
|
|
139
150
|
|
|
140
151
|
self.embedding_dim = embedding_dim
|
|
141
152
|
self.on_disk = on_disk
|
|
@@ -147,13 +158,6 @@ class QdrantDocumentStore:
|
|
|
147
158
|
self.return_embedding = return_embedding
|
|
148
159
|
self.progress_bar = progress_bar
|
|
149
160
|
self.duplicate_documents = duplicate_documents
|
|
150
|
-
self.qdrant_filter_converter = QdrantFilterConverter()
|
|
151
|
-
self.haystack_to_qdrant_converter = HaystackToQdrant()
|
|
152
|
-
self.qdrant_to_haystack = QdrantToHaystack(
|
|
153
|
-
content_field,
|
|
154
|
-
name_field,
|
|
155
|
-
embedding_field,
|
|
156
|
-
)
|
|
157
161
|
self.write_batch_size = write_batch_size
|
|
158
162
|
self.scroll_size = scroll_size
|
|
159
163
|
|
|
@@ -178,7 +182,7 @@ class QdrantDocumentStore:
|
|
|
178
182
|
raise ValueError(msg)
|
|
179
183
|
|
|
180
184
|
if filters and "operator" not in filters:
|
|
181
|
-
filters =
|
|
185
|
+
filters = convert_legacy_filters(filters)
|
|
182
186
|
return list(
|
|
183
187
|
self.get_documents_generator(
|
|
184
188
|
filters,
|
|
@@ -194,7 +198,7 @@ class QdrantDocumentStore:
|
|
|
194
198
|
if not isinstance(doc, Document):
|
|
195
199
|
msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
|
|
196
200
|
raise ValueError(msg)
|
|
197
|
-
self._set_up_collection(self.index, self.embedding_dim, False, self.similarity)
|
|
201
|
+
self._set_up_collection(self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings)
|
|
198
202
|
|
|
199
203
|
if len(documents) == 0:
|
|
200
204
|
logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
|
|
@@ -209,9 +213,10 @@ class QdrantDocumentStore:
|
|
|
209
213
|
batched_documents = get_batches_from_generator(document_objects, self.write_batch_size)
|
|
210
214
|
with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
|
|
211
215
|
for document_batch in batched_documents:
|
|
212
|
-
batch =
|
|
216
|
+
batch = convert_haystack_documents_to_qdrant_points(
|
|
213
217
|
document_batch,
|
|
214
218
|
embedding_field=self.embedding_field,
|
|
219
|
+
use_sparse_embeddings=self.use_sparse_embeddings,
|
|
215
220
|
)
|
|
216
221
|
|
|
217
222
|
self.client.upsert(
|
|
@@ -224,7 +229,7 @@ class QdrantDocumentStore:
|
|
|
224
229
|
return len(document_objects)
|
|
225
230
|
|
|
226
231
|
def delete_documents(self, ids: List[str]):
|
|
227
|
-
ids = [
|
|
232
|
+
ids = [convert_id(_id) for _id in ids]
|
|
228
233
|
try:
|
|
229
234
|
self.client.delete(
|
|
230
235
|
collection_name=self.index,
|
|
@@ -257,7 +262,7 @@ class QdrantDocumentStore:
|
|
|
257
262
|
filters: Optional[Dict[str, Any]] = None,
|
|
258
263
|
) -> Generator[Document, None, None]:
|
|
259
264
|
index = self.index
|
|
260
|
-
qdrant_filters =
|
|
265
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
261
266
|
|
|
262
267
|
next_offset = None
|
|
263
268
|
stop_scrolling = False
|
|
@@ -275,7 +280,9 @@ class QdrantDocumentStore:
|
|
|
275
280
|
)
|
|
276
281
|
|
|
277
282
|
for record in records:
|
|
278
|
-
yield
|
|
283
|
+
yield convert_qdrant_point_to_haystack_document(
|
|
284
|
+
record, use_sparse_embeddings=self.use_sparse_embeddings
|
|
285
|
+
)
|
|
279
286
|
|
|
280
287
|
def get_documents_by_id(
|
|
281
288
|
self,
|
|
@@ -286,7 +293,7 @@ class QdrantDocumentStore:
|
|
|
286
293
|
|
|
287
294
|
documents: List[Document] = []
|
|
288
295
|
|
|
289
|
-
ids = [
|
|
296
|
+
ids = [convert_id(_id) for _id in ids]
|
|
290
297
|
records = self.client.retrieve(
|
|
291
298
|
collection_name=index,
|
|
292
299
|
ids=ids,
|
|
@@ -295,28 +302,77 @@ class QdrantDocumentStore:
|
|
|
295
302
|
)
|
|
296
303
|
|
|
297
304
|
for record in records:
|
|
298
|
-
documents.append(
|
|
305
|
+
documents.append(
|
|
306
|
+
convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
307
|
+
)
|
|
299
308
|
return documents
|
|
300
309
|
|
|
310
|
+
def query_by_sparse(
|
|
311
|
+
self,
|
|
312
|
+
query_sparse_embedding: SparseEmbedding,
|
|
313
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
314
|
+
top_k: int = 10,
|
|
315
|
+
scale_score: bool = True,
|
|
316
|
+
return_embedding: bool = False,
|
|
317
|
+
) -> List[Document]:
|
|
318
|
+
if not self.use_sparse_embeddings:
|
|
319
|
+
message = (
|
|
320
|
+
"You are trying to query using sparse embeddings, but the Document Store "
|
|
321
|
+
"was initialized with `use_sparse_embeddings=False`. "
|
|
322
|
+
)
|
|
323
|
+
raise QdrantStoreError(message)
|
|
324
|
+
|
|
325
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
326
|
+
query_indices = query_sparse_embedding.indices
|
|
327
|
+
query_values = query_sparse_embedding.values
|
|
328
|
+
points = self.client.search(
|
|
329
|
+
collection_name=self.index,
|
|
330
|
+
query_vector=rest.NamedSparseVector(
|
|
331
|
+
name=SPARSE_VECTORS_NAME,
|
|
332
|
+
vector=rest.SparseVector(
|
|
333
|
+
indices=query_indices,
|
|
334
|
+
values=query_values,
|
|
335
|
+
),
|
|
336
|
+
),
|
|
337
|
+
query_filter=qdrant_filters,
|
|
338
|
+
limit=top_k,
|
|
339
|
+
with_vectors=return_embedding,
|
|
340
|
+
)
|
|
341
|
+
results = [
|
|
342
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
343
|
+
for point in points
|
|
344
|
+
]
|
|
345
|
+
if scale_score:
|
|
346
|
+
for document in results:
|
|
347
|
+
score = document.score
|
|
348
|
+
score = float(1 / (1 + np.exp(-score / 100)))
|
|
349
|
+
document.score = score
|
|
350
|
+
return results
|
|
351
|
+
|
|
301
352
|
def query_by_embedding(
|
|
302
353
|
self,
|
|
303
354
|
query_embedding: List[float],
|
|
304
355
|
filters: Optional[Dict[str, Any]] = None,
|
|
305
356
|
top_k: int = 10,
|
|
306
|
-
scale_score: bool = True,
|
|
307
|
-
return_embedding: bool = False,
|
|
357
|
+
scale_score: bool = True,
|
|
358
|
+
return_embedding: bool = False,
|
|
308
359
|
) -> List[Document]:
|
|
309
|
-
qdrant_filters =
|
|
360
|
+
qdrant_filters = convert_filters_to_qdrant(filters)
|
|
310
361
|
|
|
311
362
|
points = self.client.search(
|
|
312
363
|
collection_name=self.index,
|
|
313
|
-
query_vector=
|
|
364
|
+
query_vector=rest.NamedVector(
|
|
365
|
+
name=DENSE_VECTORS_NAME if self.use_sparse_embeddings else "",
|
|
366
|
+
vector=query_embedding,
|
|
367
|
+
),
|
|
314
368
|
query_filter=qdrant_filters,
|
|
315
369
|
limit=top_k,
|
|
316
370
|
with_vectors=return_embedding,
|
|
317
371
|
)
|
|
318
|
-
|
|
319
|
-
|
|
372
|
+
results = [
|
|
373
|
+
convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings)
|
|
374
|
+
for point in points
|
|
375
|
+
]
|
|
320
376
|
if scale_score:
|
|
321
377
|
for document in results:
|
|
322
378
|
score = document.score
|
|
@@ -355,9 +411,10 @@ class QdrantDocumentStore:
|
|
|
355
411
|
self,
|
|
356
412
|
collection_name: str,
|
|
357
413
|
embedding_dim: int,
|
|
358
|
-
recreate_collection: bool,
|
|
414
|
+
recreate_collection: bool,
|
|
359
415
|
similarity: str,
|
|
360
|
-
|
|
416
|
+
use_sparse_embeddings: bool,
|
|
417
|
+
on_disk: bool = False,
|
|
361
418
|
payload_fields_to_index: Optional[List[dict]] = None,
|
|
362
419
|
):
|
|
363
420
|
distance = self._get_distance(similarity)
|
|
@@ -365,7 +422,7 @@ class QdrantDocumentStore:
|
|
|
365
422
|
if recreate_collection:
|
|
366
423
|
# There is no need to verify the current configuration of that
|
|
367
424
|
# collection. It might be just recreated again.
|
|
368
|
-
self._recreate_collection(collection_name, distance, embedding_dim, on_disk)
|
|
425
|
+
self._recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings)
|
|
369
426
|
# Create Payload index if payload_fields_to_index is provided
|
|
370
427
|
self._create_payload_index(collection_name, payload_fields_to_index)
|
|
371
428
|
return
|
|
@@ -381,13 +438,39 @@ class QdrantDocumentStore:
|
|
|
381
438
|
# Qdrant local raises ValueError if the collection is not found, but
|
|
382
439
|
# with the remote server UnexpectedResponse / RpcError is raised.
|
|
383
440
|
# Until that's unified, we need to catch both.
|
|
384
|
-
self._recreate_collection(collection_name, distance, embedding_dim, on_disk)
|
|
441
|
+
self._recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings)
|
|
385
442
|
# Create Payload index if payload_fields_to_index is provided
|
|
386
443
|
self._create_payload_index(collection_name, payload_fields_to_index)
|
|
387
444
|
return
|
|
388
445
|
|
|
389
|
-
|
|
390
|
-
|
|
446
|
+
has_named_vectors = (
|
|
447
|
+
isinstance(collection_info.config.params.vectors, dict)
|
|
448
|
+
and DENSE_VECTORS_NAME in collection_info.config.params.vectors
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
if self.use_sparse_embeddings and not has_named_vectors:
|
|
452
|
+
msg = (
|
|
453
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
454
|
+
f"but it has been originally created without sparse embedding vectors. "
|
|
455
|
+
f"If you want to use that collection, you can set `use_sparse_embeddings=False`. "
|
|
456
|
+
f"To use sparse embeddings, you need to recreate the collection or migrate the existing one."
|
|
457
|
+
)
|
|
458
|
+
raise QdrantStoreError(msg)
|
|
459
|
+
|
|
460
|
+
elif not self.use_sparse_embeddings and has_named_vectors:
|
|
461
|
+
msg = (
|
|
462
|
+
f"Collection '{collection_name}' already exists in Qdrant, "
|
|
463
|
+
f"but it has been originally created with sparse embedding vectors."
|
|
464
|
+
f"If you want to use that collection, please set `use_sparse_embeddings=True`."
|
|
465
|
+
)
|
|
466
|
+
raise QdrantStoreError(msg)
|
|
467
|
+
|
|
468
|
+
if self.use_sparse_embeddings:
|
|
469
|
+
current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance
|
|
470
|
+
current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size
|
|
471
|
+
else:
|
|
472
|
+
current_distance = collection_info.config.params.vectors.distance
|
|
473
|
+
current_vector_size = collection_info.config.params.vectors.size
|
|
391
474
|
|
|
392
475
|
if current_distance != distance:
|
|
393
476
|
msg = (
|
|
@@ -407,14 +490,33 @@ class QdrantDocumentStore:
|
|
|
407
490
|
)
|
|
408
491
|
raise ValueError(msg)
|
|
409
492
|
|
|
410
|
-
def _recreate_collection(
|
|
493
|
+
def _recreate_collection(
|
|
494
|
+
self,
|
|
495
|
+
collection_name: str,
|
|
496
|
+
distance,
|
|
497
|
+
embedding_dim: int,
|
|
498
|
+
on_disk: bool,
|
|
499
|
+
use_sparse_embeddings: bool,
|
|
500
|
+
):
|
|
501
|
+
# dense vectors configuration
|
|
502
|
+
vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance)
|
|
503
|
+
|
|
504
|
+
if use_sparse_embeddings:
|
|
505
|
+
# in this case, we need to define named vectors
|
|
506
|
+
vectors_config = {DENSE_VECTORS_NAME: vectors_config}
|
|
507
|
+
|
|
508
|
+
sparse_vectors_config = {
|
|
509
|
+
SPARSE_VECTORS_NAME: rest.SparseVectorParams(
|
|
510
|
+
index=rest.SparseIndexParams(
|
|
511
|
+
on_disk=on_disk,
|
|
512
|
+
)
|
|
513
|
+
),
|
|
514
|
+
}
|
|
515
|
+
|
|
411
516
|
self.client.recreate_collection(
|
|
412
517
|
collection_name=collection_name,
|
|
413
|
-
vectors_config=
|
|
414
|
-
|
|
415
|
-
on_disk=on_disk,
|
|
416
|
-
distance=distance,
|
|
417
|
-
),
|
|
518
|
+
vectors_config=vectors_config,
|
|
519
|
+
sparse_vectors_config=sparse_vectors_config if use_sparse_embeddings else None,
|
|
418
520
|
shard_number=self.shard_number,
|
|
419
521
|
replication_factor=self.replication_factor,
|
|
420
522
|
write_consistency_factor=self.write_consistency_factor,
|
|
@@ -4,226 +4,230 @@ from typing import List, Optional, Union
|
|
|
4
4
|
from haystack.utils.filters import COMPARISON_OPERATORS, LOGICAL_OPERATORS, FilterError
|
|
5
5
|
from qdrant_client.http import models
|
|
6
6
|
|
|
7
|
-
from .converters import
|
|
7
|
+
from .converters import convert_id
|
|
8
8
|
|
|
9
9
|
COMPARISON_OPERATORS = COMPARISON_OPERATORS.keys()
|
|
10
10
|
LOGICAL_OPERATORS = LOGICAL_OPERATORS.keys()
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
def convert_filters_to_qdrant(
|
|
14
|
+
filter_term: Optional[Union[List[dict], dict]] = None,
|
|
15
|
+
) -> Optional[models.Filter]:
|
|
14
16
|
"""Converts Haystack filters to the format used by Qdrant."""
|
|
15
17
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
+
if not filter_term:
|
|
19
|
+
return None
|
|
18
20
|
|
|
19
|
-
|
|
20
|
-
self,
|
|
21
|
-
filter_term: Optional[Union[List[dict], dict]] = None,
|
|
22
|
-
) -> Optional[models.Filter]:
|
|
23
|
-
if not filter_term:
|
|
24
|
-
return None
|
|
21
|
+
must_clauses, should_clauses, must_not_clauses = [], [], []
|
|
25
22
|
|
|
26
|
-
|
|
23
|
+
if isinstance(filter_term, dict):
|
|
24
|
+
filter_term = [filter_term]
|
|
27
25
|
|
|
28
|
-
|
|
29
|
-
|
|
26
|
+
for item in filter_term:
|
|
27
|
+
operator = item.get("operator")
|
|
28
|
+
if operator is None:
|
|
29
|
+
msg = "Operator not found in filters"
|
|
30
|
+
raise FilterError(msg)
|
|
30
31
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
msg = "Operator not found in filters"
|
|
35
|
-
raise FilterError(msg)
|
|
32
|
+
if operator in LOGICAL_OPERATORS and "conditions" not in item:
|
|
33
|
+
msg = f"'conditions' not found for '{operator}'"
|
|
34
|
+
raise FilterError(msg)
|
|
36
35
|
|
|
37
|
-
|
|
38
|
-
|
|
36
|
+
if operator == "AND":
|
|
37
|
+
must_clauses.append(convert_filters_to_qdrant(item.get("conditions", [])))
|
|
38
|
+
elif operator == "OR":
|
|
39
|
+
should_clauses.append(convert_filters_to_qdrant(item.get("conditions", [])))
|
|
40
|
+
elif operator == "NOT":
|
|
41
|
+
must_not_clauses.append(convert_filters_to_qdrant(item.get("conditions", [])))
|
|
42
|
+
elif operator in COMPARISON_OPERATORS:
|
|
43
|
+
field = item.get("field")
|
|
44
|
+
value = item.get("value")
|
|
45
|
+
if field is None or value is None:
|
|
46
|
+
msg = f"'field' or 'value' not found for '{operator}'"
|
|
39
47
|
raise FilterError(msg)
|
|
40
48
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
elif operator == "NOT":
|
|
46
|
-
must_not_clauses.append(self.convert(item.get("conditions", [])))
|
|
47
|
-
elif operator in COMPARISON_OPERATORS:
|
|
48
|
-
field = item.get("field")
|
|
49
|
-
value = item.get("value")
|
|
50
|
-
if field is None or value is None:
|
|
51
|
-
msg = f"'field' or 'value' not found for '{operator}'"
|
|
52
|
-
raise FilterError(msg)
|
|
53
|
-
|
|
54
|
-
must_clauses.extend(
|
|
55
|
-
self._parse_comparison_operation(comparison_operation=operator, key=field, value=value)
|
|
56
|
-
)
|
|
57
|
-
else:
|
|
58
|
-
msg = f"Unknown operator {operator} used in filters"
|
|
59
|
-
raise FilterError(msg)
|
|
49
|
+
must_clauses.extend(_parse_comparison_operation(comparison_operation=operator, key=field, value=value))
|
|
50
|
+
else:
|
|
51
|
+
msg = f"Unknown operator {operator} used in filters"
|
|
52
|
+
raise FilterError(msg)
|
|
60
53
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
54
|
+
payload_filter = models.Filter(
|
|
55
|
+
must=must_clauses or None,
|
|
56
|
+
should=should_clauses or None,
|
|
57
|
+
must_not=must_not_clauses or None,
|
|
58
|
+
)
|
|
66
59
|
|
|
67
|
-
|
|
60
|
+
filter_result = _squeeze_filter(payload_filter)
|
|
68
61
|
|
|
69
|
-
|
|
62
|
+
return filter_result
|
|
70
63
|
|
|
71
|
-
def _parse_comparison_operation(
|
|
72
|
-
self, comparison_operation: str, key: str, value: Union[dict, List, str, float]
|
|
73
|
-
) -> List[models.Condition]:
|
|
74
|
-
conditions: List[models.Condition] = []
|
|
75
64
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
"not in": self._build_nin_condition,
|
|
81
|
-
">": self._build_gt_condition,
|
|
82
|
-
">=": self._build_gte_condition,
|
|
83
|
-
"<": self._build_lt_condition,
|
|
84
|
-
"<=": self._build_lte_condition,
|
|
85
|
-
}
|
|
65
|
+
def _parse_comparison_operation(
|
|
66
|
+
comparison_operation: str, key: str, value: Union[dict, List, str, float]
|
|
67
|
+
) -> List[models.Condition]:
|
|
68
|
+
conditions: List[models.Condition] = []
|
|
86
69
|
|
|
87
|
-
|
|
70
|
+
condition_builder_mapping = {
|
|
71
|
+
"==": _build_eq_condition,
|
|
72
|
+
"in": _build_in_condition,
|
|
73
|
+
"!=": _build_ne_condition,
|
|
74
|
+
"not in": _build_nin_condition,
|
|
75
|
+
">": _build_gt_condition,
|
|
76
|
+
">=": _build_gte_condition,
|
|
77
|
+
"<": _build_lt_condition,
|
|
78
|
+
"<=": _build_lte_condition,
|
|
79
|
+
}
|
|
88
80
|
|
|
89
|
-
|
|
90
|
-
msg = f"Unknown operator {comparison_operation} used in filters"
|
|
91
|
-
raise ValueError(msg)
|
|
81
|
+
condition_builder = condition_builder_mapping.get(comparison_operation)
|
|
92
82
|
|
|
93
|
-
|
|
83
|
+
if condition_builder is None:
|
|
84
|
+
msg = f"Unknown operator {comparison_operation} used in filters"
|
|
85
|
+
raise ValueError(msg)
|
|
94
86
|
|
|
95
|
-
|
|
87
|
+
conditions.append(condition_builder(key, value))
|
|
96
88
|
|
|
97
|
-
|
|
98
|
-
if isinstance(value, str) and " " in value:
|
|
99
|
-
models.FieldCondition(key=key, match=models.MatchText(text=value))
|
|
100
|
-
return models.FieldCondition(key=key, match=models.MatchValue(value=value))
|
|
89
|
+
return conditions
|
|
101
90
|
|
|
102
|
-
def _build_in_condition(self, key: str, value: List[models.ValueVariants]) -> models.Condition:
|
|
103
|
-
if not isinstance(value, list):
|
|
104
|
-
msg = f"Value {value} is not a list"
|
|
105
|
-
raise FilterError(msg)
|
|
106
|
-
return models.Filter(
|
|
107
|
-
should=[
|
|
108
|
-
(
|
|
109
|
-
models.FieldCondition(key=key, match=models.MatchText(text=item))
|
|
110
|
-
if isinstance(item, str) and " " not in item
|
|
111
|
-
else models.FieldCondition(key=key, match=models.MatchValue(value=item))
|
|
112
|
-
)
|
|
113
|
-
for item in value
|
|
114
|
-
]
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
def _build_ne_condition(self, key: str, value: models.ValueVariants) -> models.Condition:
|
|
118
|
-
return models.Filter(
|
|
119
|
-
must_not=[
|
|
120
|
-
(
|
|
121
|
-
models.FieldCondition(key=key, match=models.MatchText(text=value))
|
|
122
|
-
if isinstance(value, str) and " " not in value
|
|
123
|
-
else models.FieldCondition(key=key, match=models.MatchValue(value=value))
|
|
124
|
-
)
|
|
125
|
-
]
|
|
126
|
-
)
|
|
127
|
-
|
|
128
|
-
def _build_nin_condition(self, key: str, value: List[models.ValueVariants]) -> models.Condition:
|
|
129
|
-
if not isinstance(value, list):
|
|
130
|
-
msg = f"Value {value} is not a list"
|
|
131
|
-
raise FilterError(msg)
|
|
132
|
-
return models.Filter(
|
|
133
|
-
must_not=[
|
|
134
|
-
(
|
|
135
|
-
models.FieldCondition(key=key, match=models.MatchText(text=item))
|
|
136
|
-
if isinstance(item, str) and " " in item
|
|
137
|
-
else models.FieldCondition(key=key, match=models.MatchValue(value=item))
|
|
138
|
-
)
|
|
139
|
-
for item in value
|
|
140
|
-
]
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
def _build_lt_condition(self, key: str, value: Union[str, float, int]) -> models.Condition:
|
|
144
|
-
if isinstance(value, str) and is_datetime_string(value):
|
|
145
|
-
return models.FieldCondition(key=key, range=models.DatetimeRange(lt=value))
|
|
146
|
-
|
|
147
|
-
if isinstance(value, (int, float)):
|
|
148
|
-
return models.FieldCondition(key=key, range=models.Range(lt=value))
|
|
149
|
-
|
|
150
|
-
msg = f"Value {value} is not an int or float or datetime string"
|
|
151
|
-
raise FilterError(msg)
|
|
152
91
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
92
|
+
def _build_eq_condition(key: str, value: models.ValueVariants) -> models.Condition:
|
|
93
|
+
if isinstance(value, str) and " " in value:
|
|
94
|
+
models.FieldCondition(key=key, match=models.MatchText(text=value))
|
|
95
|
+
return models.FieldCondition(key=key, match=models.MatchValue(value=value))
|
|
156
96
|
|
|
157
|
-
if isinstance(value, (int, float)):
|
|
158
|
-
return models.FieldCondition(key=key, range=models.Range(lte=value))
|
|
159
97
|
|
|
160
|
-
|
|
98
|
+
def _build_in_condition(key: str, value: List[models.ValueVariants]) -> models.Condition:
|
|
99
|
+
if not isinstance(value, list):
|
|
100
|
+
msg = f"Value {value} is not a list"
|
|
101
|
+
raise FilterError(msg)
|
|
102
|
+
return models.Filter(
|
|
103
|
+
should=[
|
|
104
|
+
(
|
|
105
|
+
models.FieldCondition(key=key, match=models.MatchText(text=item))
|
|
106
|
+
if isinstance(item, str) and " " not in item
|
|
107
|
+
else models.FieldCondition(key=key, match=models.MatchValue(value=item))
|
|
108
|
+
)
|
|
109
|
+
for item in value
|
|
110
|
+
]
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_ne_condition(key: str, value: models.ValueVariants) -> models.Condition:
|
|
115
|
+
return models.Filter(
|
|
116
|
+
must_not=[
|
|
117
|
+
(
|
|
118
|
+
models.FieldCondition(key=key, match=models.MatchText(text=value))
|
|
119
|
+
if isinstance(value, str) and " " not in value
|
|
120
|
+
else models.FieldCondition(key=key, match=models.MatchValue(value=value))
|
|
121
|
+
)
|
|
122
|
+
]
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _build_nin_condition(key: str, value: List[models.ValueVariants]) -> models.Condition:
|
|
127
|
+
if not isinstance(value, list):
|
|
128
|
+
msg = f"Value {value} is not a list"
|
|
161
129
|
raise FilterError(msg)
|
|
130
|
+
return models.Filter(
|
|
131
|
+
must_not=[
|
|
132
|
+
(
|
|
133
|
+
models.FieldCondition(key=key, match=models.MatchText(text=item))
|
|
134
|
+
if isinstance(item, str) and " " in item
|
|
135
|
+
else models.FieldCondition(key=key, match=models.MatchValue(value=item))
|
|
136
|
+
)
|
|
137
|
+
for item in value
|
|
138
|
+
]
|
|
139
|
+
)
|
|
162
140
|
|
|
163
|
-
def _build_gt_condition(self, key: str, value: Union[str, float, int]) -> models.Condition:
|
|
164
|
-
if isinstance(value, str) and is_datetime_string(value):
|
|
165
|
-
return models.FieldCondition(key=key, range=models.DatetimeRange(gt=value))
|
|
166
141
|
|
|
167
|
-
|
|
168
|
-
|
|
142
|
+
def _build_lt_condition(key: str, value: Union[str, float, int]) -> models.Condition:
|
|
143
|
+
if isinstance(value, str) and is_datetime_string(value):
|
|
144
|
+
return models.FieldCondition(key=key, range=models.DatetimeRange(lt=value))
|
|
169
145
|
|
|
170
|
-
|
|
171
|
-
|
|
146
|
+
if isinstance(value, (int, float)):
|
|
147
|
+
return models.FieldCondition(key=key, range=models.Range(lt=value))
|
|
172
148
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
return models.FieldCondition(key=key, range=models.DatetimeRange(gte=value))
|
|
149
|
+
msg = f"Value {value} is not an int or float or datetime string"
|
|
150
|
+
raise FilterError(msg)
|
|
176
151
|
|
|
177
|
-
if isinstance(value, (int, float)):
|
|
178
|
-
return models.FieldCondition(key=key, range=models.Range(gte=value))
|
|
179
152
|
|
|
180
|
-
|
|
181
|
-
|
|
153
|
+
def _build_lte_condition(key: str, value: Union[str, float, int]) -> models.Condition:
|
|
154
|
+
if isinstance(value, str) and is_datetime_string(value):
|
|
155
|
+
return models.FieldCondition(key=key, range=models.DatetimeRange(lte=value))
|
|
156
|
+
|
|
157
|
+
if isinstance(value, (int, float)):
|
|
158
|
+
return models.FieldCondition(key=key, range=models.Range(lte=value))
|
|
159
|
+
|
|
160
|
+
msg = f"Value {value} is not an int or float or datetime string"
|
|
161
|
+
raise FilterError(msg)
|
|
162
|
+
|
|
182
163
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
# Ids are converted into their internal representation
|
|
187
|
-
self.haystack_to_qdrant_converter.convert_id(item)
|
|
188
|
-
for item in id_values
|
|
189
|
-
]
|
|
190
|
-
)
|
|
191
|
-
|
|
192
|
-
def _squeeze_filter(self, payload_filter: models.Filter) -> models.Filter:
|
|
193
|
-
"""
|
|
194
|
-
Simplify given payload filter, if the nested structure might be unnested.
|
|
195
|
-
That happens if there is a single clause in that filter.
|
|
196
|
-
:param payload_filter:
|
|
197
|
-
:returns:
|
|
198
|
-
"""
|
|
199
|
-
filter_parts = {
|
|
200
|
-
"must": payload_filter.must,
|
|
201
|
-
"should": payload_filter.should,
|
|
202
|
-
"must_not": payload_filter.must_not,
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
total_clauses = sum(len(x) for x in filter_parts.values() if x is not None)
|
|
206
|
-
if total_clauses == 0 or total_clauses > 1:
|
|
207
|
-
return payload_filter
|
|
208
|
-
|
|
209
|
-
# Payload filter has just a single clause provided (either must, should
|
|
210
|
-
# or must_not). If that single clause is also of a models.Filter type,
|
|
211
|
-
# then it might be returned instead.
|
|
212
|
-
for part_name, filter_part in filter_parts.items():
|
|
213
|
-
if not filter_part:
|
|
214
|
-
continue
|
|
215
|
-
|
|
216
|
-
subfilter = filter_part[0]
|
|
217
|
-
if not isinstance(subfilter, models.Filter):
|
|
218
|
-
# The inner statement is a simple condition like models.FieldCondition
|
|
219
|
-
# so it cannot be simplified.
|
|
220
|
-
continue
|
|
221
|
-
|
|
222
|
-
if subfilter.must:
|
|
223
|
-
return models.Filter(**{part_name: subfilter.must})
|
|
164
|
+
def _build_gt_condition(key: str, value: Union[str, float, int]) -> models.Condition:
|
|
165
|
+
if isinstance(value, str) and is_datetime_string(value):
|
|
166
|
+
return models.FieldCondition(key=key, range=models.DatetimeRange(gt=value))
|
|
224
167
|
|
|
168
|
+
if isinstance(value, (int, float)):
|
|
169
|
+
return models.FieldCondition(key=key, range=models.Range(gt=value))
|
|
170
|
+
|
|
171
|
+
msg = f"Value {value} is not an int or float or datetime string"
|
|
172
|
+
raise FilterError(msg)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _build_gte_condition(key: str, value: Union[str, float, int]) -> models.Condition:
|
|
176
|
+
if isinstance(value, str) and is_datetime_string(value):
|
|
177
|
+
return models.FieldCondition(key=key, range=models.DatetimeRange(gte=value))
|
|
178
|
+
|
|
179
|
+
if isinstance(value, (int, float)):
|
|
180
|
+
return models.FieldCondition(key=key, range=models.Range(gte=value))
|
|
181
|
+
|
|
182
|
+
msg = f"Value {value} is not an int or float or datetime string"
|
|
183
|
+
raise FilterError(msg)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _build_has_id_condition(id_values: List[models.ExtendedPointId]) -> models.HasIdCondition:
|
|
187
|
+
return models.HasIdCondition(
|
|
188
|
+
has_id=[
|
|
189
|
+
# Ids are converted into their internal representation
|
|
190
|
+
convert_id(item)
|
|
191
|
+
for item in id_values
|
|
192
|
+
]
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _squeeze_filter(payload_filter: models.Filter) -> models.Filter:
|
|
197
|
+
"""
|
|
198
|
+
Simplify given payload filter, if the nested structure might be unnested.
|
|
199
|
+
That happens if there is a single clause in that filter.
|
|
200
|
+
:param payload_filter:
|
|
201
|
+
:returns:
|
|
202
|
+
"""
|
|
203
|
+
filter_parts = {
|
|
204
|
+
"must": payload_filter.must,
|
|
205
|
+
"should": payload_filter.should,
|
|
206
|
+
"must_not": payload_filter.must_not,
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
total_clauses = sum(len(x) for x in filter_parts.values() if x is not None)
|
|
210
|
+
if total_clauses == 0 or total_clauses > 1:
|
|
225
211
|
return payload_filter
|
|
226
212
|
|
|
213
|
+
# Payload filter has just a single clause provided (either must, should
|
|
214
|
+
# or must_not). If that single clause is also of a models.Filter type,
|
|
215
|
+
# then it might be returned instead.
|
|
216
|
+
for part_name, filter_part in filter_parts.items():
|
|
217
|
+
if not filter_part:
|
|
218
|
+
continue
|
|
219
|
+
|
|
220
|
+
subfilter = filter_part[0]
|
|
221
|
+
if not isinstance(subfilter, models.Filter):
|
|
222
|
+
# The inner statement is a simple condition like models.FieldCondition
|
|
223
|
+
# so it cannot be simplified.
|
|
224
|
+
continue
|
|
225
|
+
|
|
226
|
+
if subfilter.must:
|
|
227
|
+
return models.Filter(**{part_name: subfilter.must})
|
|
228
|
+
|
|
229
|
+
return payload_filter
|
|
230
|
+
|
|
227
231
|
|
|
228
232
|
def is_datetime_string(value: str) -> bool:
|
|
229
233
|
try:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: qdrant-haystack
|
|
3
|
-
Version: 3.2.0
|
|
3
|
+
Version: 3.3.0
|
|
4
4
|
Summary: An integration of Qdrant ANN vector database backend with Haystack
|
|
5
5
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations
|
|
6
6
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/qdrant/README.md
|
|
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
17
17
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
18
18
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
19
19
|
Requires-Python: >=3.8
|
|
20
|
-
Requires-Dist: haystack-ai>=2.0.
|
|
20
|
+
Requires-Dist: haystack-ai>=2.0.1
|
|
21
21
|
Requires-Dist: qdrant-client
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
23
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
haystack_integrations/components/retrievers/qdrant/__init__.py,sha256=5P4opz_iVPY-6ntpNdQgTu2MAS102_ki8fT-rm7aiJs,247
|
|
2
|
+
haystack_integrations/components/retrievers/qdrant/retriever.py,sha256=M1_5suJnF9VDC0fPbHyfU8kx_YIYG7TEyAWkIYbLTN8,8817
|
|
3
|
+
haystack_integrations/document_stores/qdrant/__init__.py,sha256=PuGxUj29V00f6UiCpTHRkzGufL8bJUML2iNwJnX2KwM,195
|
|
4
|
+
haystack_integrations/document_stores/qdrant/converters.py,sha256=oSO2YlsWEQbcw9CPlWfSg_HoTZlnkAhZw_6VlYWzKKs,2525
|
|
5
|
+
haystack_integrations/document_stores/qdrant/document_store.py,sha256=HEUkzkcxRlTSegx1aq95ay1lJFaEmQh8NyxMf3TAarQ,22800
|
|
6
|
+
haystack_integrations/document_stores/qdrant/filters.py,sha256=iNWOqv1otUaXTURXd8e9QOYg8sx3Qm_LOqOaxAP2xJI,8249
|
|
7
|
+
qdrant_haystack-3.3.0.dist-info/METADATA,sha256=dqi3IRTo3fqIASBVbFW8I8y1F9a14vU4z-dAMJKlUOU,1799
|
|
8
|
+
qdrant_haystack-3.3.0.dist-info/WHEEL,sha256=as-1oFTWSeWBgyzh0O_qF439xqBe6AbBgt4MfYe5zwY,87
|
|
9
|
+
qdrant_haystack-3.3.0.dist-info/licenses/LICENSE.txt,sha256=B05uMshqTA74s-0ltyHKI6yoPfJ3zYgQbvcXfDVGFf8,10280
|
|
10
|
+
qdrant_haystack-3.3.0.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
haystack_integrations/components/retrievers/qdrant/__init__.py,sha256=uX6yULYL7RExXbKb9wD7TTz7fKupJf97lUi1YAHgJcY,200
|
|
2
|
-
haystack_integrations/components/retrievers/qdrant/retriever.py,sha256=Pu8qwKive3uvO3z4FnidgwXC0IUgxNRNuAeJrdE70rU,4423
|
|
3
|
-
haystack_integrations/document_stores/qdrant/__init__.py,sha256=PuGxUj29V00f6UiCpTHRkzGufL8bJUML2iNwJnX2KwM,195
|
|
4
|
-
haystack_integrations/document_stores/qdrant/converters.py,sha256=q_S3ATfX2KF4z9c2Z6t3mqz8GnULXAuKXIiGSSKGBJ0,2442
|
|
5
|
-
haystack_integrations/document_stores/qdrant/document_store.py,sha256=-ahV-OuBjGn-299DbLDjq3a0CN_UdqK_-a7NIV2ngYw,18939
|
|
6
|
-
haystack_integrations/document_stores/qdrant/filters.py,sha256=26sgZOdiXEJesk2NdB6NbQoAxEInpLNxO5pLkLnELKE,9170
|
|
7
|
-
qdrant_haystack-3.2.0.dist-info/METADATA,sha256=RrS-NvljhYaySQ2wyInt6Ko9m1xxcqwH8mUcUklhoHo,1801
|
|
8
|
-
qdrant_haystack-3.2.0.dist-info/WHEEL,sha256=uNdcs2TADwSd5pVaP0Z_kcjcvvTUklh2S7bxZMF8Uj0,87
|
|
9
|
-
qdrant_haystack-3.2.0.dist-info/licenses/LICENSE.txt,sha256=B05uMshqTA74s-0ltyHKI6yoPfJ3zYgQbvcXfDVGFf8,10280
|
|
10
|
-
qdrant_haystack-3.2.0.dist-info/RECORD,,
|
|
File without changes
|