elasticsearch-haystack 5.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- elasticsearch_haystack-5.1.0.dist-info/METADATA +41 -0
- elasticsearch_haystack-5.1.0.dist-info/RECORD +12 -0
- elasticsearch_haystack-5.1.0.dist-info/WHEEL +4 -0
- elasticsearch_haystack-5.1.0.dist-info/licenses/LICENSE +201 -0
- haystack_integrations/components/retrievers/elasticsearch/__init__.py +7 -0
- haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py +166 -0
- haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py +164 -0
- haystack_integrations/components/retrievers/py.typed +0 -0
- haystack_integrations/document_stores/elasticsearch/__init__.py +6 -0
- haystack_integrations/document_stores/elasticsearch/document_store.py +1477 -0
- haystack_integrations/document_stores/elasticsearch/filters.py +246 -0
- haystack_integrations/document_stores/py.typed +0 -0
haystack_integrations/document_stores/elasticsearch/document_store.py
@@ -0,0 +1,1477 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: FBT002, FBT001 boolean-type-hint-positional-argument and boolean-default-value-positional-argument
# ruff: noqa: B008 function-call-in-default-argument
# ruff: noqa: S101 disable checks for uses of the assert keyword


from collections.abc import Mapping
from typing import Any, Literal

import numpy as np
from elastic_transport import NodeConfig
from haystack import default_from_dict, default_to_dict, logging
from haystack.dataclasses import Document
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.version import __version__ as haystack_version

from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers

from .filters import _normalize_filters

logger = logging.getLogger(__name__)


Hosts = str | list[str | Mapping[str, str | int] | NodeConfig]

# Document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to
# True. Scaling uses the expit function (the inverse of the logit function) after applying a scaling factor
# (e.g., BM25_SCALING_FACTOR for the bm25_retrieval method).
# A larger scaling factor decreases the scaled scores. For example, an input of 10 is scaled to 0.99 with
# BM25_SCALING_FACTOR=2 but to 0.78 with BM25_SCALING_FACTOR=8 (the default). The defaults were chosen empirically.
# Increase the default if most unscaled scores are larger than expected (>30) and would otherwise incorrectly
# all be mapped to scores ~1.
BM25_SCALING_FACTOR = 8
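
# Illustrative sketch (not part of the package source): the scaling described above is
# expit(score / BM25_SCALING_FACTOR). With the default factor of 8, a raw BM25 score of 10
# maps to 1 / (1 + exp(-10 / 8)) ~= 0.78, while a factor of 2 would map the same score to
# 1 / (1 + exp(-10 / 2)) ~= 0.99, as mentioned in the comment above.
#
#     >>> import numpy as np
#     >>> float(1 / (1 + np.exp(-10 / 8)))   # approximate
#     0.777...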

DOC_ALREADY_EXISTS = 409

UPDATE_SCRIPT = """
for (entry in params.entrySet()) {
    ctx._source[entry.getKey()] = entry.getValue();
}
"""

SPECIAL_FIELDS = {"content", "embedding", "id", "score", "sparse_embedding", "blob"}


class ElasticsearchDocumentStore:
    """
    An ElasticsearchDocumentStore instance that works with Elastic Cloud or your own
    Elasticsearch cluster.

    Usage example (Elastic Cloud):
    ```python
    from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
    document_store = ElasticsearchDocumentStore(
        api_key_id=Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
        api_key=Secret.from_env_var("ELASTIC_API_KEY", strict=False),
    )
    ```

    Usage example (self-hosted Elasticsearch instance):
    ```python
    from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
    document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
    ```
    In the above example we connect with security disabled just to show the basic usage.
    We strongly recommend enabling security so that only authorized users can access your data.

    For more details on how to connect to Elasticsearch and configure security,
    see the official Elasticsearch
    [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)

    All extra keyword arguments will be passed to the Elasticsearch client.
    """

    def __init__(
        self,
        *,
        hosts: Hosts | None = None,
        custom_mapping: dict[str, Any] | None = None,
        index: str = "default",
        api_key: Secret = Secret.from_env_var("ELASTIC_API_KEY", strict=False),
        api_key_id: Secret = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
        embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
        **kwargs: Any,
    ):
        """
        Creates a new ElasticsearchDocumentStore instance.

        It will also try to create the index if it doesn't exist yet. Otherwise, it will use the existing one.

        One can also set the similarity function used to compare Document embeddings. This is mostly useful
        when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.

        For more information on connection parameters, see the official Elasticsearch
        [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)

        For the full list of supported kwargs, see the official Elasticsearch
        [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)

        Authentication is provided via Secret objects, which by default are loaded from environment variables.
        You can either provide both `api_key_id` and `api_key`, or just `api_key` containing a base64-encoded string
        of `id:secret`. Secret instances can also be loaded from a token using the `Secret.from_token()` method.

        :param hosts: List of hosts running the Elasticsearch client.
        :param custom_mapping: Custom mapping for the index. If not provided, a default mapping will be used.
        :param index: Name of the index in Elasticsearch.
        :param api_key: A Secret object containing either the API key for authenticating with Elasticsearch, or a
            base64-encoded string of the API key ID and API key concatenated and separated by ":".
        :param api_key_id: A Secret object containing the API key ID for authenticating with Elasticsearch.
        :param embedding_similarity_function: The similarity function used to compare Document embeddings.
            This parameter only takes effect if the index does not yet exist and is created.
            To choose the most appropriate function, look for information about your embedding model.
            To understand how document scores are computed, see the Elasticsearch
            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
        :param **kwargs: Optional arguments that `Elasticsearch` takes.
        """
        self._hosts = hosts
        self._client: Elasticsearch | None = None
        self._async_client: AsyncElasticsearch | None = None
        self._index = index
        self._api_key = api_key
        self._api_key_id = api_key_id
        self._embedding_similarity_function = embedding_similarity_function
        self._custom_mapping = custom_mapping
        self._kwargs = kwargs
        self._initialized = False

        if self._custom_mapping and not isinstance(self._custom_mapping, dict):
            msg = "custom_mapping must be a dictionary"
            raise ValueError(msg)

        if not self._custom_mapping:
            self._default_mappings = {
                "properties": {
                    "embedding": {
                        "type": "dense_vector",
                        "index": True,
                        "similarity": self._embedding_similarity_function,
                    },
                    "content": {"type": "text"},
                },
                "dynamic_templates": [
                    {
                        "strings": {
                            "path_match": "*",
                            "match_mapping_type": "string",
                            "mapping": {
                                "type": "keyword",
                            },
                        }
                    }
                ],
            }

    def _ensure_initialized(self):
        """
        Ensures both sync and async clients are initialized and the index exists.
        """
        if not self._initialized:
            headers = self._kwargs.pop("headers", {})
            headers["user-agent"] = f"haystack-py-ds/{haystack_version}"

            api_key = self._handle_auth()

            # Initialize both sync and async clients
            self._client = Elasticsearch(
                self._hosts,
                api_key=api_key,
                headers=headers,
                **self._kwargs,
            )
            self._async_client = AsyncElasticsearch(
                self._hosts,
                api_key=api_key,
                headers=headers,
                **self._kwargs,
            )

            # Check client connection, this will raise if not connected
            self._client.info()

            if self._custom_mapping:
                mappings = self._custom_mapping
            else:
                # Configure mapping for the embedding field if none is provided
                mappings = self._default_mappings

            # Create the index if it doesn't exist
            if not self._client.indices.exists(index=self._index):
                self._client.indices.create(index=self._index, mappings=mappings)

            self._initialized = True

    def _handle_auth(self) -> str | tuple[str, str] | None:
        """
        Handles authentication for the Elasticsearch client.

        There are three possible scenarios.

        1) Authentication with both api_key and api_key_id, either as Secrets or as environment variables.
           In this case, use both for authentication.

        2) Authentication with only api_key, either as a Secret or as an environment variable. In this case,
           the api_key must be a base64-encoded string that encodes both id and secret <id:secret>.

        3) There's no authentication: neither api_key nor api_key_id is provided as a Secret or defined as an
           environment variable. In this case, the client will connect without authentication.

        :returns:
            The resolved credentials: an `(api_key_id, api_key)` tuple, a single base64-encoded API key string,
            or `None` if no authentication is configured.
        """

        api_key: str | tuple[str, str] | None  # make the type checker happy

        api_key_resolved = self._api_key.resolve_value()
        api_key_id_resolved = self._api_key_id.resolve_value()

        # Scenario 1: both are found, use them
        if api_key_id_resolved and api_key_resolved:
            api_key = (api_key_id_resolved, api_key_resolved)
            return api_key

        # Scenario 2: only api_key is set, must be a base64-encoded string that encodes id and secret (separated by ":")
        elif api_key_resolved and not api_key_id_resolved:
            return api_key_resolved

        # Error: only api_key_id is found, raise an error
        elif api_key_id_resolved and not api_key_resolved:
            msg = "api_key_id is provided but api_key is missing."
            raise ValueError(msg)

        else:
            # Scenario 3: neither found, no authentication
            return None
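
    # Illustrative sketch (assumed values): the two supported ways to authenticate are
    #
    #     export ELASTIC_API_KEY_ID="my-key-id"
    #     export ELASTIC_API_KEY="my-key-secret"
    #
    # or a single base64-encoded key, e.g. ELASTIC_API_KEY set to base64("my-key-id:my-key-secret").
    # With neither variable set, the store connects without authentication (scenario 3 above).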

    @property
    def client(self) -> Elasticsearch:
        """
        Returns the synchronous Elasticsearch client, initializing it if necessary.
        """
        self._ensure_initialized()
        assert self._client is not None
        return self._client

    @property
    def async_client(self) -> AsyncElasticsearch:
        """
        Returns the asynchronous Elasticsearch client, initializing it if necessary.
        """
        self._ensure_initialized()
        assert self._async_client is not None
        return self._async_client

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        # This is not the best solution to serialise this class but is the fastest to implement.
        # Not all kwargs types can be serialised to text so this can fail. We must serialise each
        # type explicitly to handle this properly.
        return default_to_dict(
            self,
            hosts=self._hosts,
            custom_mapping=self._custom_mapping,
            index=self._index,
            api_key=self._api_key.to_dict(),
            api_key_id=self._api_key_id.to_dict(),
            embedding_similarity_function=self._embedding_similarity_function,
            **self._kwargs,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ElasticsearchDocumentStore":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(data, keys=["api_key", "api_key_id"])
        return default_from_dict(cls, data)

    def count_documents(self) -> int:
        """
        Returns how many documents are present in the document store.

        :returns:
            Number of documents in the document store.
        """
        self._ensure_initialized()
        return self.client.count(index=self._index)["count"]

    async def count_documents_async(self) -> int:
        """
        Asynchronously returns how many documents are present in the document store.
        :returns: Number of documents in the document store.
        """
        self._ensure_initialized()
        result = await self._async_client.count(index=self._index)  # type: ignore
        return result["count"]

    def _search_documents(self, **kwargs: Any) -> list[Document]:
        """
        Calls the Elasticsearch client's search method and handles pagination.
        """
        top_k = kwargs.get("size")
        if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
            top_k = kwargs["knn"]["k"]

        documents: list[Document] = []
        from_ = 0
        # Handle pagination
        while True:
            res = self.client.search(
                index=self._index,
                from_=from_,
                **kwargs,
            )

            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
            from_ = len(documents)

            if top_k is not None and from_ >= top_k:
                break
            if from_ >= res["hits"]["total"]["value"]:
                break
        return documents

    async def _search_documents_async(self, **kwargs: Any) -> list[Document]:
        """
        Asynchronously calls the Elasticsearch client's search method and handles pagination.
        """
        top_k = kwargs.get("size")
        if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
            top_k = kwargs["knn"]["k"]

        documents: list[Document] = []
        from_ = 0

        # handle pagination
        while True:
            res = await self._async_client.search(index=self._index, from_=from_, **kwargs)  # type: ignore
            documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])  # type: ignore
            from_ = len(documents)

            if top_k is not None and from_ >= top_k:
                break

            if from_ >= res["hits"]["total"]["value"]:
                break

        return documents

    def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
        """
        The main query method for the document store. It retrieves all documents that match the filters.

        :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
            see the official Elasticsearch
            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
        :returns: List of `Document`s that match the filters.
        """
        if filters and "operator" not in filters and "conditions" not in filters:
            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
            raise ValueError(msg)

        self._ensure_initialized()
        query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
        documents = self._search_documents(query=query)
        return documents
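
    # Illustrative sketch (assumed field names): filters follow the Haystack metadata-filter
    # syntax linked above, e.g. retrieving every document whose "category" meta field is "news":
    #
    #     store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
    #     docs = store.filter_documents(
    #         filters={"field": "meta.category", "operator": "==", "value": "news"}
    #     )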

    async def filter_documents_async(self, filters: dict[str, Any] | None = None) -> list[Document]:
        """
        Asynchronously retrieves all documents that match the filters.

        :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
            see the official Elasticsearch
            [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
        :returns: List of `Document`s that match the filters.
        """
        if filters and "operator" not in filters and "conditions" not in filters:
            msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
            raise ValueError(msg)

        self._ensure_initialized()
        query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
        documents = await self._search_documents_async(query=query)
        return documents

    @staticmethod
    def _deserialize_document(hit: dict[str, Any]) -> Document:
        """
        Creates a `Document` from the search hit provided.
        This is mostly useful in self.filter_documents().
        :param hit: A search hit from Elasticsearch.
        :returns: `Document` created from the search hit.
        """
        data = hit["_source"]

        if "highlight" in hit:
            data["metadata"]["highlighted"] = hit["highlight"]
        data["score"] = hit["_score"]

        return Document.from_dict(data)

    def write_documents(
        self,
        documents: list[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
        refresh: Literal["wait_for", True, False] = "wait_for",
    ) -> int:
        """
        Writes `Document`s to Elasticsearch.

        :param documents: List of Documents to write to the document store.
        :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
        :param refresh: Controls when changes are made visible to search operations.
            - `True`: Force refresh immediately after the operation.
            - `False`: Do not refresh (better performance for bulk operations).
            - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
            For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
        :raises ValueError: If `documents` is not a list of `Document`s.
        :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
            `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
        :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
        :returns: Number of documents written to the document store.
        """
        if len(documents) > 0:
            if not isinstance(documents[0], Document):
                msg = "param 'documents' must contain a list of objects of type Document"
                raise ValueError(msg)

        if policy == DuplicatePolicy.NONE:
            policy = DuplicatePolicy.FAIL

        action = "index" if policy == DuplicatePolicy.OVERWRITE else "create"

        elasticsearch_actions = []
        for doc in documents:
            doc_dict = doc.to_dict()

            if "sparse_embedding" in doc_dict:
                sparse_embedding = doc_dict.pop("sparse_embedding", None)
                if sparse_embedding:
                    logger.warning(
                        "Document {doc_id} has the `sparse_embedding` field set, "
                        "but storing sparse embeddings in Elasticsearch is not currently supported. "
                        "The `sparse_embedding` field will be ignored.",
                        doc_id=doc.id,
                    )
            elasticsearch_actions.append(
                {
                    "_op_type": action,
                    "_id": doc.id,
                    "_source": doc_dict,
                }
            )

        documents_written, errors = helpers.bulk(
            client=self.client,
            actions=elasticsearch_actions,
            refresh=refresh,
            index=self._index,
            raise_on_error=False,
            stats_only=False,
        )

        if errors:
            # with stats_only=False, errors is guaranteed to be a list of dicts
            assert isinstance(errors, list)
            duplicate_errors_ids = []
            other_errors = []
            for e in errors:
                error_type = e["create"]["error"]["type"]
                if policy == DuplicatePolicy.FAIL and error_type == "version_conflict_engine_exception":
                    duplicate_errors_ids.append(e["create"]["_id"])
                elif policy == DuplicatePolicy.SKIP and error_type == "version_conflict_engine_exception":
                    # when the policy is skip, duplication errors are OK and we should not raise an exception
                    continue
                else:
                    other_errors.append(e)

            if len(duplicate_errors_ids) > 0:
                msg = f"IDs '{', '.join(duplicate_errors_ids)}' already exist in the document store."
                raise DuplicateDocumentError(msg)

            if len(other_errors) > 0:
                msg = f"Failed to write documents to Elasticsearch. Errors:\n{other_errors}"
                raise DocumentStoreError(msg)

        return documents_written
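
    # Illustrative sketch (assumed document contents): a bulk ingest that overwrites existing
    # IDs and skips the per-write refresh for speed, refreshing once afterwards:
    #
    #     store.write_documents(
    #         [Document(content="hello"), Document(content="world")],
    #         policy=DuplicatePolicy.OVERWRITE,
    #         refresh=False,
    #     )
    #     store.client.indices.refresh(index="default")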

    async def write_documents_async(
        self,
        documents: list[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
        refresh: Literal["wait_for", True, False] = "wait_for",
    ) -> int:
        """
        Asynchronously writes `Document`s to Elasticsearch.

        :param documents: List of Documents to write to the document store.
        :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
        :param refresh: Controls when changes are made visible to search operations.
            - `True`: Force refresh immediately after the operation.
            - `False`: Do not refresh (better performance for bulk operations).
            - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
            For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
        :raises ValueError: If `documents` is not a list of `Document`s.
        :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
            `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
        :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
        :returns: Number of documents written to the document store.
        """
        self._ensure_initialized()

        if len(documents) > 0:
            if not isinstance(documents[0], Document):
                msg = "param 'documents' must contain a list of objects of type Document"
                raise ValueError(msg)

        if policy == DuplicatePolicy.NONE:
            policy = DuplicatePolicy.FAIL

        actions = []
        for doc in documents:
            doc_dict = doc.to_dict()

            if "sparse_embedding" in doc_dict:
                sparse_embedding = doc_dict.pop("sparse_embedding", None)
                if sparse_embedding:
                    logger.warning(
                        "Document {doc_id} has the `sparse_embedding` field set, "
                        "but storing sparse embeddings in Elasticsearch is not currently supported. "
                        "The `sparse_embedding` field will be ignored.",
                        doc_id=doc.id,
                    )

            action = {
                "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
                "_id": doc.id,
                "_source": doc_dict,
            }
            actions.append(action)

        try:
            success, failed = await helpers.async_bulk(
                client=self.async_client,
                actions=actions,
                index=self._index,
                refresh=refresh,
                raise_on_error=False,
                stats_only=False,
            )
            if failed:
                # with stats_only=False, failed is guaranteed to be a list of dicts
                assert isinstance(failed, list)
                if policy == DuplicatePolicy.FAIL:
                    for error in failed:
                        if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
                            msg = f"ID '{error['create']['_id']}' already exists in the document store"
                            raise DuplicateDocumentError(msg)
                msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
                raise DocumentStoreError(msg)
            return success
        except Exception as e:
            msg = f"Failed to write documents to Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    def delete_documents(self, document_ids: list[str], refresh: Literal["wait_for", True, False] = "wait_for") -> None:
        """
        Deletes all documents whose ID is in `document_ids` from the document store.

        :param document_ids: the document ids to delete
        :param refresh: Controls when changes are made visible to search operations.
            - `True`: Force refresh immediately after the operation.
            - `False`: Do not refresh (better performance for bulk operations).
            - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
            For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
        """
        helpers.bulk(
            client=self.client,
            actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
            refresh=refresh,
            index=self._index,
            raise_on_error=False,
        )

    def _prepare_delete_all_request(self, *, is_async: bool, refresh: bool) -> dict[str, Any]:
        return {
            "index": self._index,
            "body": {"query": {"match_all": {}}},  # Delete all documents
            "wait_for_completion": False if is_async else True,  # block until done (set False for async)
            "refresh": refresh,
        }

    async def delete_documents_async(
        self, document_ids: list[str], refresh: Literal["wait_for", True, False] = "wait_for"
    ) -> None:
        """
        Asynchronously deletes all documents whose ID is in `document_ids` from the document store.

        :param document_ids: the document ids to delete
        :param refresh: Controls when changes are made visible to search operations.
            - `True`: Force refresh immediately after the operation.
            - `False`: Do not refresh (better performance for bulk operations).
            - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
            For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
        """
        self._ensure_initialized()

        try:
            await helpers.async_bulk(
                client=self.async_client,
                actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
                index=self._index,
                refresh=refresh,
            )
        except Exception as e:
            msg = f"Failed to delete documents from Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    def delete_all_documents(self, recreate_index: bool = False, refresh: bool = True) -> None:
        """
        Deletes all documents in the document store.

        A fast way to clear all documents from the document store while preserving any index settings and mappings.

        :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
            settings. If False, all documents will be deleted using the `delete_by_query` API.
        :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
            completes. If False, no refresh is performed. For more details, see the
            [Elasticsearch delete_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-delete-by-query#operation-delete-by-query-refresh).
        """
        self._ensure_initialized()  # _ensure_initialized ensures _client is not None and an index exists

        if recreate_index:
            # get the current index mappings and settings
            index_name = self._index
            mappings = self._client.indices.get(index=self._index)[index_name]["mappings"]  # type: ignore
            settings = self._client.indices.get(index=self._index)[index_name]["settings"]  # type: ignore

            # remove settings that cannot be set during index creation
            settings["index"].pop("uuid", None)
            settings["index"].pop("creation_date", None)
            settings["index"].pop("provided_name", None)
            settings["index"].pop("version", None)

            # delete the index and recreate it with the original settings and mappings
            self._client.indices.delete(index=self._index)  # type: ignore
            self._client.indices.create(index=self._index, settings=settings, mappings=mappings)  # type: ignore

        else:
            result = self._client.delete_by_query(**self._prepare_delete_all_request(is_async=False, refresh=refresh))  # type: ignore
            logger.info(
                "Deleted all the {n_docs} documents from the index '{index}'.",
                index=self._index,
                n_docs=result["deleted"],
            )

    async def delete_all_documents_async(self, recreate_index: bool = False, refresh: bool = True) -> None:
        """
        Asynchronously deletes all documents in the document store.

        A fast way to clear all documents from the document store while preserving any index settings and mappings.
        :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
            settings. If False, all documents will be deleted using the `delete_by_query` API.
        :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
            completes. If False, no refresh is performed. For more details, see the
            [Elasticsearch delete_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-delete-by-query#operation-delete-by-query-refresh).
        """
        self._ensure_initialized()  # ensures _async_client is not None

        try:
            if recreate_index:
                # get the current index mappings and settings
                index_name = self._index
                index_info = await self._async_client.indices.get(index=self._index)  # type: ignore
                mappings = index_info[index_name]["mappings"]
                settings = index_info[index_name]["settings"]

                # remove settings that cannot be set during index creation
                settings["index"].pop("uuid", None)
                settings["index"].pop("creation_date", None)
                settings["index"].pop("provided_name", None)
                settings["index"].pop("version", None)

                # delete index
                await self._async_client.indices.delete(index=self._index)  # type: ignore

                # recreate with settings and mappings
                await self._async_client.indices.create(index=self._index, settings=settings, mappings=mappings)  # type: ignore

            else:
                # use delete_by_query for more efficient deletion without index recreation
                # For async, we need to wait for completion to get the deleted count
                delete_request = self._prepare_delete_all_request(is_async=True, refresh=refresh)
                delete_request["wait_for_completion"] = True  # Override to wait for completion in async
                result = await self._async_client.delete_by_query(**delete_request)  # type: ignore
                logger.info(
                    "Deleted all the {n_docs} documents from the index '{index}'.",
                    index=self._index,
                    n_docs=result["deleted"],
                )

        except Exception as e:
            msg = f"Failed to delete all documents from Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    def delete_by_filter(self, filters: dict[str, Any], refresh: bool = False) -> int:
        """
        Deletes all documents that match the provided filters.

        :param filters: The filters to apply to select documents for deletion.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
            completes. If False, no refresh is performed. For more details, see the
            [Elasticsearch delete_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-delete-by-query#operation-delete-by-query-refresh).
        :returns: The number of documents deleted.
        """
        self._ensure_initialized()

        try:
            normalized_filters = _normalize_filters(filters)
            body = {"query": {"bool": {"filter": normalized_filters}}}
            result = self.client.delete_by_query(index=self._index, body=body, refresh=refresh)  # type: ignore
            deleted_count = result.get("deleted", 0)
            logger.info(
                "Deleted {n_docs} documents from index '{index}' using filters.",
                n_docs=deleted_count,
                index=self._index,
            )
            return deleted_count
        except Exception as e:
            msg = f"Failed to delete documents by filter from Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    async def delete_by_filter_async(self, filters: dict[str, Any], refresh: bool = False) -> int:
        """
        Asynchronously deletes all documents that match the provided filters.

        :param filters: The filters to apply to select documents for deletion.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
            completes. If False, no refresh is performed. For more details, see the
            [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
        :returns: The number of documents deleted.
        """
        self._ensure_initialized()

        try:
            normalized_filters = _normalize_filters(filters)
            body = {"query": {"bool": {"filter": normalized_filters}}}
            result = await self.async_client.delete_by_query(index=self._index, body=body, refresh=refresh)  # type: ignore
            deleted_count = result.get("deleted", 0)
            logger.info(
                "Deleted {n_docs} documents from index '{index}' using filters.",
                n_docs=deleted_count,
                index=self._index,
            )
            return deleted_count
        except Exception as e:
            msg = f"Failed to delete documents by filter from Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any], refresh: bool = False) -> int:
        """
        Updates the metadata of all documents that match the provided filters.

        :param filters: The filters to apply to select documents for updating.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :param meta: The metadata fields to update.
        :param refresh: If True, Elasticsearch refreshes all shards involved in the update by query after the request
            completes. If False, no refresh is performed. For more details, see the
            [Elasticsearch update_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-update-by-query#operation-update-by-query-refresh).
        :returns: The number of documents updated.
        """
        self._ensure_initialized()

        try:
            normalized_filters = _normalize_filters(filters)
            # Build the update script to modify metadata fields
            # Documents are stored with flattened metadata, so update fields directly in ctx._source
            body = {
                "query": {"bool": {"filter": normalized_filters}},
                "script": {"source": UPDATE_SCRIPT, "params": meta, "lang": "painless"},
            }

            result = self.client.update_by_query(index=self._index, body=body, refresh=refresh)  # type: ignore
            updated_count = result.get("updated", 0)
            logger.info(
                "Updated {n_docs} documents in index '{index}' using filters.",
                n_docs=updated_count,
                index=self._index,
            )
            return updated_count
        except Exception as e:
            msg = f"Failed to update documents by filter in Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any], refresh: bool = False) -> int:
        """
        Asynchronously updates the metadata of all documents that match the provided filters.

        :param filters: The filters to apply to select documents for updating.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :param meta: The metadata fields to update.
        :param refresh: If True, Elasticsearch refreshes all shards involved in the update by query after the request
            completes. If False, no refresh is performed. For more details, see the
            [Elasticsearch update_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-update-by-query#operation-update-by-query-refresh).
        :returns: The number of documents updated.
        """
        self._ensure_initialized()

        try:
            normalized_filters = _normalize_filters(filters)
            # Build the update script to modify metadata fields
            # Documents are stored with flattened metadata, so update fields directly in ctx._source
            body = {
                "query": {"bool": {"filter": normalized_filters}},
                "script": {"source": UPDATE_SCRIPT, "params": meta, "lang": "painless"},
            }

            result = await self.async_client.update_by_query(index=self._index, body=body, refresh=refresh)  # type: ignore
            updated_count = result.get("updated", 0)
            logger.info(
                "Updated {n_docs} documents in index '{index}' using filters.",
                n_docs=updated_count,
                index=self._index,
            )
            return updated_count
        except Exception as e:
            msg = f"Failed to update documents by filter in Elasticsearch: {e!s}"
            raise DocumentStoreError(msg) from e

    def _bm25_retrieval(
        self,
        query: str,
        *,
        filters: dict[str, Any] | None = None,
        fuzziness: str = "AUTO",
        top_k: int = 10,
        scale_score: bool = False,
    ) -> list[Document]:
        """
        Retrieves documents using BM25 retrieval.

        :param query: The query string to search for
        :param filters: Optional filters to narrow down the search space
        :param fuzziness: Fuzziness parameter for the search query
        :param top_k: Maximum number of documents to return
        :param scale_score: Whether to scale the similarity score to the range [0,1]
        :returns: List of Documents that match the query
        :raises ValueError: If `query` is an empty string
        """
        if not query:
            msg = "query must be a non empty string"
            raise ValueError(msg)

        body: dict[str, Any] = {
            "size": top_k,
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": query,
                                "fuzziness": fuzziness,
                                "type": "most_fields",
                                "operator": "OR",
                            }
                        }
                    ]
                }
            },
        }

        if filters:
            body["query"]["bool"]["filter"] = _normalize_filters(filters)

        documents = self._search_documents(**body)

        if scale_score:
            for doc in documents:
                if doc.score is None:
                    continue
                doc.score = float(1 / (1 + np.exp(-np.asarray(doc.score / BM25_SCALING_FACTOR))))

        return documents

    async def _bm25_retrieval_async(
        self,
        query: str,
        *,
        filters: dict[str, Any] | None = None,
        fuzziness: str = "AUTO",
        top_k: int = 10,
        scale_score: bool = False,
    ) -> list[Document]:
        """
        Asynchronously retrieves documents using BM25 retrieval.

        :param query: The query string to search for
        :param filters: Optional filters to narrow down the search space
        :param fuzziness: Fuzziness parameter for the search query
        :param top_k: Maximum number of documents to return
        :param scale_score: Whether to scale the similarity score to the range [0,1]
        :returns: List of Documents that match the query
        :raises ValueError: If `query` is an empty string
        """
        self._ensure_initialized()

        if not query:
            msg = "query must be a non empty string"
            raise ValueError(msg)

        # Prepare the search body
        search_body = {
            "size": top_k,
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": query,
                                "type": "most_fields",
                                "operator": "OR",
                                "fuzziness": fuzziness,
                            }
                        }
                    ]
                }
            },
        }

        if filters:
            search_body["query"]["bool"]["filter"] = _normalize_filters(filters)  # type:ignore

        documents = await self._search_documents_async(**search_body)

        if scale_score:
            for doc in documents:
                if doc.score is not None:
                    doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))

        return documents

    def _embedding_retrieval(
        self,
        query_embedding: list[float],
        *,
        filters: dict[str, Any] | None = None,
        top_k: int = 10,
        num_candidates: int | None = None,
    ) -> list[Document]:
        """
        Retrieves documents using dense vector similarity search.

        :param query_embedding: Embedding vector to search for
        :param filters: Optional filters to narrow down the search space
        :param top_k: Maximum number of documents to return
        :param num_candidates: Number of candidates to consider in the search
        :returns: List of Documents most similar to query_embedding
        :raises ValueError: If query_embedding is empty
        """
        if not query_embedding:
            msg = "query_embedding must be a non-empty list of floats"
            raise ValueError(msg)

        if not num_candidates:
            num_candidates = top_k * 10

        body: dict[str, Any] = {
            "knn": {
                "field": "embedding",
                "query_vector": query_embedding,
                "k": top_k,
                "num_candidates": num_candidates,
            },
        }

        if filters:
            body["knn"]["filter"] = _normalize_filters(filters)

        docs = self._search_documents(**body)
        return docs
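
    # Illustrative sketch (assumed vector size): with the default above, top_k=5 leads to
    # num_candidates=50 being passed to Elasticsearch's kNN search, e.g. for a 384-dimensional
    # embedding model:
    #
    #     docs = store._embedding_retrieval(query_embedding=[0.1] * 384, top_k=5)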

    async def _embedding_retrieval_async(
        self,
        query_embedding: list[float],
        *,
        filters: dict[str, Any] | None = None,
        top_k: int = 10,
        num_candidates: int | None = None,
    ) -> list[Document]:
        """
        Asynchronously retrieves documents using dense vector similarity search.

        :param query_embedding: Embedding vector to search for
        :param filters: Optional filters to narrow down the search space
        :param top_k: Maximum number of documents to return
        :param num_candidates: Number of candidates to consider in the search
        :returns: List of Documents most similar to query_embedding
        :raises ValueError: If query_embedding is empty
        """
        self._ensure_initialized()

        if not query_embedding:
            msg = "query_embedding must be a non-empty list of floats"
            raise ValueError(msg)

        # If num_candidates is not set, use top_k * 10 as default
        if num_candidates is None:
            num_candidates = top_k * 10

        # Prepare the search body
        search_body = {
            "knn": {
                "field": "embedding",
                "query_vector": query_embedding,
                "k": top_k,
                "num_candidates": num_candidates,
            },
        }

        if filters:
            search_body["knn"]["filter"] = _normalize_filters(filters)

        return await self._search_documents_async(**search_body)

    def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
        """
        Returns the number of documents that match the provided filters.

        :param filters: The filters to apply to count documents.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :returns: The number of documents that match the filters.
        """
        self._ensure_initialized()

        normalized_filters = _normalize_filters(filters)
        body = {"query": {"bool": {"filter": normalized_filters}}}
        return self.client.count(index=self._index, body=body)["count"]

    async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int:
        """
        Asynchronously returns the number of documents that match the provided filters.

        :param filters: The filters to apply to count documents.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :returns: The number of documents that match the filters.
        """
        self._ensure_initialized()

        normalized_filters = _normalize_filters(filters)
        body = {"query": {"bool": {"filter": normalized_filters}}}
        result = await self.async_client.count(index=self._index, body=body)
        return result["count"]

    @staticmethod
    def _normalize_metadata_field_name(metadata_field: str) -> str:
        """
        Normalizes a metadata field name by removing the "meta." prefix if present.
        """
        return metadata_field[5:] if metadata_field.startswith("meta.") else metadata_field

    @staticmethod
    def _build_cardinality_aggregations(index_mapping: dict[str, Any], fields: list[str]) -> dict[str, Any]:
        """
        Builds cardinality aggregations for specified metadata fields in the index mapping.

        :param index_mapping: The index mapping containing field definitions.
        :param fields: List of field names to build aggregations for.
        :returns: Dictionary of cardinality aggregations.

        See: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html
        """
        aggs = {}
        for field_name in fields:
            if field_name not in SPECIAL_FIELDS and field_name in index_mapping:
                aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}}
        return aggs

    @staticmethod
    def _build_distinct_values_query_body(filters: dict[str, Any] | None, aggs: dict[str, Any]) -> dict[str, Any]:
        """
        Builds the query body for distinct values counting with filters and aggregations.
        """
        if filters:
            normalized_filters = _normalize_filters(filters)
            return {
                "query": {"bool": {"filter": normalized_filters}},
                "aggs": aggs,
                "size": 0,  # we only need aggregations, not documents
            }
        else:
            return {
                "query": {"match_all": {}},
                "aggs": aggs,
                "size": 0,  # we only need aggregations, not documents
            }

    @staticmethod
    def _extract_distinct_counts_from_aggregations(
        aggregations: dict[str, Any], index_mapping: dict[str, Any], fields: list[str]
    ) -> dict[str, int]:
        """
        Extracts distinct value counts from search result aggregations.

        :param aggregations: The aggregations result from the search query.
        :param index_mapping: The index mapping containing field definitions.
        :param fields: List of field names to extract counts for.
        :returns: Dictionary mapping field names to their distinct value counts.
        """
        distinct_counts = {}
        for field_name in fields:
            if field_name not in SPECIAL_FIELDS and field_name in index_mapping:
                agg_key = f"{field_name}_cardinality"
                if agg_key in aggregations:
                    distinct_counts[field_name] = aggregations[agg_key]["value"]
        return distinct_counts

    def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
        """
        Returns the number of unique values for each specified metadata field of the documents
        that match the provided filters.

        :param filters: The filters to apply to count documents.
            For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
        :param metadata_fields: List of field names to calculate unique values for.
            Field names can include or omit the "meta." prefix.
        :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
            documents.
        :raises ValueError: If any of the requested fields don't exist in the index mapping.
        """
        self._ensure_initialized()

        # use the index mapping to get all fields
        mapping = self.client.indices.get_mapping(index=self._index)
        index_mapping = mapping[self._index]["mappings"]["properties"]

        # normalize field names, e.g. remove the "meta." prefix if present
        normalized_metadata_fields = [self._normalize_metadata_field_name(field) for field in metadata_fields]

        # validate that all requested fields exist in the index mapping
        missing_fields = [f for f in normalized_metadata_fields if f not in index_mapping]
        if missing_fields:
            msg = f"Fields not found in index mapping: {missing_fields}"
            raise ValueError(msg)

        # build aggregations for the specified metadata fields
        aggs = self._build_cardinality_aggregations(index_mapping, normalized_metadata_fields)
        if not aggs:
            return {}

        # build and execute the search query
        body = self._build_distinct_values_query_body(filters, aggs)
        result = self.client.search(index=self._index, body=body)

        # extract cardinality values from the aggregations
        return self._extract_distinct_counts_from_aggregations(
            result.get("aggregations", {}), index_mapping, normalized_metadata_fields
        )
|
|
1179
|
+
|
|
1180
|
+
async def count_unique_metadata_by_filter_async(
|
|
1181
|
+
self, filters: dict[str, Any], metadata_fields: list[str]
|
|
1182
|
+
) -> dict[str, int]:
|
|
1183
|
+
"""
|
|
1184
|
+
Asynchronously returns the number of unique values for each specified metadata field of the documents
|
|
1185
|
+
that match the provided filters.
|
|
1186
|
+
|
|
1187
|
+
:param filters: The filters to apply to count documents.
|
|
1188
|
+
For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
|
|
1189
|
+
:param metadata_fields: List of field names to calculate unique values for.
|
|
1190
|
+
Field names can include or omit the "meta." prefix.
|
|
1191
|
+
:returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
|
|
1192
|
+
documents.
|
|
1193
|
+
:raises ValueError: If any of the requested fields don't exist in the index mapping.
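
        Usage example (an illustrative sketch; assumes an `ElasticsearchDocumentStore` instance named
        `document_store` configured with an async-capable client):

        ```python
        import asyncio

        async def count_unique() -> dict[str, int]:
            return await document_store.count_unique_metadata_by_filter_async(
                filters={"field": "meta.category", "operator": "==", "value": "A"},
                metadata_fields=["status"],
            )

        counts = asyncio.run(count_unique())
        ```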
        """
        self._ensure_initialized()

        # use index mapping to get all fields
        mapping = await self.async_client.indices.get_mapping(index=self._index)
        index_mapping = mapping[self._index]["mappings"]["properties"]

        # normalize field names
        normalized_metadata_fields = [self._normalize_metadata_field_name(field) for field in metadata_fields]
        # validate that all requested fields exist in the index mapping
        missing_fields = [f for f in normalized_metadata_fields if f not in index_mapping]
        if missing_fields:
            msg = f"Fields not found in index mapping: {missing_fields}"
            raise ValueError(msg)

        # build aggregations for specified metadata fields
        aggs = self._build_cardinality_aggregations(index_mapping, normalized_metadata_fields)
        if not aggs:
            return {}

        # build and execute search query
        body = self._build_distinct_values_query_body(filters, aggs)
        result = await self.async_client.search(index=self._index, body=body)

        # extract cardinality values from aggregations
        return self._extract_distinct_counts_from_aggregations(
            result.get("aggregations", {}), index_mapping, normalized_metadata_fields
        )

    def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
        """
        Returns the information about the fields in the index.

        If we populated the index with documents like:

        ```python
        Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1})
        Document(content="Doc 2", meta={"category": "B", "status": "inactive"})
        ```

        This method would return:

        ```python
        {
            'content': {'type': 'text'},
            'category': {'type': 'keyword'},
            'status': {'type': 'keyword'},
            'priority': {'type': 'long'},
        }
        ```

        :returns: The information about the fields in the index.
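
        Usage example (an illustrative sketch; assumes a populated `ElasticsearchDocumentStore` instance named
        `document_store`, and that the listed Elasticsearch numeric type names are the ones present in your mapping):

        ```python
        fields_info = document_store.get_metadata_fields_info()
        numeric_fields = [
            name for name, info in fields_info.items() if info.get("type") in ("long", "integer", "float", "double")
        ]
        ```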
        """
        self._ensure_initialized()

        mapping = self.client.indices.get_mapping(index=self._index)  # type: ignore
        index_mapping = mapping[self._index]["mappings"]["properties"]
        # remove all fields that are not metadata fields
        index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS}
        return index_mapping

    async def get_metadata_fields_info_async(self) -> dict[str, dict[str, str]]:
        """
        Asynchronously returns the information about the fields in the index.

        If we populated the index with documents like:

        ```python
        Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1})
        Document(content="Doc 2", meta={"category": "B", "status": "inactive"})
        ```

        This method would return:

        ```python
        {
            'content': {'type': 'text'},
            'category': {'type': 'keyword'},
            'status': {'type': 'keyword'},
            'priority': {'type': 'long'},
        }
        ```

        :returns: The information about the fields in the index.
        """
        self._ensure_initialized()

        mapping = await self.async_client.indices.get_mapping(index=self._index)
        index_mapping = mapping[self._index]["mappings"]["properties"]
        # remove all fields that are not metadata fields
        index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS}
        return index_mapping

    @staticmethod
    def _build_min_max_query_body(field_name: str) -> dict[str, Any]:
        """
        Builds the query body for getting min and max values using stats aggregation.
        """
        return {
            "query": {"match_all": {}},
            "aggs": {
                "field_stats": {
                    "stats": {
                        "field": field_name,
                    }
                }
            },
            "size": 0,  # We only need aggregations, not documents
        }

    @staticmethod
    def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, int | None]:
        """
        Extracts min and max values from stats aggregation results.
        """
        min_value = stats.get("min")
        max_value = stats.get("max")
        return {"min": min_value, "max": max_value}

    def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, int | None]:
        """
        Returns the minimum and maximum values for the given metadata field.

        :param metadata_field: The metadata field to get the minimum and maximum values for.
        :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
            metadata field across all documents.
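
        Usage example (an illustrative sketch; assumes a populated `ElasticsearchDocumentStore` instance named
        `document_store` with a numeric `priority` metadata field):

        ```python
        bounds = document_store.get_metadata_field_min_max("meta.priority")
        # bounds might look like {"min": 1, "max": 5}
        ```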
        """
        self._ensure_initialized()

        field_name = self._normalize_metadata_field_name(metadata_field)
        body = self._build_min_max_query_body(field_name)
        result = self.client.search(index=self._index, body=body)
        stats = result.get("aggregations", {}).get("field_stats", {})

        return self._extract_min_max_from_stats(stats)

    async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, int | None]:
        """
        Asynchronously returns the minimum and maximum values for the given metadata field.

        :param metadata_field: The metadata field to get the minimum and maximum values for.
        :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
            metadata field across all documents.
        """
        self._ensure_initialized()

        field_name = self._normalize_metadata_field_name(metadata_field)
        body = self._build_min_max_query_body(field_name)
        result = await self.async_client.search(index=self._index, body=body)
        stats = result.get("aggregations", {}).get("field_stats", {})

        return self._extract_min_max_from_stats(stats)

    def get_metadata_field_unique_values(
        self,
        metadata_field: str,
        search_term: str | None = None,
        size: int | None = 10000,
        after: dict[str, Any] | None = None,
    ) -> tuple[list[str], dict[str, Any] | None]:
        """
        Returns unique values for a metadata field, optionally filtered by a search term in the content.
        Uses composite aggregations for proper pagination beyond 10k results.

        See: https://www.elastic.co/docs/reference/aggregations/search-aggregations-bucket-composite-aggregation

        :param metadata_field: The metadata field to get unique values for.
        :param search_term: Optional search term to filter documents by matching in the content field.
        :param size: The number of unique values to return per page. Defaults to 10000.
        :param after: Optional pagination key from the previous response. Use None for the first page.
            For subsequent pages, pass the `after_key` from the previous response.
        :returns: A tuple containing (list of unique values, after_key for pagination).
            The after_key is None when there are no more results. Use it in the `after` parameter
            for the next page.
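
        Pagination example (an illustrative sketch; assumes a populated `ElasticsearchDocumentStore` instance
        named `document_store` with a keyword-mapped `category` metadata field):

        ```python
        all_values: list[str] = []
        after_key = None
        while True:
            page, after_key = document_store.get_metadata_field_unique_values(
                "meta.category", search_term="elasticsearch", size=500, after=after_key
            )
            all_values.extend(page)
            if after_key is None:
                break
        ```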
        """
        self._ensure_initialized()

        field_name = self._normalize_metadata_field_name(metadata_field)

        # filter by search_term if provided
        query: dict[str, Any] = {"match_all": {}}
        if search_term:
            # Use match_phrase for exact phrase matching to avoid tokenization issues
            query = {"match_phrase": {"content": search_term}}

        # Build composite aggregation for proper pagination
        composite_agg: dict[str, Any] = {
            "size": size,
            "sources": [{field_name: {"terms": {"field": field_name}}}],
        }
        if after is not None:
            composite_agg["after"] = after

        body = {
            "query": query,
            "aggs": {
                "unique_values": {
                    "composite": composite_agg,
                }
            },
            "size": 0,  # we only need aggregations, not documents
        }

        result = self.client.search(index=self._index, body=body)
        aggregations = result.get("aggregations", {})

        # Extract unique values from composite aggregation buckets
        unique_values_agg = aggregations.get("unique_values", {})
        unique_values_buckets = unique_values_agg.get("buckets", [])
        unique_values = [str(bucket["key"][field_name]) for bucket in unique_values_buckets]

        # Extract after_key for pagination
        # If we got fewer results than requested, we've reached the end
        after_key = unique_values_agg.get("after_key")
        if after_key is not None and size is not None and len(unique_values_buckets) < size:
            after_key = None

        return unique_values, after_key

    async def get_metadata_field_unique_values_async(
        self,
        metadata_field: str,
        search_term: str | None = None,
        size: int | None = 10000,
        after: dict[str, Any] | None = None,
    ) -> tuple[list[str], dict[str, Any] | None]:
        """
        Asynchronously returns unique values for a metadata field, optionally filtered by a search term in the content.
        Uses composite aggregations for proper pagination beyond 10k results.

        See: https://www.elastic.co/docs/reference/aggregations/search-aggregations-bucket-composite-aggregation

        :param metadata_field: The metadata field to get unique values for.
        :param search_term: Optional search term to filter documents by matching in the content field.
        :param size: The number of unique values to return per page. Defaults to 10000.
        :param after: Optional pagination key from the previous response. Use None for the first page.
            For subsequent pages, pass the `after_key` from the previous response.
        :returns: A tuple containing (list of unique values, after_key for pagination).
            The after_key is None when there are no more results. Use it in the `after` parameter
            for the next page.
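
        Pagination example (an illustrative sketch; assumes an `ElasticsearchDocumentStore` instance named
        `document_store` configured with an async-capable client):

        ```python
        import asyncio

        async def collect_unique_values() -> list[str]:
            values: list[str] = []
            after_key = None
            while True:
                page, after_key = await document_store.get_metadata_field_unique_values_async(
                    "meta.category", size=500, after=after_key
                )
                values.extend(page)
                if after_key is None:
                    break
            return values

        unique_categories = asyncio.run(collect_unique_values())
        ```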
        """
        self._ensure_initialized()

        field_name = self._normalize_metadata_field_name(metadata_field)

        # filter by search_term if provided
        query: dict[str, Any] = {"match_all": {}}
        if search_term:
            # Use match_phrase for exact phrase matching to avoid tokenization issues
            query = {"match_phrase": {"content": search_term}}

        # Build composite aggregation for proper pagination
        composite_agg: dict[str, Any] = {
            "size": size,
            "sources": [{field_name: {"terms": {"field": field_name}}}],
        }
        if after is not None:
            composite_agg["after"] = after

        body = {
            "query": query,
            "aggs": {
                "unique_values": {
                    "composite": composite_agg,
                }
            },
            "size": 0,  # we only need aggregations, not documents
        }

        result = await self.async_client.search(index=self._index, body=body)
        aggregations = result.get("aggregations", {})

        # Extract unique values from composite aggregation buckets
        unique_values_agg = aggregations.get("unique_values", {})
        unique_values_buckets = unique_values_agg.get("buckets", [])
        unique_values = [str(bucket["key"][field_name]) for bucket in unique_values_buckets]

        # Extract after_key for pagination
        # If we got fewer results than requested, we've reached the end
        after_key = unique_values_agg.get("after_key")
        if after_key is not None and size is not None and len(unique_values_buckets) < size:
            after_key = None

        return unique_values, after_key