elasticsearch_haystack-5.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1477 @@
1
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # ruff: noqa: FBT002, FBT001 boolean-type-hint-positional-argument and boolean-default-value-positional-argument
6
+ # ruff: noqa: B008 function-call-in-default-argument
7
+ # ruff: noqa: S101 disable checks for uses of the assert keyword
8
+
9
+
10
+ from collections.abc import Mapping
11
+ from typing import Any, Literal
12
+
13
+ import numpy as np
14
+ from elastic_transport import NodeConfig
15
+ from haystack import default_from_dict, default_to_dict, logging
16
+ from haystack.dataclasses import Document
17
+ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
18
+ from haystack.document_stores.types import DuplicatePolicy
19
+ from haystack.utils import Secret, deserialize_secrets_inplace
20
+ from haystack.version import __version__ as haystack_version
21
+
22
+ from elasticsearch import AsyncElasticsearch, Elasticsearch, helpers
23
+
24
+ from .filters import _normalize_filters
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ Hosts = str | list[str | Mapping[str, str | int] | NodeConfig]
30
+
31
+ # document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to
32
+ # True. Scaling uses the expit function (inverse of the logit function) after applying a scaling factor
33
+ # (e.g., BM25_SCALING_FACTOR for the bm25_retrieval method).
34
+ # Larger scaling factor decreases scaled scores. For example, an input of 10 is scaled to 0.99 with
35
+ # BM25_SCALING_FACTOR=2 but to 0.78 with BM25_SCALING_FACTOR=8 (default). The defaults were chosen empirically.
36
+ # Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly
37
+ # all be mapped to scores ~1.
38
+ BM25_SCALING_FACTOR = 8
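As a quick check of the numbers in the comment above, here is the scaling computation in isolation (a standalone sketch, not part of the package):

```python
import numpy as np

def scale(score: float, factor: int) -> float:
    # expit(score / factor) = 1 / (1 + e^-(score / factor))
    return float(1 / (1 + np.exp(-score / factor)))

print(round(scale(10, 2), 2))  # 0.99
print(round(scale(10, 8), 2))  # 0.78 with the default BM25_SCALING_FACTOR of 8
```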
39
+ DOC_ALREADY_EXISTS = 409
40
+
41
+ UPDATE_SCRIPT = """
42
+ for (entry in params.entrySet()) {
43
+ ctx._source[entry.getKey()] = entry.getValue();
44
+ }
45
+ """
46
+
47
+ SPECIAL_FIELDS = {"content", "embedding", "id", "score", "sparse_embedding", "blob"}
48
+
49
+
50
+ class ElasticsearchDocumentStore:
51
+ """
52
+ An ElasticsearchDocumentStore instance that works with Elastic Cloud or your own
53
+ Elasticsearch cluster.
54
+
55
+ Usage example (Elastic Cloud):
56
+ ```python
57
+ from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
58
+ document_store = ElasticsearchDocumentStore(
59
+ api_key_id=Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
60
+ api_key=Secret.from_env_var("ELASTIC_API_KEY", strict=False),
61
+ )
62
+ ```
63
+
64
+ Usage example (self-hosted Elasticsearch instance):
65
+ ```python
66
+ from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
67
+ document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
68
+ ```
69
+     In the example above, we connect with security disabled just to show basic usage.
70
+     We strongly recommend enabling security so that only authorized users can access your data.
71
+
72
+ For more details on how to connect to Elasticsearch and configure security,
73
+ see the official Elasticsearch
74
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
75
+
76
+ All extra keyword arguments will be passed to the Elasticsearch client.
77
+ """
78
+
79
+ def __init__(
80
+ self,
81
+ *,
82
+ hosts: Hosts | None = None,
83
+ custom_mapping: dict[str, Any] | None = None,
84
+ index: str = "default",
85
+ api_key: Secret = Secret.from_env_var("ELASTIC_API_KEY", strict=False),
86
+ api_key_id: Secret = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
87
+ embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
88
+ **kwargs: Any,
89
+ ):
90
+ """
91
+ Creates a new ElasticsearchDocumentStore instance.
92
+
93
+         It will also try to create the given index if it doesn't exist yet. Otherwise, it will use the existing one.
94
+
95
+         One can also set the similarity function used to compare Document embeddings. This is mostly useful
96
+ when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.
97
+
98
+ For more information on connection parameters, see the official Elasticsearch
99
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html)
100
+
101
+ For the full list of supported kwargs, see the official Elasticsearch
102
+ [reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
103
+
104
+ Authentication is provided via Secret objects, which by default are loaded from environment variables.
105
+ You can either provide both `api_key_id` and `api_key`, or just `api_key` containing a base64-encoded string
106
+ of `id:secret`. Secret instances can also be loaded from a token using the `Secret.from_token()` method.
107
+
108
+         :param hosts: List of hosts running Elasticsearch.
109
+ :param custom_mapping: Custom mapping for the index. If not provided, a default mapping will be used.
110
+ :param index: Name of index in Elasticsearch.
111
+         :param api_key: A Secret object containing either the API key, or a base64-encoded string of the
112
+             concatenated id and secret (separated by ":"), used for authenticating with Elasticsearch.
113
+ :param api_key_id: A Secret object containing the API key ID for authenticating with Elasticsearch.
114
+         :param embedding_similarity_function: The similarity function used to compare Document embeddings.
115
+ This parameter only takes effect if the index does not yet exist and is created.
116
+ To choose the most appropriate function, look for information about your embedding model.
117
+ To understand how document scores are computed, see the Elasticsearch
118
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
119
+ :param **kwargs: Optional arguments that `Elasticsearch` takes.
120
+ """
121
+ self._hosts = hosts
122
+ self._client: Elasticsearch | None = None
123
+ self._async_client: AsyncElasticsearch | None = None
124
+ self._index = index
125
+ self._api_key = api_key
126
+ self._api_key_id = api_key_id
127
+ self._embedding_similarity_function = embedding_similarity_function
128
+ self._custom_mapping = custom_mapping
129
+ self._kwargs = kwargs
130
+ self._initialized = False
131
+
132
+ if self._custom_mapping and not isinstance(self._custom_mapping, dict):
133
+ msg = "custom_mapping must be a dictionary"
134
+ raise ValueError(msg)
135
+
136
+ if not self._custom_mapping:
137
+ self._default_mappings = {
138
+ "properties": {
139
+ "embedding": {
140
+ "type": "dense_vector",
141
+ "index": True,
142
+ "similarity": self._embedding_similarity_function,
143
+ },
144
+ "content": {"type": "text"},
145
+ },
146
+ "dynamic_templates": [
147
+ {
148
+ "strings": {
149
+ "path_match": "*",
150
+ "match_mapping_type": "string",
151
+ "mapping": {
152
+ "type": "keyword",
153
+ },
154
+ }
155
+ }
156
+ ],
157
+ }
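For illustration, a custom mapping can be passed instead of the default one built above; this is only a sketch, and the extra `category` keyword field is an assumption, not something the package requires:

```python
custom_mapping = {
    "properties": {
        "embedding": {"type": "dense_vector", "index": True, "similarity": "cosine"},
        "content": {"type": "text"},
        "category": {"type": "keyword"},  # hypothetical metadata field
    }
}

document_store = ElasticsearchDocumentStore(
    hosts="http://localhost:9200",
    index="my_index",
    custom_mapping=custom_mapping,
)
```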
158
+
159
+ def _ensure_initialized(self):
160
+ """
161
+ Ensures both sync and async clients are initialized and the index exists.
162
+ """
163
+ if not self._initialized:
164
+ headers = self._kwargs.pop("headers", {})
165
+ headers["user-agent"] = f"haystack-py-ds/{haystack_version}"
166
+
167
+ api_key = self._handle_auth()
168
+
169
+ # Initialize both sync and async clients
170
+ self._client = Elasticsearch(
171
+ self._hosts,
172
+ api_key=api_key,
173
+ headers=headers,
174
+ **self._kwargs,
175
+ )
176
+ self._async_client = AsyncElasticsearch(
177
+ self._hosts,
178
+ api_key=api_key,
179
+ headers=headers,
180
+ **self._kwargs,
181
+ )
182
+
183
+ # Check client connection, this will raise if not connected
184
+ self._client.info()
185
+
186
+ if self._custom_mapping:
187
+ mappings = self._custom_mapping
188
+ else:
189
+ # Configure mapping for the embedding field if none is provided
190
+ mappings = self._default_mappings
191
+
192
+ # Create the index if it doesn't exist
193
+ if not self._client.indices.exists(index=self._index):
194
+ self._client.indices.create(index=self._index, mappings=mappings)
195
+
196
+ self._initialized = True
197
+
198
+ def _handle_auth(self) -> str | tuple[str, str] | None:
199
+ """
200
+ Handles authentication for the Elasticsearch client.
201
+
202
+ There are three possible scenarios.
203
+
204
+ 1) Authentication with both api_key and api_key_id, either as Secrets or as environment variables. In this case,
205
+ use both for authentication.
206
+
207
+ 2) Authentication with only api_key, either as a Secret or as an environment variable. In this case, the api_key
208
+ must be a base64-encoded string that encodes both id and secret <id:secret>.
209
+
210
+         3) No authentication: neither api_key nor api_key_id is provided as a Secret or defined as an
211
+         environment variable. In this case, the client connects without authentication.
212
+
213
+ :returns:
214
+             The resolved API key: a single string, an (api_key_id, api_key) tuple, or None if no authentication is used.
215
+
216
+ """
217
+
218
+ api_key: str | tuple[str, str] | None # make the type checker happy
219
+
220
+ api_key_resolved = self._api_key.resolve_value()
221
+ api_key_id_resolved = self._api_key_id.resolve_value()
222
+
223
+ # Scenario 1: both are found, use them
224
+ if api_key_id_resolved and api_key_resolved:
225
+ api_key = (api_key_id_resolved, api_key_resolved)
226
+ return api_key
227
+
228
+         # Scenario 2: only api_key is set, must be a base64-encoded string that encodes id and secret (separated by ":")
229
+ elif api_key_resolved and not api_key_id_resolved:
230
+ return api_key_resolved
231
+
232
+ # Error: only api_key_id is found, raise an error
233
+ elif api_key_id_resolved and not api_key_resolved:
234
+ msg = "api_key_id is provided but api_key is missing."
235
+ raise ValueError(msg)
236
+
237
+ else:
238
+ # Scenario 3: neither found, no authentication
239
+ return None
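As a sketch of scenario 2, an `id:secret` pair can be base64-encoded and passed as a single `api_key` token (the values and host below are placeholders):

```python
import base64

from haystack.utils import Secret

encoded = base64.b64encode(b"my_api_key_id:my_api_key_secret").decode("utf-8")

document_store = ElasticsearchDocumentStore(
    hosts="https://my-deployment.es.example.com:443",
    api_key=Secret.from_token(encoded),
)
```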
240
+
241
+ @property
242
+ def client(self) -> Elasticsearch:
243
+ """
244
+ Returns the synchronous Elasticsearch client, initializing it if necessary.
245
+ """
246
+ self._ensure_initialized()
247
+ assert self._client is not None
248
+ return self._client
249
+
250
+ @property
251
+ def async_client(self) -> AsyncElasticsearch:
252
+ """
253
+ Returns the asynchronous Elasticsearch client, initializing it if necessary.
254
+ """
255
+ self._ensure_initialized()
256
+ assert self._async_client is not None
257
+ return self._async_client
258
+
259
+ def to_dict(self) -> dict[str, Any]:
260
+ """
261
+ Serializes the component to a dictionary.
262
+
263
+ :returns:
264
+ Dictionary with serialized data.
265
+ """
266
+ # This is not the best solution to serialise this class but is the fastest to implement.
267
+ # Not all kwargs types can be serialised to text so this can fail. We must serialise each
268
+ # type explicitly to handle this properly.
269
+ return default_to_dict(
270
+ self,
271
+ hosts=self._hosts,
272
+ custom_mapping=self._custom_mapping,
273
+ index=self._index,
274
+ api_key=self._api_key.to_dict(),
275
+ api_key_id=self._api_key_id.to_dict(),
276
+ embedding_similarity_function=self._embedding_similarity_function,
277
+ **self._kwargs,
278
+ )
279
+
280
+ @classmethod
281
+ def from_dict(cls, data: dict[str, Any]) -> "ElasticsearchDocumentStore":
282
+ """
283
+ Deserializes the component from a dictionary.
284
+
285
+ :param data:
286
+ Dictionary to deserialize from.
287
+ :returns:
288
+ Deserialized component.
289
+ """
290
+ deserialize_secrets_inplace(data, keys=["api_key", "api_key_id"])
291
+ return default_from_dict(cls, data)
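A minimal serialization round trip, assuming any referenced environment variables are also set in the process that deserializes:

```python
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="my_index")

data = document_store.to_dict()  # secrets are serialized by reference (e.g. env var names), not by value
restored = ElasticsearchDocumentStore.from_dict(data)
```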
292
+
293
+ def count_documents(self) -> int:
294
+ """
295
+ Returns how many documents are present in the document store.
296
+
297
+ :returns:
298
+ Number of documents in the document store.
299
+ """
300
+ self._ensure_initialized()
301
+ return self.client.count(index=self._index)["count"]
302
+
303
+ async def count_documents_async(self) -> int:
304
+ """
305
+ Asynchronously returns how many documents are present in the document store.
306
+ :returns: Number of documents in the document store.
307
+ """
308
+ self._ensure_initialized()
309
+ result = await self._async_client.count(index=self._index) # type: ignore
310
+ return result["count"]
311
+
312
+ def _search_documents(self, **kwargs: Any) -> list[Document]:
313
+ """
314
+ Calls the Elasticsearch client's search method and handles pagination.
315
+ """
316
+ top_k = kwargs.get("size")
317
+ if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
318
+ top_k = kwargs["knn"]["k"]
319
+
320
+ documents: list[Document] = []
321
+ from_ = 0
322
+ # Handle pagination
323
+ while True:
324
+ res = self.client.search(
325
+ index=self._index,
326
+ from_=from_,
327
+ **kwargs,
328
+ )
329
+
330
+ documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"])
331
+ from_ = len(documents)
332
+
333
+ if top_k is not None and from_ >= top_k:
334
+ break
335
+ if from_ >= res["hits"]["total"]["value"]:
336
+ break
337
+ return documents
338
+
339
+ async def _search_documents_async(self, **kwargs: Any) -> list[Document]:
340
+ """
341
+ Asynchronously calls the Elasticsearch client's search method and handles pagination.
342
+ """
343
+ top_k = kwargs.get("size")
344
+ if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
345
+ top_k = kwargs["knn"]["k"]
346
+
347
+ documents: list[Document] = []
348
+ from_ = 0
349
+
350
+ # handle pagination
351
+ while True:
352
+ res = await self._async_client.search(index=self._index, from_=from_, **kwargs) # type: ignore
353
+ documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) # type: ignore
354
+ from_ = len(documents)
355
+
356
+ if top_k is not None and from_ >= top_k:
357
+ break
358
+
359
+ if from_ >= res["hits"]["total"]["value"]:
360
+ break
361
+
362
+ return documents
363
+
364
+ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
365
+ """
366
+ The main query method for the document store. It retrieves all documents that match the filters.
367
+
368
+ :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
369
+ see the official Elasticsearch
370
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
371
+ :returns: List of `Document`s that match the filters.
372
+ """
373
+ if filters and "operator" not in filters and "conditions" not in filters:
374
+ msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
375
+ raise ValueError(msg)
376
+
377
+ self._ensure_initialized()
378
+ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
379
+ documents = self._search_documents(query=query)
380
+ return documents
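The expected filter shape is Haystack's metadata-filtering syntax, i.e. either a single comparison or a logical group with `operator` and `conditions`; the field names below are illustrative:

```python
filters = {
    "operator": "AND",
    "conditions": [
        {"field": "meta.category", "operator": "==", "value": "news"},
        {"field": "meta.rating", "operator": ">=", "value": 3},
    ],
}

docs = document_store.filter_documents(filters=filters)
```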
381
+
382
+ async def filter_documents_async(self, filters: dict[str, Any] | None = None) -> list[Document]:
383
+ """
384
+ Asynchronously retrieves all documents that match the filters.
385
+
386
+ :param filters: A dictionary of filters to apply. For more information on the structure of the filters,
387
+ see the official Elasticsearch
388
+ [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
389
+ :returns: List of `Document`s that match the filters.
390
+ """
391
+ if filters and "operator" not in filters and "conditions" not in filters:
392
+ msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
393
+ raise ValueError(msg)
394
+
395
+ self._ensure_initialized()
396
+ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None
397
+ documents = await self._search_documents_async(query=query)
398
+ return documents
399
+
400
+ @staticmethod
401
+ def _deserialize_document(hit: dict[str, Any]) -> Document:
402
+ """
403
+ Creates a `Document` from the search hit provided.
404
+ This is mostly useful in self.filter_documents().
405
+ :param hit: A search hit from Elasticsearch.
406
+ :returns: `Document` created from the search hit.
407
+ """
408
+ data = hit["_source"]
409
+
410
+ if "highlight" in hit:
411
+ data["metadata"]["highlighted"] = hit["highlight"]
412
+ data["score"] = hit["_score"]
413
+
414
+ return Document.from_dict(data)
415
+
416
+ def write_documents(
417
+ self,
418
+ documents: list[Document],
419
+ policy: DuplicatePolicy = DuplicatePolicy.NONE,
420
+ refresh: Literal["wait_for", True, False] = "wait_for",
421
+ ) -> int:
422
+ """
423
+ Writes `Document`s to Elasticsearch.
424
+
425
+ :param documents: List of Documents to write to the document store.
426
+ :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
427
+ :param refresh: Controls when changes are made visible to search operations.
428
+ - `True`: Force refresh immediately after the operation.
429
+ - `False`: Do not refresh (better performance for bulk operations).
430
+ - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
431
+ For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
432
+ :raises ValueError: If `documents` is not a list of `Document`s.
433
+ :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
434
+ `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
435
+ :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
436
+ :returns: Number of documents written to the document store.
437
+ """
438
+ if len(documents) > 0:
439
+ if not isinstance(documents[0], Document):
440
+ msg = "param 'documents' must contain a list of objects of type Document"
441
+ raise ValueError(msg)
442
+
443
+ if policy == DuplicatePolicy.NONE:
444
+ policy = DuplicatePolicy.FAIL
445
+
446
+ action = "index" if policy == DuplicatePolicy.OVERWRITE else "create"
447
+
448
+ elasticsearch_actions = []
449
+ for doc in documents:
450
+ doc_dict = doc.to_dict()
451
+
452
+ if "sparse_embedding" in doc_dict:
453
+ sparse_embedding = doc_dict.pop("sparse_embedding", None)
454
+ if sparse_embedding:
455
+ logger.warning(
456
+                         "Document {doc_id} has the `sparse_embedding` field set, "
457
+                         "but storing sparse embeddings in Elasticsearch is not currently supported. "
458
+ "The `sparse_embedding` field will be ignored.",
459
+ doc_id=doc.id,
460
+ )
461
+ elasticsearch_actions.append(
462
+ {
463
+ "_op_type": action,
464
+ "_id": doc.id,
465
+ "_source": doc_dict,
466
+ }
467
+ )
468
+
469
+ documents_written, errors = helpers.bulk(
470
+ client=self.client,
471
+ actions=elasticsearch_actions,
472
+ refresh=refresh,
473
+ index=self._index,
474
+ raise_on_error=False,
475
+ stats_only=False,
476
+ )
477
+
478
+ if errors:
479
+ # with stats_only=False, errors is guaranteed to be a list of dicts
480
+ assert isinstance(errors, list)
481
+ duplicate_errors_ids = []
482
+ other_errors = []
483
+ for e in errors:
484
+ error_type = e["create"]["error"]["type"]
485
+ if policy == DuplicatePolicy.FAIL and error_type == "version_conflict_engine_exception":
486
+ duplicate_errors_ids.append(e["create"]["_id"])
487
+ elif policy == DuplicatePolicy.SKIP and error_type == "version_conflict_engine_exception":
488
+ # when the policy is skip, duplication errors are OK and we should not raise an exception
489
+ continue
490
+ else:
491
+ other_errors.append(e)
492
+
493
+ if len(duplicate_errors_ids) > 0:
494
+ msg = f"IDs '{', '.join(duplicate_errors_ids)}' already exist in the document store."
495
+ raise DuplicateDocumentError(msg)
496
+
497
+ if len(other_errors) > 0:
498
+ msg = f"Failed to write documents to Elasticsearch. Errors:\n{other_errors}"
499
+ raise DocumentStoreError(msg)
500
+
501
+ return documents_written
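A short usage sketch of the write path, showing the duplicate policy and the `refresh` trade-off:

```python
from haystack.dataclasses import Document
from haystack.document_stores.types import DuplicatePolicy

docs = [Document(content="first document"), Document(content="second document")]

# Overwrite documents that share an ID; skip the refresh for faster bulk ingestion.
written = document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE, refresh=False)
```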
502
+
503
+ async def write_documents_async(
504
+ self,
505
+ documents: list[Document],
506
+ policy: DuplicatePolicy = DuplicatePolicy.NONE,
507
+ refresh: Literal["wait_for", True, False] = "wait_for",
508
+ ) -> int:
509
+ """
510
+ Asynchronously writes `Document`s to Elasticsearch.
511
+
512
+ :param documents: List of Documents to write to the document store.
513
+ :param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
514
+ :param refresh: Controls when changes are made visible to search operations.
515
+ - `True`: Force refresh immediately after the operation.
516
+ - `False`: Do not refresh (better performance for bulk operations).
517
+ - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
518
+ For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
519
+ :raises ValueError: If `documents` is not a list of `Document`s.
520
+ :raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
521
+ `policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
522
+ :raises DocumentStoreError: If an error occurs while writing the documents to the document store.
523
+ :returns: Number of documents written to the document store.
524
+ """
525
+ self._ensure_initialized()
526
+
527
+ if len(documents) > 0:
528
+ if not isinstance(documents[0], Document):
529
+ msg = "param 'documents' must contain a list of objects of type Document"
530
+ raise ValueError(msg)
531
+
532
+ if policy == DuplicatePolicy.NONE:
533
+ policy = DuplicatePolicy.FAIL
534
+
535
+ actions = []
536
+ for doc in documents:
537
+ doc_dict = doc.to_dict()
538
+
539
+ if "sparse_embedding" in doc_dict:
540
+ sparse_embedding = doc_dict.pop("sparse_embedding", None)
541
+ if sparse_embedding:
542
+ logger.warning(
543
+                         "Document {doc_id} has the `sparse_embedding` field set, "
544
+                         "but storing sparse embeddings in Elasticsearch is not currently supported. "
545
+ "The `sparse_embedding` field will be ignored.",
546
+ doc_id=doc.id,
547
+ )
548
+
549
+ action = {
550
+ "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index",
551
+ "_id": doc.id,
552
+ "_source": doc_dict,
553
+ }
554
+ actions.append(action)
555
+
556
+ try:
557
+ success, failed = await helpers.async_bulk(
558
+ client=self.async_client,
559
+ actions=actions,
560
+ index=self._index,
561
+ refresh=refresh,
562
+ raise_on_error=False,
563
+ stats_only=False,
564
+ )
565
+ if failed:
566
+ # with stats_only=False, failed is guaranteed to be a list of dicts
567
+ assert isinstance(failed, list)
568
+ if policy == DuplicatePolicy.FAIL:
569
+ for error in failed:
570
+ if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
571
+ msg = f"ID '{error['create']['_id']}' already exists in the document store"
572
+ raise DuplicateDocumentError(msg)
573
+ msg = f"Failed to write documents to Elasticsearch. Errors:\n{failed}"
574
+ raise DocumentStoreError(msg)
575
+ return success
576
+ except Exception as e:
577
+ msg = f"Failed to write documents to Elasticsearch: {e!s}"
578
+ raise DocumentStoreError(msg) from e
579
+
580
+ def delete_documents(self, document_ids: list[str], refresh: Literal["wait_for", True, False] = "wait_for") -> None:
581
+ """
582
+         Deletes all documents whose IDs are in `document_ids` from the document store.
583
+
584
+ :param document_ids: the document ids to delete
585
+ :param refresh: Controls when changes are made visible to search operations.
586
+ - `True`: Force refresh immediately after the operation.
587
+ - `False`: Do not refresh (better performance for bulk operations).
588
+ - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
589
+ For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
590
+ """
591
+ helpers.bulk(
592
+ client=self.client,
593
+ actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
594
+ refresh=refresh,
595
+ index=self._index,
596
+ raise_on_error=False,
597
+ )
598
+
599
+ def _prepare_delete_all_request(self, *, is_async: bool, refresh: bool) -> dict[str, Any]:
600
+ return {
601
+ "index": self._index,
602
+ "body": {"query": {"match_all": {}}}, # Delete all documents
603
+ "wait_for_completion": False if is_async else True, # block until done (set False for async)
604
+ "refresh": refresh,
605
+ }
606
+
607
+ async def delete_documents_async(
608
+ self, document_ids: list[str], refresh: Literal["wait_for", True, False] = "wait_for"
609
+ ) -> None:
610
+ """
611
+         Asynchronously deletes all documents whose IDs are in `document_ids` from the document store.
612
+
613
+ :param document_ids: the document ids to delete
614
+ :param refresh: Controls when changes are made visible to search operations.
615
+ - `True`: Force refresh immediately after the operation.
616
+ - `False`: Do not refresh (better performance for bulk operations).
617
+ - `"wait_for"`: Wait for the next refresh cycle (default, ensures read-your-writes consistency).
618
+ For more details, see the [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
619
+ """
620
+ self._ensure_initialized()
621
+
622
+ try:
623
+ await helpers.async_bulk(
624
+ client=self.async_client,
625
+ actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
626
+ index=self._index,
627
+ refresh=refresh,
628
+ )
629
+ except Exception as e:
630
+ msg = f"Failed to delete documents from Elasticsearch: {e!s}"
631
+ raise DocumentStoreError(msg) from e
632
+
633
+ def delete_all_documents(self, recreate_index: bool = False, refresh: bool = True) -> None:
634
+ """
635
+ Deletes all documents in the document store.
636
+
637
+ A fast way to clear all documents from the document store while preserving any index settings and mappings.
638
+
639
+ :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
640
+ settings. If False, all documents will be deleted using the `delete_by_query` API.
641
+ :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
642
+ completes. If False, no refresh is performed. For more details, see the
643
+ [Elasticsearch delete_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-delete-by-query#operation-delete-by-query-refresh).
644
+ """
645
+ self._ensure_initialized() # _ensure_initialized ensures _client is not None and an index exists
646
+
647
+ if recreate_index:
648
+ # get the current index mappings and settings
649
+ index_name = self._index
650
+ mappings = self._client.indices.get(index=self._index)[index_name]["mappings"] # type: ignore
651
+ settings = self._client.indices.get(index=self._index)[index_name]["settings"] # type: ignore
652
+
653
+ # remove settings that cannot be set during index creation
654
+ settings["index"].pop("uuid", None)
655
+ settings["index"].pop("creation_date", None)
656
+ settings["index"].pop("provided_name", None)
657
+ settings["index"].pop("version", None)
658
+
659
+             # delete index
660
+             self._client.indices.delete(index=self._index)  # type: ignore
661
+
662
+             # recreate with settings and mappings
663
+             self._client.indices.create(index=self._index, settings=settings, mappings=mappings)  # type: ignore
667
+
668
+ else:
669
+ result = self._client.delete_by_query(**self._prepare_delete_all_request(is_async=False, refresh=refresh)) # type: ignore
670
+ logger.info(
671
+ "Deleted all the {n_docs} documents from the index '{index}'.",
672
+ index=self._index,
673
+ n_docs=result["deleted"],
674
+ )
675
+
676
+ async def delete_all_documents_async(self, recreate_index: bool = False, refresh: bool = True) -> None:
677
+ """
678
+ Asynchronously deletes all documents in the document store.
679
+
680
+ A fast way to clear all documents from the document store while preserving any index settings and mappings.
681
+ :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
682
+ settings. If False, all documents will be deleted using the `delete_by_query` API.
683
+ :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
684
+ completes. If False, no refresh is performed. For more details, see the
685
+ [Elasticsearch delete_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-delete-by-query#operation-delete-by-query-refresh).
686
+ """
687
+ self._ensure_initialized() # ensures _async_client is not None
688
+
689
+ try:
690
+ if recreate_index:
691
+ # get the current index mappings and settings
692
+ index_name = self._index
693
+ index_info = await self._async_client.indices.get(index=self._index) # type: ignore
694
+ mappings = index_info[index_name]["mappings"]
695
+ settings = index_info[index_name]["settings"]
696
+
697
+ # remove settings that cannot be set during index creation
698
+ settings["index"].pop("uuid", None)
699
+ settings["index"].pop("creation_date", None)
700
+ settings["index"].pop("provided_name", None)
701
+ settings["index"].pop("version", None)
702
+
703
+ # delete index
704
+ await self._async_client.indices.delete(index=self._index) # type: ignore
705
+
706
+ # recreate with settings and mappings
707
+ await self._async_client.indices.create(index=self._index, settings=settings, mappings=mappings) # type: ignore
708
+
709
+ else:
710
+ # use delete_by_query for more efficient deletion without index recreation
711
+ # For async, we need to wait for completion to get the deleted count
712
+ delete_request = self._prepare_delete_all_request(is_async=True, refresh=refresh)
713
+ delete_request["wait_for_completion"] = True # Override to wait for completion in async
714
+ result = await self._async_client.delete_by_query(**delete_request) # type: ignore
715
+ logger.info(
716
+ "Deleted all the {n_docs} documents from the index '{index}'.",
717
+ index=self._index,
718
+ n_docs=result["deleted"],
719
+ )
720
+
721
+ except Exception as e:
722
+ msg = f"Failed to delete all documents from Elasticsearch: {e!s}"
723
+ raise DocumentStoreError(msg) from e
724
+
725
+ def delete_by_filter(self, filters: dict[str, Any], refresh: bool = False) -> int:
726
+ """
727
+ Deletes all documents that match the provided filters.
728
+
729
+ :param filters: The filters to apply to select documents for deletion.
730
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
731
+ :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
732
+ completes. If False, no refresh is performed. For more details, see the
733
+ [Elasticsearch delete_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-delete-by-query#operation-delete-by-query-refresh).
734
+ :returns: The number of documents deleted.
735
+ """
736
+ self._ensure_initialized()
737
+
738
+ try:
739
+ normalized_filters = _normalize_filters(filters)
740
+ body = {"query": {"bool": {"filter": normalized_filters}}}
741
+ result = self.client.delete_by_query(index=self._index, body=body, refresh=refresh) # type: ignore
742
+ deleted_count = result.get("deleted", 0)
743
+ logger.info(
744
+ "Deleted {n_docs} documents from index '{index}' using filters.",
745
+ n_docs=deleted_count,
746
+ index=self._index,
747
+ )
748
+ return deleted_count
749
+ except Exception as e:
750
+ msg = f"Failed to delete documents by filter from Elasticsearch: {e!s}"
751
+ raise DocumentStoreError(msg) from e
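For example, deleting everything that matches a metadata condition (the field and value are illustrative):

```python
deleted = document_store.delete_by_filter(
    filters={"field": "meta.status", "operator": "==", "value": "archived"},
    refresh=True,
)
print(f"Deleted {deleted} documents")
```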
752
+
753
+ async def delete_by_filter_async(self, filters: dict[str, Any], refresh: bool = False) -> int:
754
+ """
755
+ Asynchronously deletes all documents that match the provided filters.
756
+
757
+ :param filters: The filters to apply to select documents for deletion.
758
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
759
+ :param refresh: If True, Elasticsearch refreshes all shards involved in the delete by query after the request
760
+ completes. If False, no refresh is performed. For more details, see the
761
+ [Elasticsearch refresh documentation](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/refresh-parameter).
762
+ :returns: The number of documents deleted.
763
+ """
764
+ self._ensure_initialized()
765
+
766
+ try:
767
+ normalized_filters = _normalize_filters(filters)
768
+ body = {"query": {"bool": {"filter": normalized_filters}}}
769
+ result = await self.async_client.delete_by_query(index=self._index, body=body, refresh=refresh) # type: ignore
770
+ deleted_count = result.get("deleted", 0)
771
+ logger.info(
772
+ "Deleted {n_docs} documents from index '{index}' using filters.",
773
+ n_docs=deleted_count,
774
+ index=self._index,
775
+ )
776
+ return deleted_count
777
+ except Exception as e:
778
+ msg = f"Failed to delete documents by filter from Elasticsearch: {e!s}"
779
+ raise DocumentStoreError(msg) from e
780
+
781
+ def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any], refresh: bool = False) -> int:
782
+ """
783
+ Updates the metadata of all documents that match the provided filters.
784
+
785
+ :param filters: The filters to apply to select documents for updating.
786
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
787
+ :param meta: The metadata fields to update.
788
+ :param refresh: If True, Elasticsearch refreshes all shards involved in the update by query after the request
789
+ completes. If False, no refresh is performed. For more details, see the
790
+ [Elasticsearch update_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-update-by-query#operation-update-by-query-refresh).
791
+ :returns: The number of documents updated.
792
+ """
793
+ self._ensure_initialized()
794
+
795
+ try:
796
+ normalized_filters = _normalize_filters(filters)
797
+ # Build the update script to modify metadata fields
798
+ # Documents are stored with flattened metadata, so update fields directly in ctx._source
799
+ body = {
800
+ "query": {"bool": {"filter": normalized_filters}},
801
+ "script": {"source": UPDATE_SCRIPT, "params": meta, "lang": "painless"},
802
+ }
803
+
804
+ result = self.client.update_by_query(index=self._index, body=body, refresh=refresh) # type: ignore
805
+ updated_count = result.get("updated", 0)
806
+ logger.info(
807
+ "Updated {n_docs} documents in index '{index}' using filters.",
808
+ n_docs=updated_count,
809
+ index=self._index,
810
+ )
811
+ return updated_count
812
+ except Exception as e:
813
+ msg = f"Failed to update documents by filter in Elasticsearch: {e!s}"
814
+ raise DocumentStoreError(msg) from e
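This is where the painless `UPDATE_SCRIPT` defined at the top of the module is used: each key in `meta` becomes a `params` entry and is written into `ctx._source` of every matching document. A usage sketch with illustrative values:

```python
updated = document_store.update_by_filter(
    filters={"field": "meta.category", "operator": "==", "value": "news"},
    meta={"status": "published", "reviewed": True},
    refresh=True,
)
```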
815
+
816
+ async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any], refresh: bool = False) -> int:
817
+ """
818
+ Asynchronously updates the metadata of all documents that match the provided filters.
819
+
820
+ :param filters: The filters to apply to select documents for updating.
821
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
822
+ :param meta: The metadata fields to update.
823
+ :param refresh: If True, Elasticsearch refreshes all shards involved in the update by query after the request
824
+ completes. If False, no refresh is performed. For more details, see the
825
+ [Elasticsearch update_by_query refresh documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-update-by-query#operation-update-by-query-refresh).
826
+ :returns: The number of documents updated.
827
+ """
828
+ self._ensure_initialized()
829
+
830
+ try:
831
+ normalized_filters = _normalize_filters(filters)
832
+ # Build the update script to modify metadata fields
833
+ # Documents are stored with flattened metadata, so update fields directly in ctx._source
834
+ body = {
835
+ "query": {"bool": {"filter": normalized_filters}},
836
+ "script": {"source": UPDATE_SCRIPT, "params": meta, "lang": "painless"},
837
+ }
838
+
839
+ result = await self.async_client.update_by_query(index=self._index, body=body, refresh=refresh) # type: ignore
840
+ updated_count = result.get("updated", 0)
841
+ logger.info(
842
+ "Updated {n_docs} documents in index '{index}' using filters.",
843
+ n_docs=updated_count,
844
+ index=self._index,
845
+ )
846
+ return updated_count
847
+ except Exception as e:
848
+ msg = f"Failed to update documents by filter in Elasticsearch: {e!s}"
849
+ raise DocumentStoreError(msg) from e
850
+
851
+ def _bm25_retrieval(
852
+ self,
853
+ query: str,
854
+ *,
855
+ filters: dict[str, Any] | None = None,
856
+ fuzziness: str = "AUTO",
857
+ top_k: int = 10,
858
+ scale_score: bool = False,
859
+ ) -> list[Document]:
860
+ """
861
+ Retrieves documents using BM25 retrieval.
862
+
863
+ :param query: The query string to search for
864
+ :param filters: Optional filters to narrow down the search space
865
+ :param fuzziness: Fuzziness parameter for the search query
866
+ :param top_k: Maximum number of documents to return
867
+ :param scale_score: Whether to scale the similarity score to the range [0,1]
868
+ :returns: List of Documents that match the query
869
+         :raises ValueError: If `query` is an empty string
870
+ """
871
+ if not query:
872
+             msg = "query must be a non-empty string"
873
+ raise ValueError(msg)
874
+
875
+ body: dict[str, Any] = {
876
+ "size": top_k,
877
+ "query": {
878
+ "bool": {
879
+ "must": [
880
+ {
881
+ "multi_match": {
882
+ "query": query,
883
+ "fuzziness": fuzziness,
884
+ "type": "most_fields",
885
+ "operator": "OR",
886
+ }
887
+ }
888
+ ]
889
+ }
890
+ },
891
+ }
892
+
893
+ if filters:
894
+ body["query"]["bool"]["filter"] = _normalize_filters(filters)
895
+
896
+ documents = self._search_documents(**body)
897
+
898
+ if scale_score:
899
+ for doc in documents:
900
+ if doc.score is None:
901
+ continue
902
+ doc.score = float(1 / (1 + np.exp(-np.asarray(doc.score / BM25_SCALING_FACTOR))))
903
+
904
+ return documents
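This private method is typically driven by the package's BM25 retriever component rather than called directly; still, as a rough sketch of its behavior, including score scaling:

```python
docs = document_store._bm25_retrieval(
    "what is the capital of France?",
    top_k=5,
    scale_score=True,  # maps unbounded BM25 scores into (0, 1) via expit(score / BM25_SCALING_FACTOR)
)
for doc in docs:
    print(doc.score, doc.content)
```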
905
+
906
+ async def _bm25_retrieval_async(
907
+ self,
908
+ query: str,
909
+ *,
910
+ filters: dict[str, Any] | None = None,
911
+ fuzziness: str = "AUTO",
912
+ top_k: int = 10,
913
+ scale_score: bool = False,
914
+ ) -> list[Document]:
915
+ """
916
+ Asynchronously retrieves documents using BM25 retrieval.
917
+
918
+ :param query: The query string to search for
919
+ :param filters: Optional filters to narrow down the search space
920
+ :param fuzziness: Fuzziness parameter for the search query
921
+ :param top_k: Maximum number of documents to return
922
+ :param scale_score: Whether to scale the similarity score to the range [0,1]
923
+ :returns: List of Documents that match the query
924
+         :raises ValueError: If `query` is an empty string
925
+ """
926
+ self._ensure_initialized()
927
+
928
+ if not query:
929
+             msg = "query must be a non-empty string"
930
+ raise ValueError(msg)
931
+
932
+ # Prepare the search body
933
+ search_body = {
934
+ "size": top_k,
935
+ "query": {
936
+ "bool": {
937
+ "must": [
938
+ {
939
+ "multi_match": {
940
+ "query": query,
941
+ "type": "most_fields",
942
+ "operator": "OR",
943
+ "fuzziness": fuzziness,
944
+ }
945
+ }
946
+ ]
947
+ }
948
+ },
949
+ }
950
+
951
+ if filters:
952
+ search_body["query"]["bool"]["filter"] = _normalize_filters(filters) # type:ignore
953
+
954
+ documents = await self._search_documents_async(**search_body)
955
+
956
+ if scale_score:
957
+ for doc in documents:
958
+ if doc.score is not None:
959
+ doc.score = float(1 / (1 + np.exp(-(doc.score / float(BM25_SCALING_FACTOR)))))
960
+
961
+ return documents
962
+
963
+ def _embedding_retrieval(
964
+ self,
965
+ query_embedding: list[float],
966
+ *,
967
+ filters: dict[str, Any] | None = None,
968
+ top_k: int = 10,
969
+ num_candidates: int | None = None,
970
+ ) -> list[Document]:
971
+ """
972
+ Retrieves documents using dense vector similarity search.
973
+
974
+ :param query_embedding: Embedding vector to search for
975
+ :param filters: Optional filters to narrow down the search space
976
+ :param top_k: Maximum number of documents to return
977
+ :param num_candidates: Number of candidates to consider in the search
978
+ :returns: List of Documents most similar to query_embedding
979
+ :raises ValueError: If query_embedding is empty
980
+ """
981
+ if not query_embedding:
982
+ msg = "query_embedding must be a non-empty list of floats"
983
+ raise ValueError(msg)
984
+
985
+ if not num_candidates:
986
+ num_candidates = top_k * 10
987
+
988
+ body: dict[str, Any] = {
989
+ "knn": {
990
+ "field": "embedding",
991
+ "query_vector": query_embedding,
992
+ "k": top_k,
993
+ "num_candidates": num_candidates,
994
+ },
995
+ }
996
+
997
+ if filters:
998
+ body["knn"]["filter"] = _normalize_filters(filters)
999
+
1000
+ docs = self._search_documents(**body)
1001
+ return docs
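The dense counterpart builds an Elasticsearch `knn` search. A direct-call sketch; the placeholder vector is an assumption and must match the dimensionality of the stored embeddings:

```python
query_embedding = [0.1] * 384  # placeholder; in practice, use the same embedder that produced the stored vectors

docs = document_store._embedding_retrieval(
    query_embedding,
    top_k=5,
    num_candidates=100,  # defaults to top_k * 10 when not given
)
```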
1002
+
1003
+ async def _embedding_retrieval_async(
1004
+ self,
1005
+ query_embedding: list[float],
1006
+ *,
1007
+ filters: dict[str, Any] | None = None,
1008
+ top_k: int = 10,
1009
+ num_candidates: int | None = None,
1010
+ ) -> list[Document]:
1011
+ """
1012
+ Asynchronously retrieves documents using dense vector similarity search.
1013
+
1014
+ :param query_embedding: Embedding vector to search for
1015
+ :param filters: Optional filters to narrow down the search space
1016
+ :param top_k: Maximum number of documents to return
1017
+ :param num_candidates: Number of candidates to consider in the search
1018
+ :returns: List of Documents most similar to query_embedding
1019
+ :raises ValueError: If query_embedding is empty
1020
+ """
1021
+ self._ensure_initialized()
1022
+
1023
+ if not query_embedding:
1024
+ msg = "query_embedding must be a non-empty list of floats"
1025
+ raise ValueError(msg)
1026
+
1027
+ # If num_candidates is not set, use top_k * 10 as default
1028
+ if num_candidates is None:
1029
+ num_candidates = top_k * 10
1030
+
1031
+ # Prepare the search body
1032
+ search_body = {
1033
+ "knn": {
1034
+ "field": "embedding",
1035
+ "query_vector": query_embedding,
1036
+ "k": top_k,
1037
+ "num_candidates": num_candidates,
1038
+ },
1039
+ }
1040
+
1041
+ if filters:
1042
+ search_body["knn"]["filter"] = _normalize_filters(filters)
1043
+
1044
+ return await self._search_documents_async(**search_body)
1045
+
1046
+ def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
1047
+ """
1048
+ Returns the number of documents that match the provided filters.
1049
+
1050
+ :param filters: The filters to apply to count documents.
1051
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
1052
+ :returns: The number of documents that match the filters.
1053
+ """
1054
+ self._ensure_initialized()
1055
+
1056
+ normalized_filters = _normalize_filters(filters)
1057
+ body = {"query": {"bool": {"filter": normalized_filters}}}
1058
+ return self.client.count(index=self._index, body=body)["count"]
1059
+
1060
+ async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int:
1061
+ """
1062
+ Asynchronously returns the number of documents that match the provided filters.
1063
+
1064
+ :param filters: The filters to apply to count documents.
1065
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
1066
+ :returns: The number of documents that match the filters.
1067
+ """
1068
+ self._ensure_initialized()
1069
+
1070
+ normalized_filters = _normalize_filters(filters)
1071
+ body = {"query": {"bool": {"filter": normalized_filters}}}
1072
+ result = await self.async_client.count(index=self._index, body=body)
1073
+ return result["count"]
1074
+
1075
+ @staticmethod
1076
+ def _normalize_metadata_field_name(metadata_field: str) -> str:
1077
+ """
1078
+ Normalizes a metadata field name by removing the "meta." prefix if present.
1079
+ """
1080
+ return metadata_field[5:] if metadata_field.startswith("meta.") else metadata_field
1081
+
1082
+ @staticmethod
1083
+ def _build_cardinality_aggregations(index_mapping: dict[str, Any], fields: list[str]) -> dict[str, Any]:
1084
+ """
1085
+ Builds cardinality aggregations for specified metadata fields in the index mapping.
1086
+
1087
+ :param index_mapping: The index mapping containing field definitions.
1088
+ :param fields: List of field names to build aggregations for.
1089
+ :returns: Dictionary of cardinality aggregations.
1090
+
1091
+ See: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html
1092
+ """
1093
+ aggs = {}
1094
+ for field_name in fields:
1095
+ if field_name not in SPECIAL_FIELDS and field_name in index_mapping:
1096
+ aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}}
1097
+ return aggs
1098
+
1099
+ @staticmethod
1100
+ def _build_distinct_values_query_body(filters: dict[str, Any] | None, aggs: dict[str, Any]) -> dict[str, Any]:
1101
+ """
1102
+ Builds the query body for distinct values counting with filters and aggregations.
1103
+ """
1104
+ if filters:
1105
+ normalized_filters = _normalize_filters(filters)
1106
+ return {
1107
+ "query": {"bool": {"filter": normalized_filters}},
1108
+ "aggs": aggs,
1109
+ "size": 0, # we only need aggregations, not documents
1110
+ }
1111
+ else:
1112
+ return {
1113
+ "query": {"match_all": {}},
1114
+ "aggs": aggs,
1115
+ "size": 0, # we only need aggregations, not documents
1116
+ }
1117
+
1118
+ @staticmethod
1119
+ def _extract_distinct_counts_from_aggregations(
1120
+ aggregations: dict[str, Any], index_mapping: dict[str, Any], fields: list[str]
1121
+ ) -> dict[str, int]:
1122
+ """
1123
+ Extracts distinct value counts from search result aggregations.
1124
+
1125
+ :param aggregations: The aggregations result from the search query.
1126
+ :param index_mapping: The index mapping containing field definitions.
1127
+ :param fields: List of field names to extract counts for.
1128
+ :returns: Dictionary mapping field names to their distinct value counts.
1129
+ """
1130
+ distinct_counts = {}
1131
+ for field_name in fields:
1132
+ if field_name not in SPECIAL_FIELDS and field_name in index_mapping:
1133
+ agg_key = f"{field_name}_cardinality"
1134
+ if agg_key in aggregations:
1135
+ distinct_counts[field_name] = aggregations[agg_key]["value"]
1136
+ return distinct_counts
1137
+
1138
+ def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
1139
+ """
1140
+ Returns the number of unique values for each specified metadata field of the documents
1141
+ that match the provided filters.
1142
+
1143
+ :param filters: The filters to apply to count documents.
1144
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
1145
+ :param metadata_fields: List of field names to calculate unique values for.
1146
+ Field names can include or omit the "meta." prefix.
1147
+ :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
1148
+ documents.
1149
+ :raises ValueError: If any of the requested fields don't exist in the index mapping.
1150
+ """
1151
+ self._ensure_initialized()
1152
+
1153
+ # use index mapping to get all fields
1154
+ mapping = self.client.indices.get_mapping(index=self._index)
1155
+ index_mapping = mapping[self._index]["mappings"]["properties"]
1156
+
1157
+ # normalize field names, e.g: remove "meta." prefix if present
1158
+ normalized_metadata_fields = [self._normalize_metadata_field_name(field) for field in metadata_fields]
1159
+
1160
+ # validate that all requested fields exist in the index mapping
1161
+ missing_fields = [f for f in normalized_metadata_fields if f not in index_mapping]
1162
+ if missing_fields:
1163
+ msg = f"Fields not found in index mapping: {missing_fields}"
1164
+ raise ValueError(msg)
1165
+
1166
+ # build aggregations for specified metadata fields
1167
+ aggs = self._build_cardinality_aggregations(index_mapping, normalized_metadata_fields)
1168
+ if not aggs:
1169
+ return {}
1170
+
1171
+ # build and execute search query
1172
+ body = self._build_distinct_values_query_body(filters, aggs)
1173
+ result = self.client.search(index=self._index, body=body)
1174
+
1175
+ # extract cardinality values from aggregations
1176
+ return self._extract_distinct_counts_from_aggregations(
1177
+ result.get("aggregations", {}), index_mapping, normalized_metadata_fields
1178
+ )
1179
+
1180
+ async def count_unique_metadata_by_filter_async(
1181
+ self, filters: dict[str, Any], metadata_fields: list[str]
1182
+ ) -> dict[str, int]:
1183
+ """
1184
+ Asynchronously returns the number of unique values for each specified metadata field of the documents
1185
+ that match the provided filters.
1186
+
1187
+ :param filters: The filters to apply to count documents.
1188
+ For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
1189
+ :param metadata_fields: List of field names to calculate unique values for.
1190
+ Field names can include or omit the "meta." prefix.
1191
+ :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered
1192
+ documents.
1193
+ :raises ValueError: If any of the requested fields don't exist in the index mapping.
1194
+ """
1195
+ self._ensure_initialized()
1196
+
1197
+ # use index mapping to get all fields
1198
+ mapping = await self.async_client.indices.get_mapping(index=self._index)
1199
+ index_mapping = mapping[self._index]["mappings"]["properties"]
1200
+
1201
+ # normalize field names
1202
+ normalized_metadata_fields = [self._normalize_metadata_field_name(field) for field in metadata_fields]
1203
+ # validate that all requested fields exist in the index mapping
1204
+ missing_fields = [f for f in normalized_metadata_fields if f not in index_mapping]
1205
+ if missing_fields:
1206
+ msg = f"Fields not found in index mapping: {missing_fields}"
1207
+ raise ValueError(msg)
1208
+
1209
+ # build aggregations for specified metadata fields
1210
+ aggs = self._build_cardinality_aggregations(index_mapping, normalized_metadata_fields)
1211
+ if not aggs:
1212
+ return {}
1213
+
1214
+ # build and execute search query
1215
+ body = self._build_distinct_values_query_body(filters, aggs)
1216
+ result = await self.async_client.search(index=self._index, body=body)
1217
+
1218
+ # extract cardinality values from aggregations
1219
+ return self._extract_distinct_counts_from_aggregations(
1220
+ result.get("aggregations", {}), index_mapping, normalized_metadata_fields
1221
+ )
1222
+
1223
+ def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
1224
+ """
1225
+ Returns the information about the fields in the index.
1226
+
1227
+ If we populated the index with documents like:
1228
+
1229
+ ```python
1230
+ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1})
1231
+ Document(content="Doc 2", meta={"category": "B", "status": "inactive"})
1232
+ ```
1233
+
1234
+ This method would return:
1235
+
1236
+ ```python
1237
+ {
1238
+ 'content': {'type': 'text'},
1239
+ 'category': {'type': 'keyword'},
1240
+ 'status': {'type': 'keyword'},
1241
+ 'priority': {'type': 'long'},
1242
+ }
1243
+ ```
1244
+
1245
+ :returns: The information about the fields in the index.
1246
+ """
1247
+ self._ensure_initialized()
1248
+
1249
+ mapping = self.client.indices.get_mapping(index=self._index) # type: ignore
1250
+ index_mapping = mapping[self._index]["mappings"]["properties"]
1251
+ # remove all fields that are not metadata fields
1252
+ index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS}
1253
+ return index_mapping
1254
+
1255
+ async def get_metadata_fields_info_async(self) -> dict[str, dict[str, str]]:
1256
+ """
1257
+ Asynchronously returns the information about the fields in the index.
1258
+
1259
+ If we populated the index with documents like:
1260
+
1261
+ ```python
1262
+ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1})
1263
+ Document(content="Doc 2", meta={"category": "B", "status": "inactive"})
1264
+ ```
1265
+
1266
+ This method would return:
1267
+
1268
+ ```python
1269
+ {
1270
+ 'content': {'type': 'text'},
1271
+ 'category': {'type': 'keyword'},
1272
+ 'status': {'type': 'keyword'},
1273
+ 'priority': {'type': 'long'},
1274
+ }
1275
+ ```
1276
+
1277
+ :returns: The information about the fields in the index.
1278
+ """
1279
+ self._ensure_initialized()
1280
+
1281
+ mapping = await self.async_client.indices.get_mapping(index=self._index)
1282
+ index_mapping = mapping[self._index]["mappings"]["properties"]
1283
+ # remove all fields that are not metadata fields
1284
+ index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS}
1285
+ return index_mapping
1286
+
1287
+ @staticmethod
1288
+ def _build_min_max_query_body(field_name: str) -> dict[str, Any]:
1289
+ """
1290
+ Builds the query body for getting min and max values using stats aggregation.
1291
+ """
1292
+ return {
1293
+ "query": {"match_all": {}},
1294
+ "aggs": {
1295
+ "field_stats": {
1296
+ "stats": {
1297
+ "field": field_name,
1298
+ }
1299
+ }
1300
+ },
1301
+ "size": 0, # We only need aggregations, not documents
1302
+ }
1303
+
1304
+ @staticmethod
1305
+ def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, int | None]:
1306
+ """
1307
+ Extracts min and max values from stats aggregation results.
1308
+ """
1309
+ min_value = stats.get("min")
1310
+ max_value = stats.get("max")
1311
+ return {"min": min_value, "max": max_value}
1312
+
1313
+ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, int | None]:
1314
+ """
1315
+ Returns the minimum and maximum values for the given metadata field.
1316
+
1317
+ :param metadata_field: The metadata field to get the minimum and maximum values for.
1318
+ :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
1319
+ metadata field across all documents.
1320
+ """
1321
+ self._ensure_initialized()
1322
+
1323
+ field_name = self._normalize_metadata_field_name(metadata_field)
1324
+ body = self._build_min_max_query_body(field_name)
1325
+ result = self.client.search(index=self._index, body=body)
1326
+ stats = result.get("aggregations", {}).get("field_stats", {})
1327
+
1328
+ return self._extract_min_max_from_stats(stats)
1329
+
1330
+ async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, int | None]:
1331
+ """
1332
+ Asynchronously returns the minimum and maximum values for the given metadata field.
1333
+
1334
+ :param metadata_field: The metadata field to get the minimum and maximum values for.
1335
+ :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the
1336
+ metadata field across all documents.
1337
+ """
1338
+ self._ensure_initialized()
1339
+
1340
+ field_name = self._normalize_metadata_field_name(metadata_field)
1341
+ body = self._build_min_max_query_body(field_name)
1342
+ result = await self.async_client.search(index=self._index, body=body)
1343
+ stats = result.get("aggregations", {}).get("field_stats", {})
1344
+
1345
+ return self._extract_min_max_from_stats(stats)
1346
+
1347
+ def get_metadata_field_unique_values(
1348
+ self,
1349
+ metadata_field: str,
1350
+ search_term: str | None = None,
1351
+ size: int | None = 10000,
1352
+ after: dict[str, Any] | None = None,
1353
+ ) -> tuple[list[str], dict[str, Any] | None]:
1354
+ """
1355
+ Returns unique values for a metadata field, optionally filtered by a search term in the content.
1356
+ Uses composite aggregations for proper pagination beyond 10k results.
1357
+
1358
+ See: https://www.elastic.co/docs/reference/aggregations/search-aggregations-bucket-composite-aggregation
1359
+
1360
+ :param metadata_field: The metadata field to get unique values for.
1361
+ :param search_term: Optional search term to filter documents by matching in the content field.
1362
+ :param size: The number of unique values to return per page. Defaults to 10000.
1363
+ :param after: Optional pagination key from the previous response. Use None for the first page.
1364
+ For subsequent pages, pass the `after_key` from the previous response.
1365
+ :returns: A tuple containing (list of unique values, after_key for pagination).
1366
+ The after_key is None when there are no more results. Use it in the `after` parameter
1367
+ for the next page.
1368
+ """
1369
+ self._ensure_initialized()
1370
+
1371
+ field_name = self._normalize_metadata_field_name(metadata_field)
1372
+
1373
+ # filter by search_term if provided
1374
+ query: dict[str, Any] = {"match_all": {}}
1375
+ if search_term:
1376
+ # Use match_phrase for exact phrase matching to avoid tokenization issues
1377
+ query = {"match_phrase": {"content": search_term}}
1378
+
1379
+ # Build composite aggregation for proper pagination
1380
+ composite_agg: dict[str, Any] = {
1381
+ "size": size,
1382
+ "sources": [{field_name: {"terms": {"field": field_name}}}],
1383
+ }
1384
+ if after is not None:
1385
+ composite_agg["after"] = after
1386
+
1387
+ body = {
1388
+ "query": query,
1389
+ "aggs": {
1390
+ "unique_values": {
1391
+ "composite": composite_agg,
1392
+ }
1393
+ },
1394
+ "size": 0, # we only need aggregations, not documents
1395
+ }
1396
+
1397
+ result = self.client.search(index=self._index, body=body)
1398
+ aggregations = result.get("aggregations", {})
1399
+
1400
+ # Extract unique values from composite aggregation buckets
1401
+ unique_values_agg = aggregations.get("unique_values", {})
1402
+ unique_values_buckets = unique_values_agg.get("buckets", [])
1403
+ unique_values = [str(bucket["key"][field_name]) for bucket in unique_values_buckets]
1404
+
1405
+ # Extract after_key for pagination
1406
+ # If we got fewer results than requested, we've reached the end
1407
+ after_key = unique_values_agg.get("after_key")
1408
+ if after_key is not None and size is not None and len(unique_values_buckets) < size:
1409
+ after_key = None
1410
+
1411
+ return unique_values, after_key
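A sketch of walking through all unique values of a field with the composite aggregation's `after_key`:

```python
all_values: list[str] = []
after = None

while True:
    values, after = document_store.get_metadata_field_unique_values("meta.category", size=1000, after=after)
    all_values.extend(values)
    if after is None:  # no more pages
        break
```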
1412
+
1413
+ async def get_metadata_field_unique_values_async(
1414
+ self,
1415
+ metadata_field: str,
1416
+ search_term: str | None = None,
1417
+ size: int | None = 10000,
1418
+ after: dict[str, Any] | None = None,
1419
+ ) -> tuple[list[str], dict[str, Any] | None]:
1420
+ """
1421
+ Asynchronously returns unique values for a metadata field, optionally filtered by a search term in the content.
1422
+ Uses composite aggregations for proper pagination beyond 10k results.
1423
+
1424
+ See: https://www.elastic.co/docs/reference/aggregations/search-aggregations-bucket-composite-aggregation
1425
+
1426
+ :param metadata_field: The metadata field to get unique values for.
1427
+ :param search_term: Optional search term to filter documents by matching in the content field.
1428
+ :param size: The number of unique values to return per page. Defaults to 10000.
1429
+ :param after: Optional pagination key from the previous response. Use None for the first page.
1430
+ For subsequent pages, pass the `after_key` from the previous response.
1431
+ :returns: A tuple containing (list of unique values, after_key for pagination).
1432
+ The after_key is None when there are no more results. Use it in the `after` parameter
1433
+ for the next page.
1434
+ """
1435
+ self._ensure_initialized()
1436
+
1437
+ field_name = self._normalize_metadata_field_name(metadata_field)
1438
+
1439
+ # filter by search_term if provided
1440
+ query: dict[str, Any] = {"match_all": {}}
1441
+ if search_term:
1442
+ # Use match_phrase for exact phrase matching to avoid tokenization issues
1443
+ query = {"match_phrase": {"content": search_term}}
1444
+
1445
+ # Build composite aggregation for proper pagination
1446
+ composite_agg: dict[str, Any] = {
1447
+ "size": size,
1448
+ "sources": [{field_name: {"terms": {"field": field_name}}}],
1449
+ }
1450
+ if after is not None:
1451
+ composite_agg["after"] = after
1452
+
1453
+ body = {
1454
+ "query": query,
1455
+ "aggs": {
1456
+ "unique_values": {
1457
+ "composite": composite_agg,
1458
+ }
1459
+ },
1460
+ "size": 0, # we only need aggregations, not documents
1461
+ }
1462
+
1463
+ result = await self.async_client.search(index=self._index, body=body)
1464
+ aggregations = result.get("aggregations", {})
1465
+
1466
+ # Extract unique values from composite aggregation buckets
1467
+ unique_values_agg = aggregations.get("unique_values", {})
1468
+ unique_values_buckets = unique_values_agg.get("buckets", [])
1469
+ unique_values = [str(bucket["key"][field_name]) for bucket in unique_values_buckets]
1470
+
1471
+ # Extract after_key for pagination
1472
+ # If we got fewer results than requested, we've reached the end
1473
+ after_key = unique_values_agg.get("after_key")
1474
+ if after_key is not None and size is not None and len(unique_values_buckets) < size:
1475
+ after_key = None
1476
+
1477
+ return unique_values, after_key