elasticsearch-haystack 4.0.0__py3-none-any.whl → 4.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of elasticsearch-haystack might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: elasticsearch-haystack
3
- Version: 4.0.0
3
+ Version: 4.1.0
4
4
  Summary: Haystack 2.x Document Store for ElasticSearch
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -4,9 +4,9 @@ haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py,sha2
4
4
  haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py,sha256=-6eIHW5cU4k8-jAsUsCb15hJRalpkUhzy_dNxr5HUZo,7404
5
5
  haystack_integrations/document_stores/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  haystack_integrations/document_stores/elasticsearch/__init__.py,sha256=YTfu94dtVUBogbJFr1aJrKuaI6-Bw9VuHfPoyU7M8os,207
7
- haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=1KnQdBsD-QdvncxFZ1oaNTA9-vRJ7xXCDaKr5JcTVnE,31062
7
+ haystack_integrations/document_stores/elasticsearch/document_store.py,sha256=erIS-Tk9sdBYdRjTsjY_JmTQRH67Sjgk1BWvFCI4dtw,36023
8
8
  haystack_integrations/document_stores/elasticsearch/filters.py,sha256=Umip-PP4uFjuWeB1JWkKhaKClQ0VpiykoDlDu99wIV0,9759
9
- elasticsearch_haystack-4.0.0.dist-info/METADATA,sha256=Ukr9NBPT0Vwi8WhAA5agHeOQfB3_0_6RcGQDx_yJB2w,2105
10
- elasticsearch_haystack-4.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
- elasticsearch_haystack-4.0.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
12
- elasticsearch_haystack-4.0.0.dist-info/RECORD,,
9
+ elasticsearch_haystack-4.1.0.dist-info/METADATA,sha256=TYYaNrsYzGPCLxuVBxGAXlXoaeFJqgL1VzPGybXAFNA,2105
10
+ elasticsearch_haystack-4.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ elasticsearch_haystack-4.1.0.dist-info/licenses/LICENSE,sha256=_M2kulivnaiTHiW-5CRlZrPmH47tt04pBgAgeDvfYi4,11342
12
+ elasticsearch_haystack-4.1.0.dist-info/RECORD,,
@@ -2,6 +2,11 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # ruff: noqa: FBT001, FBT002 boolean-type-hint-positional-argument and boolean-default-value-positional-argument
6
+ # ruff: noqa: B008 function-call-in-default-argument
7
+ # ruff: noqa: S101 disable checks for uses of the assert keyword
8
+
9
+
5
10
  from collections.abc import Mapping
6
11
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union
7
12
 
@@ -68,8 +73,8 @@ class ElasticsearchDocumentStore:
68
73
  hosts: Optional[Hosts] = None,
69
74
  custom_mapping: Optional[Dict[str, Any]] = None,
70
75
  index: str = "default",
71
- api_key: Secret = Secret.from_env_var("ELASTIC_API_KEY", strict=False), # noqa: B008
72
- api_key_id: Secret = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False), # noqa: B008
76
+ api_key: Secret = Secret.from_env_var("ELASTIC_API_KEY", strict=False),
77
+ api_key_id: Secret = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False),
73
78
  embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine",
74
79
  **kwargs: Any,
75
80
  ):
@@ -119,6 +124,29 @@ class ElasticsearchDocumentStore:
119
124
  msg = "custom_mapping must be a dictionary"
120
125
  raise ValueError(msg)
121
126
 
127
+ if not self._custom_mapping:
128
+ self._default_mappings = {
129
+ "properties": {
130
+ "embedding": {
131
+ "type": "dense_vector",
132
+ "index": True,
133
+ "similarity": self._embedding_similarity_function,
134
+ },
135
+ "content": {"type": "text"},
136
+ },
137
+ "dynamic_templates": [
138
+ {
139
+ "strings": {
140
+ "path_match": "*",
141
+ "match_mapping_type": "string",
142
+ "mapping": {
143
+ "type": "keyword",
144
+ },
145
+ }
146
+ }
147
+ ],
148
+ }
149
+
122
150
  def _ensure_initialized(self):
123
151
  """
124
152
  Ensures both sync and async clients are initialized and the index exists.
@@ -150,27 +178,7 @@ class ElasticsearchDocumentStore:
150
178
  mappings = self._custom_mapping
151
179
  else:
152
180
  # Configure mapping for the embedding field if none is provided
153
- mappings = {
154
- "properties": {
155
- "embedding": {
156
- "type": "dense_vector",
157
- "index": True,
158
- "similarity": self._embedding_similarity_function,
159
- },
160
- "content": {"type": "text"},
161
- },
162
- "dynamic_templates": [
163
- {
164
- "strings": {
165
- "path_match": "*",
166
- "match_mapping_type": "string",
167
- "mapping": {
168
- "type": "keyword",
169
- },
170
- }
171
- }
172
- ],
173
- }
181
+ mappings = self._default_mappings
174
182
 
175
183
  # Create the index if it doesn't exist
176
184
  if not self._client.indices.exists(index=self._index):
@@ -227,7 +235,7 @@ class ElasticsearchDocumentStore:
227
235
  Returns the synchronous Elasticsearch client, initializing it if necessary.
228
236
  """
229
237
  self._ensure_initialized()
230
- assert self._client is not None # noqa: S101
238
+ assert self._client is not None
231
239
  return self._client
232
240
 
233
241
  @property
@@ -236,7 +244,7 @@ class ElasticsearchDocumentStore:
236
244
  Returns the asynchronous Elasticsearch client, initializing it if necessary.
237
245
  """
238
246
  self._ensure_initialized()
239
- assert self._async_client is not None # noqa: S101
247
+ assert self._async_client is not None
240
248
  return self._async_client
241
249
 
242
250
  def to_dict(self) -> Dict[str, Any]:
@@ -450,7 +458,7 @@ class ElasticsearchDocumentStore:
450
458
 
451
459
  if errors:
452
460
  # with stats_only=False, errors is guaranteed to be a list of dicts
453
- assert isinstance(errors, list) # noqa: S101
461
+ assert isinstance(errors, list)
454
462
  duplicate_errors_ids = []
455
463
  other_errors = []
456
464
  for e in errors:
@@ -529,7 +537,7 @@ class ElasticsearchDocumentStore:
529
537
  )
530
538
  if failed:
531
539
  # with stats_only=False, failed is guaranteed to be a list of dicts
532
- assert isinstance(failed, list) # noqa: S101
540
+ assert isinstance(failed, list)
533
541
  if policy == DuplicatePolicy.FAIL:
534
542
  for error in failed:
535
543
  if "create" in error and error["create"]["status"] == DOC_ALREADY_EXISTS:
@@ -556,6 +564,14 @@ class ElasticsearchDocumentStore:
556
564
  raise_on_error=False,
557
565
  )
558
566
 
567
def _prepare_delete_all_request(self, *, is_async: bool) -> Dict[str, Any]:
    """
    Builds the keyword arguments for a `delete_by_query` call that matches every document in the index.

    :param is_async: Whether the request is prepared for the asynchronous client. When True,
        `wait_for_completion` is set to False; otherwise it is set to True.
    :returns: A dictionary holding the `index`, `body`, `wait_for_completion` and `refresh` arguments.
    """
    match_everything = {"query": {"match_all": {}}}
    request: Dict[str, Any] = {"index": self._index, "body": match_everything}
    # Only the synchronous variant asks the server to block until the deletion is done.
    request["wait_for_completion"] = not is_async
    # Refresh so that the deletions are visible immediately.
    request["refresh"] = True
    return request
574
+
559
575
  async def delete_documents_async(self, document_ids: List[str]) -> None:
560
576
  """
561
577
  Asynchronously deletes all documents with a matching document_ids from the document store.
@@ -575,6 +591,92 @@ class ElasticsearchDocumentStore:
575
591
  msg = f"Failed to delete documents from Elasticsearch: {e!s}"
576
592
  raise DocumentStoreError(msg) from e
577
593
 
594
def delete_all_documents(self, recreate_index: bool = False) -> None:
    """
    Deletes all documents in the document store.

    A fast way to clear all documents from the document store while preserving any index settings and mappings.

    :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
        settings. If False, all documents will be deleted using the `delete_by_query` API.
    """
    self._ensure_initialized()  # _ensure_initialized ensures _client is not None and an index exists

    if recreate_index:
        # Fetch the index description once and read both mappings and settings from that single response.
        index_info = self._client.indices.get(index=self._index)[self._index]  # type: ignore
        mappings = index_info["mappings"]
        settings = index_info["settings"]

        # remove settings that cannot be set during index creation
        for read_only_setting in ("uuid", "creation_date", "provided_name", "version"):
            settings["index"].pop(read_only_setting, None)

        # Delete and recreate the index exactly once. Both settings and mappings are passed so the new
        # index keeps the configuration of the one it replaces; a second delete/create carrying only
        # the mappings would silently drop the settings.
        self._client.indices.delete(index=self._index)  # type: ignore
        self._client.indices.create(index=self._index, settings=settings, mappings=mappings)  # type: ignore
    else:
        result = self._client.delete_by_query(**self._prepare_delete_all_request(is_async=False))  # type: ignore
        logger.info(
            "Deleted all the {n_docs} documents from the index '{index}'.",
            index=self._index,
            n_docs=result["deleted"],
        )
+
634
async def delete_all_documents_async(self, recreate_index: bool = False) -> None:
    """
    Asynchronously deletes all documents in the document store.

    A fast way to clear all documents from the document store while preserving any index settings and mappings.

    :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
        settings. If False, all documents will be deleted using the `delete_by_query` API.
    :raises DocumentStoreError: If any step of the deletion raises an exception.
    """
    self._ensure_initialized()  # ensures _async_client is not None

    try:
        if not recreate_index:
            # delete_by_query avoids recreating the index. The prepared async request is overridden to
            # wait for completion so that the deleted count is available for the log message.
            request = {**self._prepare_delete_all_request(is_async=True), "wait_for_completion": True}
            outcome = await self._async_client.delete_by_query(**request)  # type: ignore
            logger.info(
                "Deleted all the {n_docs} documents from the index '{index}'.",
                index=self._index,
                n_docs=outcome["deleted"],
            )
            return

        # Capture the current mappings and settings before the index is dropped.
        target = self._index
        description = (await self._async_client.indices.get(index=target))[target]  # type: ignore
        mappings, settings = description["mappings"], description["settings"]

        # These entries cannot be set during index creation, so strip them first.
        for read_only_setting in ("uuid", "creation_date", "provided_name", "version"):
            settings["index"].pop(read_only_setting, None)

        # Drop the index, then recreate it with the captured settings and mappings.
        await self._async_client.indices.delete(index=target)  # type: ignore
        await self._async_client.indices.create(index=target, settings=settings, mappings=mappings)  # type: ignore

    except Exception as e:
        msg = f"Failed to delete all documents from Elasticsearch: {e!s}"
        raise DocumentStoreError(msg) from e
+
578
680
  def _bm25_retrieval(
579
681
  self,
580
682
  query: str,