howler-api 3.0.0.dev374__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of howler-api has been flagged as potentially problematic.

Files changed (198)
  1. howler/__init__.py +0 -0
  2. howler/actions/__init__.py +168 -0
  3. howler/actions/add_label.py +111 -0
  4. howler/actions/add_to_bundle.py +159 -0
  5. howler/actions/change_field.py +76 -0
  6. howler/actions/demote.py +160 -0
  7. howler/actions/example_plugin.py +104 -0
  8. howler/actions/prioritization.py +93 -0
  9. howler/actions/promote.py +147 -0
  10. howler/actions/remove_from_bundle.py +133 -0
  11. howler/actions/remove_label.py +111 -0
  12. howler/actions/transition.py +200 -0
  13. howler/api/__init__.py +249 -0
  14. howler/api/base.py +88 -0
  15. howler/api/socket.py +114 -0
  16. howler/api/v1/__init__.py +97 -0
  17. howler/api/v1/action.py +372 -0
  18. howler/api/v1/analytic.py +748 -0
  19. howler/api/v1/auth.py +382 -0
  20. howler/api/v1/clue.py +99 -0
  21. howler/api/v1/configs.py +58 -0
  22. howler/api/v1/dossier.py +222 -0
  23. howler/api/v1/help.py +28 -0
  24. howler/api/v1/hit.py +1181 -0
  25. howler/api/v1/notebook.py +82 -0
  26. howler/api/v1/overview.py +191 -0
  27. howler/api/v1/search.py +788 -0
  28. howler/api/v1/template.py +206 -0
  29. howler/api/v1/tool.py +183 -0
  30. howler/api/v1/user.py +416 -0
  31. howler/api/v1/utils/__init__.py +0 -0
  32. howler/api/v1/utils/etag.py +84 -0
  33. howler/api/v1/view.py +288 -0
  34. howler/app.py +235 -0
  35. howler/common/README.md +125 -0
  36. howler/common/__init__.py +0 -0
  37. howler/common/classification.py +979 -0
  38. howler/common/classification.yml +107 -0
  39. howler/common/exceptions.py +167 -0
  40. howler/common/loader.py +154 -0
  41. howler/common/logging/__init__.py +241 -0
  42. howler/common/logging/audit.py +138 -0
  43. howler/common/logging/format.py +38 -0
  44. howler/common/net.py +79 -0
  45. howler/common/net_static.py +1494 -0
  46. howler/common/random_user.py +316 -0
  47. howler/common/swagger.py +117 -0
  48. howler/config.py +64 -0
  49. howler/cronjobs/__init__.py +29 -0
  50. howler/cronjobs/retention.py +61 -0
  51. howler/cronjobs/rules.py +274 -0
  52. howler/cronjobs/view_cleanup.py +88 -0
  53. howler/datastore/README.md +112 -0
  54. howler/datastore/__init__.py +0 -0
  55. howler/datastore/bulk.py +72 -0
  56. howler/datastore/collection.py +2342 -0
  57. howler/datastore/constants.py +119 -0
  58. howler/datastore/exceptions.py +41 -0
  59. howler/datastore/howler_store.py +105 -0
  60. howler/datastore/migrations/fix_process.py +41 -0
  61. howler/datastore/operations.py +130 -0
  62. howler/datastore/schemas.py +90 -0
  63. howler/datastore/store.py +231 -0
  64. howler/datastore/support/__init__.py +0 -0
  65. howler/datastore/support/build.py +215 -0
  66. howler/datastore/support/schemas.py +90 -0
  67. howler/datastore/types.py +22 -0
  68. howler/error.py +91 -0
  69. howler/external/__init__.py +0 -0
  70. howler/external/generate_mitre.py +96 -0
  71. howler/external/generate_sigma_rules.py +31 -0
  72. howler/external/generate_tlds.py +47 -0
  73. howler/external/reindex_data.py +66 -0
  74. howler/external/wipe_databases.py +58 -0
  75. howler/gunicorn_config.py +25 -0
  76. howler/healthz.py +47 -0
  77. howler/helper/__init__.py +0 -0
  78. howler/helper/azure.py +50 -0
  79. howler/helper/discover.py +59 -0
  80. howler/helper/hit.py +236 -0
  81. howler/helper/oauth.py +247 -0
  82. howler/helper/search.py +92 -0
  83. howler/helper/workflow.py +110 -0
  84. howler/helper/ws.py +378 -0
  85. howler/odm/README.md +102 -0
  86. howler/odm/__init__.py +1 -0
  87. howler/odm/base.py +1543 -0
  88. howler/odm/charter.txt +146 -0
  89. howler/odm/helper.py +416 -0
  90. howler/odm/howler_enum.py +25 -0
  91. howler/odm/models/__init__.py +0 -0
  92. howler/odm/models/action.py +33 -0
  93. howler/odm/models/analytic.py +90 -0
  94. howler/odm/models/assemblyline.py +48 -0
  95. howler/odm/models/aws.py +23 -0
  96. howler/odm/models/azure.py +16 -0
  97. howler/odm/models/cbs.py +44 -0
  98. howler/odm/models/config.py +558 -0
  99. howler/odm/models/dossier.py +33 -0
  100. howler/odm/models/ecs/__init__.py +0 -0
  101. howler/odm/models/ecs/agent.py +17 -0
  102. howler/odm/models/ecs/autonomous_system.py +16 -0
  103. howler/odm/models/ecs/client.py +149 -0
  104. howler/odm/models/ecs/cloud.py +141 -0
  105. howler/odm/models/ecs/code_signature.py +27 -0
  106. howler/odm/models/ecs/container.py +32 -0
  107. howler/odm/models/ecs/dns.py +62 -0
  108. howler/odm/models/ecs/egress.py +10 -0
  109. howler/odm/models/ecs/elf.py +74 -0
  110. howler/odm/models/ecs/email.py +122 -0
  111. howler/odm/models/ecs/error.py +14 -0
  112. howler/odm/models/ecs/event.py +140 -0
  113. howler/odm/models/ecs/faas.py +24 -0
  114. howler/odm/models/ecs/file.py +84 -0
  115. howler/odm/models/ecs/geo.py +30 -0
  116. howler/odm/models/ecs/group.py +18 -0
  117. howler/odm/models/ecs/hash.py +16 -0
  118. howler/odm/models/ecs/host.py +17 -0
  119. howler/odm/models/ecs/http.py +37 -0
  120. howler/odm/models/ecs/ingress.py +12 -0
  121. howler/odm/models/ecs/interface.py +21 -0
  122. howler/odm/models/ecs/network.py +30 -0
  123. howler/odm/models/ecs/observer.py +45 -0
  124. howler/odm/models/ecs/organization.py +12 -0
  125. howler/odm/models/ecs/os.py +21 -0
  126. howler/odm/models/ecs/pe.py +17 -0
  127. howler/odm/models/ecs/process.py +216 -0
  128. howler/odm/models/ecs/registry.py +26 -0
  129. howler/odm/models/ecs/related.py +45 -0
  130. howler/odm/models/ecs/rule.py +51 -0
  131. howler/odm/models/ecs/server.py +24 -0
  132. howler/odm/models/ecs/threat.py +247 -0
  133. howler/odm/models/ecs/tls.py +58 -0
  134. howler/odm/models/ecs/url.py +51 -0
  135. howler/odm/models/ecs/user.py +57 -0
  136. howler/odm/models/ecs/user_agent.py +20 -0
  137. howler/odm/models/ecs/vulnerability.py +41 -0
  138. howler/odm/models/gcp.py +16 -0
  139. howler/odm/models/hit.py +356 -0
  140. howler/odm/models/howler_data.py +328 -0
  141. howler/odm/models/lead.py +24 -0
  142. howler/odm/models/localized_label.py +13 -0
  143. howler/odm/models/overview.py +16 -0
  144. howler/odm/models/pivot.py +40 -0
  145. howler/odm/models/template.py +24 -0
  146. howler/odm/models/user.py +83 -0
  147. howler/odm/models/view.py +34 -0
  148. howler/odm/random_data.py +888 -0
  149. howler/odm/randomizer.py +609 -0
  150. howler/patched.py +5 -0
  151. howler/plugins/__init__.py +25 -0
  152. howler/plugins/config.py +123 -0
  153. howler/remote/__init__.py +0 -0
  154. howler/remote/datatypes/README.md +355 -0
  155. howler/remote/datatypes/__init__.py +98 -0
  156. howler/remote/datatypes/counters.py +63 -0
  157. howler/remote/datatypes/events.py +66 -0
  158. howler/remote/datatypes/hash.py +206 -0
  159. howler/remote/datatypes/lock.py +42 -0
  160. howler/remote/datatypes/queues/__init__.py +0 -0
  161. howler/remote/datatypes/queues/comms.py +59 -0
  162. howler/remote/datatypes/queues/multi.py +32 -0
  163. howler/remote/datatypes/queues/named.py +93 -0
  164. howler/remote/datatypes/queues/priority.py +215 -0
  165. howler/remote/datatypes/set.py +118 -0
  166. howler/remote/datatypes/user_quota_tracker.py +54 -0
  167. howler/security/__init__.py +253 -0
  168. howler/security/socket.py +108 -0
  169. howler/security/utils.py +185 -0
  170. howler/services/__init__.py +0 -0
  171. howler/services/action_service.py +111 -0
  172. howler/services/analytic_service.py +128 -0
  173. howler/services/auth_service.py +323 -0
  174. howler/services/config_service.py +128 -0
  175. howler/services/dossier_service.py +252 -0
  176. howler/services/event_service.py +93 -0
  177. howler/services/hit_service.py +893 -0
  178. howler/services/jwt_service.py +158 -0
  179. howler/services/lucene_service.py +286 -0
  180. howler/services/notebook_service.py +119 -0
  181. howler/services/overview_service.py +44 -0
  182. howler/services/template_service.py +45 -0
  183. howler/services/user_service.py +331 -0
  184. howler/utils/__init__.py +0 -0
  185. howler/utils/annotations.py +28 -0
  186. howler/utils/chunk.py +38 -0
  187. howler/utils/dict_utils.py +200 -0
  188. howler/utils/isotime.py +17 -0
  189. howler/utils/list_utils.py +11 -0
  190. howler/utils/lucene.py +77 -0
  191. howler/utils/path.py +27 -0
  192. howler/utils/socket_utils.py +61 -0
  193. howler/utils/str_utils.py +256 -0
  194. howler/utils/uid.py +47 -0
  195. howler_api-3.0.0.dev374.dist-info/METADATA +71 -0
  196. howler_api-3.0.0.dev374.dist-info/RECORD +198 -0
  197. howler_api-3.0.0.dev374.dist-info/WHEEL +4 -0
  198. howler_api-3.0.0.dev374.dist-info/entry_points.txt +8 -0
howler/datastore/collection.py
@@ -0,0 +1,2342 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+ import sys
7
+ import time
8
+ import typing
9
+ import warnings
10
+ from copy import deepcopy
11
+ from datetime import datetime
12
+ from os import environ
13
+ from random import random
14
+ from typing import Any, Dict, Generic, Optional, TypeVar, Union
15
+
16
+ import elasticsearch
17
+ from datemath import dm
18
+ from datemath.helpers import DateMathException
19
+
20
+ from howler import odm
21
+ from howler.common.exceptions import HowlerRuntimeError, HowlerValueError, NonRecoverableError
22
+ from howler.common.loader import APP_NAME
23
+ from howler.common.logging.format import HWL_DATE_FORMAT, HWL_LOG_FORMAT
24
+ from howler.datastore.constants import BACK_MAPPING, TYPE_MAPPING
25
+ from howler.datastore.exceptions import (
26
+ DataStoreException,
27
+ HowlerScanError,
28
+ MultiKeyError,
29
+ SearchException,
30
+ SearchRetryException,
31
+ VersionConflictException,
32
+ )
33
+ from howler.datastore.support.build import build_mapping
34
+ from howler.datastore.support.schemas import (
35
+ default_dynamic_strings,
36
+ default_dynamic_templates,
37
+ default_index,
38
+ default_mapping,
39
+ )
40
+ from howler.odm.base import (
41
+ BANNED_FIELDS,
42
+ IP,
43
+ ClassificationObject,
44
+ Enum,
45
+ Integer,
46
+ Keyword,
47
+ List,
48
+ Mapping,
49
+ Model,
50
+ ValidatedKeyword,
51
+ _Field,
52
+ )
53
+ from howler.utils.dict_utils import prune, recursive_update
54
+
55
+ if typing.TYPE_CHECKING:
56
+ from .store import ESStore
57
+
58
+
59
+ TRANSPORT_TIMEOUT = int(environ.get("HWL_DATASTORE_TRANSPORT_TIMEOUT", "10"))
60
+
61
+ logger = logging.getLogger("howler.api.datastore")
62
+ logger.setLevel(logging.INFO)
63
+ console = logging.StreamHandler()
64
+ console.setLevel(logging.INFO)
65
+ console.setFormatter(logging.Formatter(HWL_LOG_FORMAT, HWL_DATE_FORMAT))
66
+ logger.addHandler(console)
67
+
68
+ ModelType = TypeVar("ModelType", bound=Model)
69
+ write_block_settings = {"index.blocks.write": True}
70
+ write_unblock_settings = {"index.blocks.write": None}
71
+
72
+ # A token value to represent a document not existing. It's a string to match the
73
+ # type used for version values. Any string will do as long as it never matches
74
+ # a real version string.
75
+ CREATE_TOKEN = "create" # noqa: S105
76
+
77
+
78
+ def _strip_lists(model, data):
79
+ """Elasticsearch returns everything as lists, regardless of whether
80
+ we want the field to be multi-valued or not. This method uses the model's
81
+ knowledge of what should or should not have multiple values to fix the data.
82
+ """
83
+ fields = model.fields()
84
+ out = {}
85
+ for key, value in odm.flat_to_nested(data).items():
86
+ doc_type = fields.get(key, fields.get("", model))
87
+ # TODO: While stripping lists we don't care that the field is optional, but we do need to know what
88
+ # type the optional field wraps. The following two lines change the doc_type to the
89
+ # child_type of the field. (Should model.fields() do that for us instead?)
90
+ if isinstance(doc_type, odm.Optional):
91
+ doc_type = doc_type.child_type
92
+
93
+ if isinstance(doc_type, odm.List):
94
+ out[key] = value
95
+ elif isinstance(doc_type, odm.Compound) or isinstance(doc_type, odm.Mapping):
96
+ out[key] = _strip_lists(doc_type.child_type, value)
97
+ elif isinstance(value, list):
98
+ out[key] = value[0]
99
+ else:
100
+ out[key] = value
101
+ return out
102
+
103
+
104
+ def sort_str(sort_dicts):
105
+ if sort_dicts is None:
106
+ return sort_dicts
107
+
108
+ sort_list = [f"{key}:{val}" for d in sort_dicts for key, val in d.items()]
109
+ return ",".join(sort_list)
110
+
111
+
112
+ def parse_sort(sort, ret_list=True):
113
+ """This function tries to do two things at once:
114
+ - convert AL sort syntax to elastic,
115
+ - convert any sorts on the key _id to id
116
+ """
117
+ if sort is None:
118
+ return sort
119
+
120
+ if isinstance(sort, list):
121
+ return [parse_sort(row, ret_list=False) for row in sort]
122
+ elif isinstance(sort, dict):
123
+ return {("id" if key == "_id" else key): value for key, value in sort.items()}
124
+
125
+ parts = sort.split(" ")
126
+ if len(parts) == 1:
127
+ if parts[0] == "_id":
128
+ if ret_list:
129
+ return ["id"]
130
+ return "id"
131
+ if ret_list:
132
+ return [parts[0]]
133
+ return parts[0]
134
+ elif len(parts) == 2:
135
+ if parts[1] not in ["asc", "desc"]:
136
+ raise SearchException("Unknown sort parameter " + sort)
137
+ if parts[0] == "_id":
138
+ if ret_list:
139
+ return [{"id": parts[1]}]
140
+ return {"id": parts[1]}
141
+ if ret_list:
142
+ return [{parts[0]: parts[1]}]
143
+ return {parts[0]: parts[1]}
144
+ raise SearchException("Unknown sort parameter " + sort)
145
+
146
+
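# Illustrative sketch, not from the package source: the conversions parse_sort
# performs, assuming howler-api and its dependencies are installed so that this
# module imports as howler.datastore.collection.
from howler.datastore.collection import parse_sort

assert parse_sort("timestamp desc") == [{"timestamp": "desc"}]
assert parse_sort("_id asc") == [{"id": "asc"}]
assert parse_sort([{"_id": "asc"}, {"timestamp": "desc"}]) == [{"id": "asc"}, {"timestamp": "desc"}]
assert parse_sort("timestamp desc", ret_list=False) == {"timestamp": "desc"}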
147
+ class ESCollection(Generic[ModelType]):
148
+ DEFAULT_OFFSET = 0
149
+ DEFAULT_ROW_SIZE = 25
150
+ DEFAULT_SEARCH_FIELD = "__text__"
151
+ DEFAULT_SORT = [{"_id": "asc"}]
152
+ FIELD_SANITIZER = re.compile("^[a-z][a-z0-9_\\-.]+$")
153
+ MAX_GROUP_LIMIT = 10
154
+ MAX_FACET_LIMIT = 100
155
+ MAX_RETRY_BACKOFF = 10
156
+ MAX_SEARCH_ROWS = 500
157
+ RETRY_NORMAL = 1
158
+ RETRY_NONE = 0
159
+ RETRY_INFINITY = -1
160
+ SCROLL_TIMEOUT = "5m"
161
+ UPDATE_SET = "SET"
162
+ UPDATE_INC = "INC"
163
+ UPDATE_DEC = "DEC"
164
+ UPDATE_MAX = "MAX"
165
+ UPDATE_MIN = "MIN"
166
+ UPDATE_APPEND = "APPEND"
167
+ UPDATE_APPEND_IF_MISSING = "APPEND_IF_MISSING"
168
+ UPDATE_REMOVE = "REMOVE"
169
+ UPDATE_DELETE = "DELETE"
170
+ UPDATE_OPERATIONS = [
171
+ UPDATE_APPEND,
172
+ UPDATE_APPEND_IF_MISSING,
173
+ UPDATE_DEC,
174
+ UPDATE_INC,
175
+ UPDATE_MAX,
176
+ UPDATE_MIN,
177
+ UPDATE_REMOVE,
178
+ UPDATE_SET,
179
+ UPDATE_DELETE,
180
+ ]
181
+ DEFAULT_SEARCH_VALUES: dict[str, typing.Any] = {
182
+ "timeout": None,
183
+ "field_list": None,
184
+ "facet_active": False,
185
+ "facet_mincount": 1,
186
+ "facet_fields": [],
187
+ "stats_active": False,
188
+ "stats_fields": [],
189
+ "field_script": None,
190
+ "filters": [],
191
+ "group_active": False,
192
+ "group_field": None,
193
+ "group_sort": None,
194
+ "group_limit": 1,
195
+ "histogram_active": False,
196
+ "histogram_field": None,
197
+ "histogram_type": None,
198
+ "histogram_gap": None,
199
+ "histogram_mincount": 1,
200
+ "histogram_start": None,
201
+ "histogram_end": None,
202
+ "start": 0,
203
+ "rows": DEFAULT_ROW_SIZE,
204
+ "query": "*",
205
+ "sort": DEFAULT_SORT,
206
+ "df": None,
207
+ "script_fields": [],
208
+ }
209
+ IGNORE_ENSURE_COLLECTION = False
210
+ ENSURE_COLLECTION_WARNED = False
211
+
212
+ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, max_attempts=10):
213
+ self.replicas = int(
214
+ environ.get(
215
+ f"ELASTIC_{name.upper()}_REPLICAS",
216
+ environ.get("ELASTIC_DEFAULT_REPLICAS", 0),
217
+ )
218
+ )
219
+ self.shards = int(environ.get(f"ELASTIC_{name.upper()}_SHARDS", environ.get("ELASTIC_DEFAULT_SHARDS", 1)))
220
+ self._index_list: list[str] = []
221
+
222
+ self.datastore = datastore
223
+ self.name = f"{APP_NAME}-{name}"
224
+ self.index_name = f"{self.name}_hot"
225
+ self.model_class = model_class
226
+ self.validate = validate
227
+ self.max_attempts = max_attempts
228
+
229
+ if not ESCollection.IGNORE_ENSURE_COLLECTION:
230
+ self._ensure_collection()
231
+ elif "pytest" not in sys.modules and not ESCollection.ENSURE_COLLECTION_WARNED:
232
+ logger.warning("Skipping ensure collection! This is dangerous. Waiting five seconds before continuing.")
233
+ time.sleep(5)
234
+ ESCollection.ENSURE_COLLECTION_WARNED = True
235
+
236
+ self.stored_fields = {}
237
+ if model_class:
238
+ for name, field in model_class.flat_fields().items():
239
+ if field.store:
240
+ self.stored_fields[name] = field
241
+
242
+ @property
243
+ def index_list_full(self):
244
+ if not self._index_list:
245
+ self._index_list = list(self.with_retries(self.datastore.client.indices.get, index=f"{self.name}-*").keys())
246
+
247
+ return [self.index_name] + sorted(self._index_list, reverse=True)
248
+
249
+ @property
250
+ def index_list(self):
251
+ """This property contains the list of valid indexes for the current collection.
252
+
253
+ :return: list of valid indexes for this collection
254
+ """
255
+ return [self.index_name]
256
+
257
+ def scan_with_retry(
258
+ self,
259
+ query,
260
+ sort=None,
261
+ source=None,
262
+ index=None,
263
+ scroll="5m",
264
+ size=1000,
265
+ request_timeout=None,
266
+ ):
267
+ if index is None:
268
+ index = self.index_name
269
+
270
+ # initial search
271
+ resp = self.with_retries(
272
+ self.datastore.client.search,
273
+ index=index,
274
+ query=query,
275
+ scroll=scroll,
276
+ size=size,
277
+ request_timeout=request_timeout,
278
+ sort=sort,
279
+ _source=source,
280
+ )
281
+ scroll_id = resp.get("_scroll_id")
282
+
283
+ try:
284
+ while scroll_id and resp["hits"]["hits"]:
285
+ for hit in resp["hits"]["hits"]:
286
+ yield hit
287
+
288
+ # Default to 0 if the value isn't included in the response
289
+ shards_successful = resp["_shards"].get("successful", 0)
290
+ shards_skipped = resp["_shards"].get("skipped", 0)
291
+ shards_total = resp["_shards"].get("total", 0)
292
+
293
+ # check if we have any errors
294
+ if (shards_successful + shards_skipped) < shards_total:
295
+ shards_message = (
296
+ f"{scroll_id}: Scroll request has only succeeded on {shards_successful} "
297
+ f"(+{shards_skipped} skipped) shards out of {shards_total}."
298
+ )
299
+ raise HowlerScanError(shards_message)
300
+ resp = self.with_retries(self.datastore.client.scroll, scroll_id=scroll_id, scroll=scroll)
301
+ scroll_id = resp.get("_scroll_id")
302
+
303
+ finally:
304
+ if scroll_id:
305
+ resp = self.with_retries(
306
+ self.datastore.client.clear_scroll,
307
+ scroll_id=[scroll_id],
308
+ ignore=(404,),
309
+ )
310
+ if not resp.get("succeeded", False):
311
+ logger.warning(
312
+ f"Could not clear scroll ID {scroll_id}, there is potential "
313
+ "memory leak in you Elastic cluster..."
314
+ )
315
+
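# Illustrative sketch, not from the package source: streaming every matching
# document through the scroll wrapper above. `collection` is an assumed
# ESCollection instance backed by a reachable Elasticsearch cluster, and
# "howler.id" is an assumed field name.
for hit in collection.scan_with_retry(
    query={"query_string": {"query": "howler.id:*"}},
    size=500,
):
    print(hit["_id"], hit["_source"])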
316
+ def with_retries(self, func, *args, raise_conflicts=False, **kwargs):
317
+ """This function performs the passed function with the given args and kwargs and reconnect if it fails
318
+
319
+ :return: return the output of the function passed
320
+ """
321
+ retries = 0
322
+ updated = 0
323
+ deleted = 0
324
+
325
+ while True:
326
+ if retries >= self.max_attempts:
327
+ raise HowlerRuntimeError(f"Maximum of {self.max_attempts} retries reached. Aborting ES connection")
328
+
329
+ try:
330
+ ret_val = func(*args, **kwargs)
331
+
332
+ if retries:
333
+ logger.info("Reconnected to elasticsearch!")
334
+
335
+ if updated:
336
+ ret_val["updated"] += updated
337
+
338
+ if deleted:
339
+ ret_val["deleted"] += deleted
340
+
341
+ return ret_val
342
+ except elasticsearch.exceptions.NotFoundError as e:
343
+ if "index_not_found_exception" in str(e):
344
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
345
+ logger.debug("The index does not exist. Trying to recreate it...")
346
+ self._ensure_collection()
347
+ self.datastore.connection_reset()
348
+ retries += 1
349
+ else:
350
+ raise
351
+
352
+ except elasticsearch.exceptions.ConflictError as ce:
353
+ if raise_conflicts:
354
+ # De-sync potential threads trying to write to the index
355
+ time.sleep(random() * 0.1) # noqa: S311
356
+ raise VersionConflictException(str(ce))
357
+ updated += ce.info.get("updated", 0)
358
+ deleted += ce.info.get("deleted", 0)
359
+
360
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
361
+ self.datastore.connection_reset()
362
+ retries += 1
363
+
364
+ except elasticsearch.exceptions.ConnectionTimeout:
365
+ logger.warning(
366
+ f"Elasticsearch connection timeout, server(s): "
367
+ f"{' | '.join(self.datastore.get_hosts(safe=True))}"
368
+ f", retrying {func.__name__}..."
369
+ )
370
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
371
+ self.datastore.connection_reset()
372
+ retries += 1
373
+
374
+ except (
375
+ SearchRetryException,
376
+ elasticsearch.exceptions.ConnectionError,
377
+ elasticsearch.exceptions.AuthenticationException,
378
+ ) as e:
379
+ if not isinstance(e, SearchRetryException):
380
+ logger.warning(
381
+ f"No connection to Elasticsearch server(s): "
382
+ f"{' | '.join(self.datastore.get_hosts(safe=True))}"
383
+ f", because [{e}] retrying {func.__name__}..."
384
+ )
385
+
386
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
387
+ self.datastore.connection_reset()
388
+ retries += 1
389
+
390
+ except elasticsearch.exceptions.TransportError as e:
391
+ err_code, msg, cause = e.args
392
+ if err_code == 503 or err_code == "503":
393
+ logger.warning(f"Looks like index {self.name} is not ready yet, retrying...")
394
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
395
+ self.datastore.connection_reset()
396
+ retries += 1
397
+ elif err_code == 429 or err_code == "429":
398
+ logger.warning(
399
+ "Elasticsearch is too busy to perform the requested " f"task on index {self.name}, retrying..."
400
+ )
401
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
402
+ self.datastore.connection_reset()
403
+ retries += 1
404
+ elif err_code == 403 or err_code == "403":
405
+ logger.warning(
406
+ "Elasticsearch cluster is preventing writing operations " f"on index {self.name}, retrying..."
407
+ )
408
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
409
+ self.datastore.connection_reset()
410
+ retries += 1
411
+
412
+ else:
413
+ raise
414
+
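# Illustrative sketch, not from the package source: the calling convention used
# throughout this class. The Elasticsearch client method is passed uncalled and
# with_retries invokes it with the remaining arguments, reconnecting and
# retrying on transient failures. `collection` is an assumed ESCollection
# instance.
doc = collection.with_retries(
    collection.datastore.client.get,
    index=collection.name,
    id="some-document-id",
)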
415
+ def _get_task_results(self, task):
416
+ # This function is only used to wait for an asynchronous task to finish in a graceful manner without
417
+ # timing out the elastic client. You can create an async task for long-running operations like:
418
+ # - update_by_query
419
+ # - delete_by_query
420
+ # - reindex ...
421
+ attempt = 0
422
+ res = None
423
+ while res is None:
424
+ attempt = attempt + 1
425
+ logger.warning("Checking status of task %s (Attempt %s)", task["task"], attempt)
426
+ try:
427
+ res = self.with_retries(
428
+ self.datastore.client.tasks.get,
429
+ task_id=task["task"],
430
+ wait_for_completion=True,
431
+ timeout="10s",
432
+ )
433
+ except (elasticsearch.exceptions.TransportError, elasticsearch.exceptions.ApiError) as e:
434
+ err_code, msg, _ = e.args
435
+ if (err_code == 500 or err_code == "500") and msg in [
436
+ "timeout_exception",
437
+ "receive_timeout_transport_exception",
438
+ ]:
439
+ pass
440
+ else:
441
+ logger.exception("Unexpected error on task check")
442
+ raise
443
+
444
+ result = res.get("response", res["task"]["status"])
445
+
446
+ logger.info("Task result:\n%s", str(result))
447
+
448
+ return result
449
+
450
+ def _get_current_alias(self, index: str) -> typing.Optional[str]:
451
+ if self.with_retries(self.datastore.client.indices.exists_alias, name=index):
452
+ return next(
453
+ iter(self.with_retries(self.datastore.client.indices.get_alias, index=index)),
454
+ None,
455
+ )
456
+
457
+ return None
458
+
459
+ def _wait_for_status(self, index, min_status="yellow"):
460
+ status_ok = False
461
+ while not status_ok:
462
+ try:
463
+ res = self.datastore.client.cluster.health(index=index, timeout="5s", wait_for_status=min_status)
464
+ status_ok = not res["timed_out"]
465
+ except elasticsearch.exceptions.TransportError as e:
466
+ err_code, _, _ = e.args
467
+ if err_code == 408 or err_code == "408":
468
+ logger.warning(f"Waiting for index {index} to get to status {min_status}...")
469
+ else:
470
+ raise
471
+
472
+ def _safe_index_copy(self, copy_function, src, target, settings=None, min_status="yellow"):
473
+ ret = copy_function(index=src, target=target, settings=settings, request_timeout=60)
474
+ if not ret["acknowledged"]:
475
+ raise DataStoreException(f"Failed to create index {target} from {src}.")
476
+
477
+ self._wait_for_status(target, min_status=min_status)
478
+
479
+ def _delete_async(self, index, query, max_docs=None, sort=None):
480
+ deleted = 0
481
+ while True:
482
+ task = self.with_retries(
483
+ self.datastore.client.delete_by_query,
484
+ index=index,
485
+ query=query,
486
+ wait_for_completion=False,
487
+ conflicts="proceed",
488
+ sort=sort,
489
+ max_docs=max_docs,
490
+ )
491
+ res = self._get_task_results(task)
492
+
493
+ if res["version_conflicts"] == 0:
494
+ res["deleted"] += deleted
495
+ return res
496
+ else:
497
+ deleted += res["deleted"]
498
+
499
+ def _update_async(self, index, script, query, max_docs=None):
500
+ updated = 0
501
+ while True:
502
+ task = self.with_retries(
503
+ self.datastore.client.update_by_query,
504
+ index=index,
505
+ script=script,
506
+ query=query,
507
+ wait_for_completion=False,
508
+ conflicts="proceed",
509
+ max_docs=max_docs,
510
+ )
511
+ res = self._get_task_results(task)
512
+
513
+ if res["version_conflicts"] == 0:
514
+ res["updated"] += updated
515
+ return res
516
+ else:
517
+ updated += res["updated"]
518
+
519
+ def commit(self):
520
+ """This function should be overloaded to perform a commit of the index data of all the different hosts
521
+ specified in self.datastore.hosts.
522
+
523
+ :return: Should return True if the commit was successful on all hosts
524
+ """
525
+ self.with_retries(self.datastore.client.indices.refresh, index=self.index_name)
526
+ self.with_retries(self.datastore.client.indices.clear_cache, index=self.index_name)
527
+ return True
528
+
529
+ def fix_replicas(self):
530
+ """This function should be overloaded to fix the replica configuration of the index of all the different hosts
531
+ specified in self.datastore.hosts.
532
+
533
+ :return: Should return True if the fix was successful on all hosts
534
+ """
535
+ replicas = self._get_index_settings()["index"]["number_of_replicas"]
536
+ settings = {"number_of_replicas": replicas}
537
+ return self.with_retries(self.datastore.client.indices.put_settings, index=self.index_name, settings=settings)[
538
+ "acknowledged"
539
+ ]
540
+
541
+ def fix_shards(self):
542
+ """This function should be overloaded to fix the shard configuration of the index of all the different hosts
543
+ specified in self.datastore.hosts.
544
+
545
+ :return: Should return True if the fix was successful on all hosts
546
+ """
547
+ settings = self._get_index_settings()
548
+ clone_settings = {"index.number_of_replicas": 0}
549
+ clone_finish_settings = None
550
+ clone_setup_settings = None
551
+ method = None
552
+ target_node = ""
553
+ temp_name = f"{self.name}__fix_shards"
554
+
555
+ indexes_settings = self.with_retries(self.datastore.client.indices.get_settings)
556
+ current_settings = indexes_settings.get(self._get_current_alias(self.name), None)
557
+ if not current_settings:
558
+ raise DataStoreException(
559
+ "Could not get current index settings. Something is wrong and requires manual intervention..."
560
+ )
561
+
562
+ cur_replicas = int(current_settings["settings"]["index"]["number_of_replicas"])
563
+ cur_shards = int(current_settings["settings"]["index"]["number_of_shards"])
564
+ target_shards = int(settings["index"]["number_of_shards"])
565
+ clone_finish_settings = {
566
+ "index.number_of_replicas": cur_replicas,
567
+ "index.routing.allocation.require._name": None,
568
+ }
569
+
570
+ if cur_shards > target_shards:
571
+ logger.info(
572
+ f"Current shards ({cur_shards}) is bigger then target shards ({target_shards}), "
573
+ "we will be shrinking the index."
574
+ )
575
+ if cur_shards % target_shards != 0:
576
+ logger.info("The target shards is not a factor of the current shards, aborting...")
577
+ return
578
+ else:
579
+ target_node = self.with_retries(self.datastore.client.cat.nodes, format="json")[0]["name"]
580
+ clone_setup_settings = {
581
+ "index.number_of_replicas": 0,
582
+ "index.routing.allocation.require._name": target_node,
583
+ }
584
+ method = self.datastore.client.indices.shrink
585
+ elif cur_shards < target_shards:
586
+ logger.info(
587
+ f"Current shards ({cur_shards}) is smaller then target shards ({target_shards}), "
588
+ "we will be splitting the index."
589
+ )
590
+ if target_shards % cur_shards != 0:
591
+ logger.warning("The current shards is not a factor of the target shards, aborting...")
592
+ return
593
+ else:
594
+ method = self.datastore.client.indices.split
595
+ else:
596
+ logger.info(
597
+ f"Current shards ({cur_shards}) is equal to the target shards ({target_shards}), "
598
+ "only house keeping operations will be performed."
599
+ )
600
+
601
+ if method:
602
+ # Before we do anything, we should make sure the source index is in a good state
603
+ logger.info(f"Waiting for {self.name.upper()} status to be GREEN.")
604
+ self._wait_for_status(self.name, min_status="green")
605
+
606
+ # Block all indexes to be written to
607
+ logger.info("Set a datastore wide write block on Elastic.")
608
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_block_settings)
609
+
610
+ # Clone it onto a temporary index
611
+ if not self.with_retries(self.datastore.client.indices.exists, index=temp_name):
612
+ # if there are specific settings to be applied to the index, apply them
613
+ if clone_setup_settings:
614
+ logger.info(f"Rellocating index to node {target_node.upper()}.")
615
+ self.with_retries(
616
+ self.datastore.client.indices.put_settings,
617
+ index=self.index_name,
618
+ settings=clone_setup_settings,
619
+ )
620
+
621
+ # Make sure no shards are relocating
622
+ while self.datastore.client.cluster.health(index=self.index_name)["relocating_shards"] != 0:
623
+ time.sleep(1)
624
+
625
+ # Make a clone of the current index
626
+ logger.info(f"Cloning {self.index_name.upper()} into {temp_name.upper()}.")
627
+ self._safe_index_copy(
628
+ self.datastore.client.indices.clone,
629
+ self.index_name,
630
+ temp_name,
631
+ settings=clone_settings,
632
+ min_status="green",
633
+ )
634
+
635
+ # Make 100% sure temporary index is ready
636
+ logger.info(f"Waiting for {temp_name.upper()} status to be GREEN.")
637
+ self._wait_for_status(temp_name, "green")
638
+
639
+ # Make sure temporary index is the alias if not already
640
+ if self._get_current_alias(self.name) != temp_name:
641
+ logger.info(
642
+ f"Make {temp_name.upper()} the current alias for {self.name.upper()} "
643
+ f"and delete {self.index_name.upper()}."
644
+ )
645
+ # Make the hot index the temporary index while deleting the original index
646
+ alias_actions = [
647
+ {"add": {"index": temp_name, "alias": self.name}},
648
+ {"remove_index": {"index": self.index_name}},
649
+ ]
650
+ self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
651
+
652
+ # Make sure the original index is deleted
653
+ if self.with_retries(self.datastore.client.indices.exists, index=self.index_name):
654
+ logger.info(f"Delete extra {self.index_name.upper()} index.")
655
+ self.with_retries(self.datastore.client.indices.delete, index=self.index_name)
656
+
657
+ # Shrink/split the temporary index into the original index
658
+ logger.info(f"Perform shard fix operation from {temp_name.upper()} to {self.index_name.upper()}.")
659
+ self._safe_index_copy(method, temp_name, self.index_name, settings=settings)
660
+
661
+ # Make the original index the new alias
662
+ logger.info(
663
+ f"Make {self.index_name.upper()} the current alias for {self.name.upper()} "
664
+ f"and delete {temp_name.upper()}."
665
+ )
666
+ alias_actions = [
667
+ {"add": {"index": self.index_name, "alias": self.name}},
668
+ {"remove_index": {"index": temp_name}},
669
+ ]
670
+ self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
671
+
672
+ # Restore writes
673
+ logger.debug("Restore datastore wide write operation on Elastic.")
674
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_unblock_settings)
675
+
676
+ # Restore normal routing and replicas
677
+ logger.debug(f"Restore original routing table for {self.name.upper()}.")
678
+ self.with_retries(
679
+ self.datastore.client.indices.put_settings,
680
+ index=self.name,
681
+ settings=clone_finish_settings,
682
+ )
683
+
684
+ def reindex(self):
685
+ """This function should be overloaded to perform a reindex of all the data of the different hosts
686
+ specified in self.datastore.hosts.
687
+
688
+ :return: Should return True if the commit was successful on all hosts
689
+ """
690
+ logger.warning("Beginning Reindex")
691
+ for index in self.index_list:
692
+ new_name = f"{index}__reindex"
693
+ index_data = None
694
+ if self.with_retries(self.datastore.client.indices.exists, index=index) and not self.with_retries(
695
+ self.datastore.client.indices.exists, index=new_name
696
+ ):
697
+ # Get information about the index to reindex
698
+ index_data = self.with_retries(self.datastore.client.indices.get, index=index)[index]
699
+
700
+ # Create reindex target
701
+ logger.warning("Creating new index with name %s", new_name)
702
+ self.with_retries(
703
+ self.datastore.client.indices.create,
704
+ index=new_name,
705
+ mappings=self._get_index_mappings(),
706
+ settings=self._get_index_settings(),
707
+ )
708
+
709
+ # For all aliases related to the index, add a new alias to the reindex index
710
+ for alias, alias_data in index_data["aliases"].items():
711
+ # Make the reindex index the new write index if the original index was
712
+ if alias_data.get("is_write_index", True):
713
+ alias_actions = [
714
+ {
715
+ "add": {
716
+ "index": new_name,
717
+ "alias": alias,
718
+ "is_write_index": True,
719
+ }
720
+ },
721
+ {
722
+ "add": {
723
+ "index": index,
724
+ "alias": alias,
725
+ "is_write_index": False,
726
+ }
727
+ },
728
+ ]
729
+ else:
730
+ alias_actions = [{"add": {"index": new_name, "alias": alias}}]
731
+
732
+ logger.warning("Updating alias %s", alias)
733
+ self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
734
+
735
+ # Reindex data into target
736
+ logger.warning("Beginning reindex from %s to %s", index, new_name)
737
+ r_task = self.with_retries(
738
+ self.datastore.client.reindex,
739
+ source={"index": index},
740
+ dest={"index": new_name},
741
+ wait_for_completion=False,
742
+ )
743
+ logger.warning("Reindex taskId: %s", r_task["task"])
744
+ self._get_task_results(r_task)
745
+
746
+ if self.with_retries(self.datastore.client.indices.exists, index=new_name):
747
+ if index_data is None:
748
+ index_data = self.with_retries(self.datastore.client.indices.get, index=index)[index]
749
+
750
+ logger.warning("Committing reindexed data in index %s", new_name)
751
+ self.with_retries(self.datastore.client.indices.refresh, index=new_name)
752
+ self.with_retries(self.datastore.client.indices.clear_cache, index=new_name)
753
+
754
+ logger.warning("Deleting old index %s", index)
755
+ if self.with_retries(self.datastore.client.indices.exists, index=index):
756
+ self.with_retries(self.datastore.client.indices.delete, index=index)
757
+
758
+ logger.warning("Block write to index")
759
+ self.with_retries(
760
+ self.datastore.client.indices.put_settings,
761
+ settings=write_block_settings,
762
+ )
763
+
764
+ logger.warning("Renaming reindexed index from %s to %s", new_name, index)
765
+ try:
766
+ self._safe_index_copy(
767
+ self.datastore.client.indices.clone,
768
+ new_name,
769
+ index,
770
+ settings=self._get_index_settings(),
771
+ )
772
+
773
+ # Restore original aliases for the index
774
+ for alias, alias_data in index_data["aliases"].items():
775
+ # Make the reindex index the new write index if the original index was
776
+ if alias_data.get("is_write_index", True):
777
+ alias_actions = [
778
+ {
779
+ "add": {
780
+ "index": index,
781
+ "alias": alias,
782
+ "is_write_index": True,
783
+ }
784
+ },
785
+ {"remove_index": {"index": new_name}},
786
+ ]
787
+ self.with_retries(
788
+ self.datastore.client.indices.update_aliases,
789
+ actions=alias_actions,
790
+ )
791
+
792
+ if self.with_retries(self.datastore.client.indices.exists, index=new_name):
793
+ logger.warning("Deleting reindex target %s", new_name)
794
+ self.with_retries(self.datastore.client.indices.delete, index=new_name)
795
+ finally:
796
+ logger.warning("Unblock write to the index")
797
+ self.with_retries(
798
+ self.datastore.client.indices.put_settings,
799
+ settings=write_unblock_settings,
800
+ )
801
+
802
+ return True
803
+
804
+ def multiget(self, key_list, as_dictionary=True, as_obj=True, error_on_missing=True):
805
+ """Get a list of documents from the datastore and make sure they are normalized using
806
+ the model class
807
+
808
+ :param error_on_missing: Should it raise a key error when keys are missing
809
+ :param as_dictionary: Return a dictionary of items or a list
810
+ :param as_obj: Return objects or not
811
+ :param key_list: list of keys of documents to get
812
+ :return: list of instances of the model class
813
+ """
814
+
815
+ def add_to_output(data_output, data_id):
816
+ if "__non_doc_raw__" in data_output:
817
+ if as_dictionary:
818
+ out[data_id] = data_output["__non_doc_raw__"]
819
+ else:
820
+ out.append(data_output["__non_doc_raw__"]) # type: ignore
821
+ else:
822
+ data_output.pop("id", None)
823
+ if as_dictionary:
824
+ out[data_id] = self.normalize(data_output, as_obj=as_obj)
825
+ else:
826
+ out.append(self.normalize(data_output, as_obj=as_obj)) # type: ignore
827
+
828
+ out: Union[dict[str, Any], list[Any]]
829
+ if as_dictionary:
830
+ out = {}
831
+ else:
832
+ out = []
833
+
834
+ if key_list:
835
+ data = self.with_retries(self.datastore.client.mget, ids=key_list, index=self.name)
836
+
837
+ for row in data.get("docs", []):
838
+ if "found" in row and not row["found"]:
839
+ continue
840
+
841
+ try:
842
+ key_list.remove(row["_id"])
843
+ add_to_output(row["_source"], row["_id"])
844
+ except ValueError:
845
+ logger.exception(f'MGet returned multiple documents for id: {row["_id"]}')
846
+
847
+ if key_list and error_on_missing:
848
+ raise MultiKeyError(key_list, out)
849
+
850
+ return out
851
+
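# Illustrative sketch, not from the package source: fetching several documents
# at once. `user_collection` is an assumed ESCollection and the ids are assumed
# to exist; with error_on_missing=False, absent ids are dropped from the result
# instead of raising MultiKeyError.
users = user_collection.multiget(["admin", "user"], as_obj=False)
# users is a dict keyed by document id; pass as_dictionary=False for a list.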
852
+ def normalize(self, data, as_obj=True) -> Union[ModelType, dict[str, Any], None]:
853
+ """Normalize the data using the model class
854
+
855
+ :param as_obj: Return an object instead of a dictionary
856
+ :param data: data to normalize
857
+ :return: instance of the model class
858
+ """
859
+ if as_obj and data is not None and self.model_class and not isinstance(data, self.model_class):
860
+ return self.model_class(data)
861
+
862
+ if isinstance(data, dict):
863
+ data = {k: v for k, v in data.items() if k not in BANNED_FIELDS}
864
+
865
+ return data
866
+
867
+ def exists(self, key):
868
+ """Check if a document exists in the datastore.
869
+
870
+ :param key: key of the document to get from the datastore
871
+ :return: true/false depending if the document exists or not
872
+ """
873
+ return self.with_retries(self.datastore.client.exists, index=self.name, id=key, _source=False)
874
+
875
+ def _get(self, key, retries, version=False):
876
+ """Versioned get-save for atomic update has two paths:
877
+ 1. Document doesn't exist at all. Create token will be returned for version.
878
+ This way only the first query to try and create the document will succeed.
879
+ 2. Document exists in hot. A version string with the info needed to do a versioned save is returned.
880
+
881
+ The create token is needed to differentiate between "I'm saving a new
882
+ document non-atomic (version=None)" and "I'm saving a new document
883
+ atomically (version=CREATE_TOKEN)".
884
+ """
885
+
886
+ def normalize_output(data_output):
887
+ if "__non_doc_raw__" in data_output:
888
+ return data_output["__non_doc_raw__"]
889
+ data_output.pop("id", None)
890
+ return data_output
891
+
892
+ if retries is None:
893
+ retries = self.RETRY_NONE
894
+
895
+ done = False
896
+ while not done:
897
+ try:
898
+ doc = self.with_retries(self.datastore.client.get, index=self.name, id=key)
899
+ if version:
900
+ return (
901
+ normalize_output(doc["_source"]),
902
+ f"{doc['_seq_no']}---{doc['_primary_term']}",
903
+ )
904
+ return normalize_output(doc["_source"])
905
+ except elasticsearch.exceptions.NotFoundError:
906
+ pass
907
+
908
+ if retries > 0:
909
+ time.sleep(0.05)
910
+ retries -= 1
911
+ elif retries < 0:
912
+ time.sleep(0.05)
913
+ else:
914
+ done = True
915
+
916
+ if version:
917
+ return None, CREATE_TOKEN
918
+
919
+ return None
920
+
921
+ def get(self, key, as_obj=True, version=False):
922
+ """Get a document from the datastore, retry a few times if not found and normalize the
923
+ document with the model provided with the collection.
924
+
925
+ This is the normal way to get data out of the system.
926
+
927
+ :param archive_access: Temporary sets access value to archive during this call
928
+ :param as_obj: Should the data be returned as an ODM object
929
+ :param key: key of the document to get from the datastore
930
+ :param version: should the version number be returned by the call
931
+ :return: an instance of the model class loaded with the document data
932
+ """
933
+ data = self._get(key, self.RETRY_NORMAL, version=version)
934
+ if version:
935
+ data, version = data
936
+ return self.normalize(data, as_obj=as_obj), version
937
+ return self.normalize(data, as_obj=as_obj)
938
+
939
+ def get_if_exists(self, key, as_obj=True, version=False):
940
+ """Get a document from the datastore but do not retry if not found.
941
+
942
+ Use this in caching scenarios, because an eventually consistent database may
943
+ report a document as missing even though it exists.
944
+
945
+ :param archive_access: Temporary sets access value to archive during this call
946
+ :param as_obj: Should the data be returned as an ODM object
947
+ :param key: key of the document to get from the datastore
948
+ :param version: should the version number be returned by the call
949
+ :return: an instance of the model class loaded with the document data
950
+ """
951
+ data = self._get(key, self.RETRY_NONE, version=version)
952
+ if version:
953
+ data, version = data
954
+ return self.normalize(data, as_obj=as_obj), version
955
+ return self.normalize(data, as_obj=as_obj)
956
+
957
+ def require(
958
+ self, key, as_obj=True, version=False
959
+ ) -> Union[
960
+ tuple[Optional[Union[dict[str, Any], ModelType]], str],
961
+ Optional[Union[dict[str, Any], ModelType]],
962
+ ]:
963
+ """Get a document from the datastore and retry forever because we know for sure
964
+ that this document should exist. If it does not right now, this will wait for the
965
+ document to show up in the datastore.
966
+
967
+ :param archive_access: Temporary sets access value to archive during this call
968
+ :param as_obj: Should the data be returned as an ODM object
969
+ :param key: key of the document to get from the datastore
970
+ :param version: should the version number be returned by the call
971
+ :return: an instance of the model class loaded with the document data
972
+ """
973
+ data = self._get(key, self.RETRY_INFINITY, version=version)
974
+ if version:
975
+ data, version = data
976
+ return self.normalize(data, as_obj=as_obj), version
977
+ return self.normalize(data, as_obj=as_obj)
978
+
979
+ def save(self, key, data, version=None):
980
+ """Save to document to the datastore using the key as its document id.
981
+
982
+ The document data will be normalized before being saved in the datastore.
983
+
984
+ :param key: ID of the document to save
985
+ :param data: raw data or instance of the model class to save as the document
986
+ :param version: version of the document to save over, if the version check fails this will raise an exception
987
+ :return: True if the document was saved properly
988
+ """
989
+ if " " in key:
990
+ raise DataStoreException("You are not allowed to use spaces in datastore keys.")
991
+
992
+ data = self.normalize(data)
993
+
994
+ if self.model_class:
995
+ saved_data = data.as_primitives(hidden_fields=True)
996
+ else:
997
+ if not isinstance(data, dict):
998
+ saved_data = {"__non_doc_raw__": data}
999
+ else:
1000
+ saved_data = deepcopy(data)
1001
+
1002
+ saved_data["id"] = key
1003
+ operation = "index"
1004
+ seq_no = None
1005
+ primary_term = None
1006
+
1007
+ if version == CREATE_TOKEN:
1008
+ operation = "create"
1009
+ elif version:
1010
+ seq_no, primary_term = version.split("---")
1011
+
1012
+ try:
1013
+ self.with_retries(
1014
+ self.datastore.client.index,
1015
+ index=self.name,
1016
+ id=key,
1017
+ document=json.dumps(saved_data),
1018
+ op_type=operation,
1019
+ if_seq_no=seq_no,
1020
+ if_primary_term=primary_term,
1021
+ raise_conflicts=True,
1022
+ )
1023
+ except elasticsearch.BadRequestError as e:
1024
+ raise NonRecoverableError(
1025
+ f"When saving document {key} to elasticsearch, an exception occurred:\n{repr(e)}\n\n"
1026
+ f"Data: {json.dumps(saved_data)}"
1027
+ ) from e
1028
+
1029
+ return True
1030
+
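# Illustrative sketch, not from the package source: an optimistic-concurrency
# round trip using the version strings described above. `collection` is an
# assumed ESCollection whose model accepts this document, and "doc-id" is an
# assumed key.
data, version = collection.get_if_exists("doc-id", as_obj=False, version=True)
if data is None:
    # version is CREATE_TOKEN here, so only the first writer can create the document.
    collection.save("doc-id", {"value": 1}, version=version)
else:
    data["value"] = data.get("value", 0) + 1
    # Raises VersionConflictException if another writer saved in the meantime.
    collection.save("doc-id", data, version=version)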
1031
+ def delete(self, key):
1032
+ """This function should delete the underlying document referenced by the key.
1033
+ It should return true if the document was in fact properly deleted.
1034
+
1035
+ :param key: id of the document to delete
1036
+ :return: True if the delete was successful
1037
+ """
1038
+ try:
1039
+ info = self.with_retries(self.datastore.client.delete, id=key, index=self.name)
1040
+ return info["result"] == "deleted"
1041
+ except elasticsearch.NotFoundError:
1042
+ return False
1043
+
1044
+ def delete_by_query(self, query, workers=20, sort=None, max_docs=None):
1045
+ """This function should delete the underlying documents referenced by the query.
1046
+ It should return true if the documents were in fact properly deleted.
1047
+
1048
+ :param query: Query matching the documents to delete
1049
+ :param workers: Number of workers used for deletion if basic concurrency delete is used
1050
+ :return: True if the delete was successful
1051
+ """
1052
+ query = {"bool": {"must": {"query_string": {"query": query}}}}
1053
+ info = self._delete_async(self.name, query=query, sort=sort_str(parse_sort(sort)), max_docs=max_docs)
1054
+ return info.get("deleted", 0) != 0
1055
+
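# Illustrative sketch, not from the package source: bulk deletion by Lucene
# query string. `hit_collection` and the field name are assumptions.
removed_any = hit_collection.delete_by_query('howler.status:"resolved"', max_docs=1000)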
1056
+ def _create_scripts_from_operations(self, operations):
1057
+ op_sources = []
1058
+ op_params = {}
1059
+ val_id = 0
1060
+ for op, doc_key, value in operations:
1061
+ if op == self.UPDATE_SET:
1062
+ op_sources.append(f"ctx._source.{doc_key} = params.value{val_id}")
1063
+ op_params[f"value{val_id}"] = value
1064
+ elif op == self.UPDATE_DELETE:
1065
+ op_sources.append(f"ctx._source.{doc_key}.remove(params.value{val_id})")
1066
+ op_params[f"value{val_id}"] = value
1067
+ elif op == self.UPDATE_APPEND:
1068
+ op_sources.append(f"ctx._source.{doc_key}.add(params.value{val_id})")
1069
+ op_params[f"value{val_id}"] = value
1070
+ elif op == self.UPDATE_APPEND_IF_MISSING:
1071
+ script = (
1072
+ f"if (ctx._source.{doc_key}.indexOf(params.value{val_id}) == -1) "
1073
+ f"{{ctx._source.{doc_key}.add(params.value{val_id})}}"
1074
+ )
1075
+ op_sources.append(script)
1076
+ op_params[f"value{val_id}"] = value
1077
+ elif op == self.UPDATE_REMOVE:
1078
+ script = (
1079
+ f"if (ctx._source.{doc_key}.indexOf(params.value{val_id}) != -1) "
1080
+ f"{{ctx._source.{doc_key}.remove(ctx._source.{doc_key}.indexOf(params.value{val_id}))}}"
1081
+ )
1082
+ op_sources.append(script)
1083
+ op_params[f"value{val_id}"] = value
1084
+ elif op == self.UPDATE_INC:
1085
+ op_sources.append(f"ctx._source.{doc_key} += params.value{val_id}")
1086
+ op_params[f"value{val_id}"] = value
1087
+ elif op == self.UPDATE_DEC:
1088
+ op_sources.append(f"ctx._source.{doc_key} -= params.value{val_id}")
1089
+ op_params[f"value{val_id}"] = value
1090
+ elif op == self.UPDATE_MAX:
1091
+ script = (
1092
+ f"if (ctx._source.{doc_key} == null || "
1093
+ f"ctx._source.{doc_key}.compareTo(params.value{val_id}) < 0) "
1094
+ f"{{ctx._source.{doc_key} = params.value{val_id}}}"
1095
+ )
1096
+ op_sources.append(script)
1097
+ op_params[f"value{val_id}"] = value
1098
+ elif op == self.UPDATE_MIN:
1099
+ script = (
1100
+ f"if (ctx._source.{doc_key} == null || "
1101
+ f"ctx._source.{doc_key}.compareTo(params.value{val_id}) > 0) "
1102
+ f"{{ctx._source.{doc_key} = params.value{val_id}}}"
1103
+ )
1104
+ op_sources.append(script)
1105
+ op_params[f"value{val_id}"] = value
1106
+
1107
+ val_id += 1
1108
+
1109
+ joined_sources = """;\n""".join(op_sources)
1110
+
1111
+ return {
1112
+ "lang": "painless",
1113
+ "source": joined_sources.replace("};\n", "}\n"),
1114
+ "params": op_params,
1115
+ }
1116
+
1117
+ def _validate_operations(self, operations):
1118
+ """Validate the different operations received for a partial update
1119
+
1120
+ TODO: When the field is of type Mapping, the validation/check only works for depth 1. A full recursive
1121
+ solution is needed to support multi-depth cases.
1122
+
1123
+ :param operations: list of operation tuples
1124
+ :raises: DataStoreException if the operation is not valid
1125
+ """
1126
+ if self.model_class:
1127
+ fields = self.model_class.flat_fields(show_compound=True)
1128
+ if "classification in fields":
1129
+ fields.update(
1130
+ {
1131
+ "__access_lvl__": Integer(),
1132
+ "__access_req__": List(Keyword()),
1133
+ "__access_grp1__": List(Keyword()),
1134
+ "__access_grp2__": List(Keyword()),
1135
+ }
1136
+ )
1137
+ else:
1138
+ fields = None
1139
+
1140
+ ret_ops = []
1141
+ for op, doc_key, value in operations:
1142
+ if op not in self.UPDATE_OPERATIONS:
1143
+ raise DataStoreException(f"Not a valid Update Operation: {op}")
1144
+
1145
+ if fields is not None:
1146
+ prev_key = None
1147
+ if doc_key not in fields:
1148
+ if "." in doc_key:
1149
+ prev_key = doc_key[: doc_key.rindex(".")]
1150
+ if prev_key in fields and not isinstance(fields[prev_key], Mapping):
1151
+ raise DataStoreException(f"Invalid field for model: {prev_key}")
1152
+ else:
1153
+ raise DataStoreException(f"Invalid field for model: {doc_key}")
1154
+
1155
+ if prev_key:
1156
+ field = fields[prev_key].child_type
1157
+ else:
1158
+ field = fields[doc_key]
1159
+
1160
+ if op in [
1161
+ self.UPDATE_APPEND,
1162
+ self.UPDATE_APPEND_IF_MISSING,
1163
+ self.UPDATE_REMOVE,
1164
+ ]:
1165
+ try:
1166
+ value = field.check(value)
1167
+ except (ValueError, TypeError, AttributeError):
1168
+ raise DataStoreException(f"Invalid value for field {doc_key}: {value}")
1169
+
1170
+ elif op in [self.UPDATE_SET, self.UPDATE_DEC, self.UPDATE_INC]:
1171
+ try:
1172
+ value = field.check(value)
1173
+ except (ValueError, TypeError):
1174
+ raise DataStoreException(f"Invalid value for field {doc_key}: {value}")
1175
+
1176
+ if isinstance(value, Model):
1177
+ value = value.as_primitives()
1178
+ elif isinstance(value, datetime):
1179
+ value = value.isoformat()
1180
+ elif isinstance(value, ClassificationObject):
1181
+ value = str(value)
1182
+
1183
+ ret_ops.append((op, doc_key, value))
1184
+
1185
+ return ret_ops
1186
+
1187
+ def update(self, key, operations, version=None):
1188
+ """This function performs an atomic update on some fields from the
1189
+ underlying documents referenced by the id using a list of operations.
1190
+
1191
+ Operations supported by the update function are the following:
1192
+ INTEGER ONLY: Increase and decrease value
1193
+ LISTS ONLY: Append and remove items
1194
+ ALL TYPES: Set value
1195
+
1196
+ :param key: ID of the document to modify
1197
+ :param operations: List of operation tuples, e.g. [(SET, document_key, operation_value), ...]
1198
+ :return: True if the update was successful
1199
+ """
1200
+ operations = self._validate_operations(operations)
1201
+ script = self._create_scripts_from_operations(operations)
1202
+ seq_no = None
1203
+ primary_term = None
1204
+ if version:
1205
+ seq_no, primary_term = version.split("---")
1206
+
1207
+ try:
1208
+ res = self.with_retries(
1209
+ self.datastore.client.update,
1210
+ index=self.name,
1211
+ id=key,
1212
+ script=script,
1213
+ if_seq_no=seq_no,
1214
+ if_primary_term=primary_term,
1215
+ raise_conflicts=seq_no and primary_term,
1216
+ )
1217
+ return (
1218
+ res["result"] == "updated",
1219
+ f"{res['_seq_no']}---{res['_primary_term']}",
1220
+ )
1221
+ except elasticsearch.NotFoundError as e:
1222
+ logger.warning("Update - elasticsearch.NotFoundError: %s %s", e.message, e.info)
1223
+ except elasticsearch.BadRequestError as e:
1224
+ logger.warning("Update - elasticsearch.BadRequestError: %s %s", e.message, e.info)
1225
+ return False
1226
+ except VersionConflictException as e:
1227
+ logger.warning("Update - elasticsearch.ConflictError: %s", e.message)
1228
+ raise
1229
+ except Exception as e:
1230
+ logger.warning("Update - Generic Exception: %s", str(e))
1231
+ return False
1232
+
1233
+ return False
1234
+
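# Illustrative sketch, not from the package source: the operation tuples the
# update method accepts. `hit_collection` and the field names are assumptions;
# the operation names are the UPDATE_* constants defined on ESCollection.
from howler.datastore.collection import ESCollection

result = hit_collection.update(
    "hit-id",
    [
        (ESCollection.UPDATE_INC, "howler.hits", 1),
        (ESCollection.UPDATE_APPEND_IF_MISSING, "howler.labels", "triaged"),
        (ESCollection.UPDATE_SET, "howler.status", "in-progress"),
    ],
)
# On success this is a (True, "<seq_no>---<primary_term>") tuple; on most
# failures it is False.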
1235
+ def update_by_query(self, query, operations, filters=None, access_control=None, max_docs=None):
1236
+ """This function performs an atomic update on some fields from the
1237
+ underlying documents matching the query and the filters using a list of operations.
1238
+
1239
+ Operations supported by the update function are the following:
1240
+ INTEGER ONLY: Increase and decrease value
1241
+ LISTS ONLY: Append and remove items
1242
+ ALL TYPES: Set value
1243
+
1244
+ :param access_control:
1245
+ :param filters: Filter queries to reduce the data
1246
+ :param query: Query to find the matching documents
1247
+ :param operations: List of operation tuples, e.g. [(SET, document_key, operation_value), ...]
1248
+ :return: True if the update was successful
1249
+ """
1250
+ operations = self._validate_operations(operations)
1251
+ if filters is None:
1252
+ filters = []
1253
+
1254
+ if access_control:
1255
+ filters.append(access_control)
1256
+
1257
+ script = self._create_scripts_from_operations(operations)
1258
+
1259
+ try:
1260
+ res = self._update_async(
1261
+ self.name,
1262
+ script=script,
1263
+ query={
1264
+ "bool": {
1265
+ "must": {"query_string": {"query": query}},
1266
+ "filter": [{"query_string": {"query": ff}} for ff in filters],
1267
+ }
1268
+ },
1269
+ max_docs=max_docs,
1270
+ )
1271
+ except Exception:
1272
+ return False
1273
+
1274
+ return res["updated"]
1275
+
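# Illustrative sketch, not from the package source: the same operation tuples
# applied to every document matching a query, optionally narrowed by filter
# queries. `hit_collection` and the field names are assumptions.
updated_count = hit_collection.update_by_query(
    'howler.assignment:"unassigned"',
    [(ESCollection.UPDATE_SET, "howler.assignment", "analyst@example.com")],
    filters=['howler.status:"open"'],
)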
1276
+ def _format_output(self, result, fields=None, as_obj=True):
1277
+ # Getting search document data
1278
+ extra_fields = result.get("fields", {})
1279
+ source_data = result.pop("_source", None)
1280
+
1281
+ if source_data is not None:
1282
+ for f in BANNED_FIELDS:
1283
+ source_data.pop(f, None)
1284
+
1285
+ item_id = result["_id"]
1286
+
1287
+ if self.model_class:
1288
+ if not fields:
1289
+ fields = list(self.stored_fields.keys())
1290
+ fields.append("id")
1291
+ elif isinstance(fields, str):
1292
+ fields = fields.split(",")
1293
+
1294
+ extra_fields = _strip_lists(self.model_class, extra_fields)
1295
+ if as_obj:
1296
+ if "_index" in fields and "_index" in result:
1297
+ extra_fields["_index"] = result["_index"]
1298
+ if "*" in fields:
1299
+ fields = None
1300
+ return self.model_class(source_data, mask=fields, docid=item_id, extra_fields=extra_fields)
1301
+ else:
1302
+ source_data = recursive_update(source_data, extra_fields, allow_recursion=False)
1303
+ if "id" in fields:
1304
+ source_data["id"] = item_id
1305
+ if "_index" in fields and "_index" in result:
1306
+ source_data["_index"] = result["_index"]
1307
+
1308
+ if isinstance(fields, str):
1309
+ fields = [fields]
1310
+
1311
+ if fields is None or "*" in fields or "id" in fields:
1312
+ source_data["id"] = [item_id]
1313
+
1314
+ if fields is None or "*" in fields:
1315
+ return source_data
1316
+
1317
+ return prune(source_data, fields, self.stored_fields, mapping_class=Mapping)
1318
+
1319
+ def _search(self, args=None, deep_paging_id=None, use_archive=False, track_total_hits=None):
1320
+ if args is None:
1321
+ args = []
1322
+
1323
+ params = {}
1324
+ if deep_paging_id is not None:
1325
+ params = {"scroll": self.SCROLL_TIMEOUT}
1326
+ elif track_total_hits:
1327
+ params["track_total_hits"] = track_total_hits
1328
+
1329
+ parsed_values = deepcopy(self.DEFAULT_SEARCH_VALUES)
1330
+
1331
+ # TODO: we should validate values for max rows, group length, history length...
1332
+ for key, value in args:
1333
+ if key not in parsed_values:
1334
+ all_args = "; ".join("%s=%s" % (field_name, field_value) for field_name, field_value in args)
1335
+ raise HowlerValueError("Unknown query argument: %s %s of [%s]" % (key, value, all_args))
1336
+
1337
+ parsed_values[key] = value
1338
+
1339
+ # This is our minimal query, the following sections will fill it out
1340
+ # with whatever extra options the search has been given.
1341
+ query_body = {
1342
+ "query": {
1343
+ "bool": {
1344
+ "must": {"query_string": {"query": parsed_values["query"]}},
1345
+ "filter": [{"query_string": {"query": ff}} for ff in parsed_values["filters"]],
1346
+ }
1347
+ },
1348
+ "from_": parsed_values["start"],
1349
+ "size": parsed_values["rows"],
1350
+ "sort": parse_sort(parsed_values["sort"]),
1351
+ "_source": parsed_values["field_list"] or list(self.stored_fields.keys()),
1352
+ }
1353
+
1354
+ if parsed_values["script_fields"]:
1355
+ fields = {}
1356
+ for f_name, f_script in parsed_values["script_fields"]:
1357
+ fields[f_name] = {"script": {"lang": "painless", "source": f_script}}
1358
+ query_body["script_fields"] = fields
1359
+
1360
+ if parsed_values["df"]:
1361
+ query_body["query"]["bool"]["must"]["query_string"]["default_field"] = parsed_values["df"]
1362
+
1363
+ # Time limit for the query
1364
+ if parsed_values["timeout"]:
1365
+ query_body["timeout"] = parsed_values["timeout"]
1366
+
1367
+ # Add a histogram aggregation
1368
+ if parsed_values["histogram_active"]:
1369
+ query_body.setdefault("aggregations", {})
1370
+ if parsed_values["histogram_type"] == "date_histogram":
1371
+ interval_type = "fixed_interval"
1372
+ else:
1373
+ interval_type = "interval"
1374
+ query_body["aggregations"]["histogram"] = {
1375
+ parsed_values["histogram_type"]: {
1376
+ "field": parsed_values["histogram_field"],
1377
+ interval_type: parsed_values["histogram_gap"],
1378
+ "min_doc_count": parsed_values["histogram_mincount"],
1379
+ "extended_bounds": {
1380
+ "min": parsed_values["histogram_start"],
1381
+ "max": parsed_values["histogram_end"],
1382
+ },
1383
+ }
1384
+ }
1385
+
1386
+ # Add a facet aggregation
1387
+ if parsed_values["facet_active"]:
1388
+ query_body.setdefault("aggregations", {})
1389
+ for field in parsed_values["facet_fields"]:
1390
+ field_script = parsed_values["field_script"]
1391
+ if field_script:
1392
+ facet_body = {
1393
+ "script": {"source": field_script},
1394
+ "min_doc_count": parsed_values["facet_mincount"],
1395
+ }
1396
+ else:
1397
+ facet_body = {
1398
+ "field": field,
1399
+ "min_doc_count": parsed_values["facet_mincount"],
1400
+ "size": parsed_values["rows"],
1401
+ }
1402
+ query_body["aggregations"][field] = {"terms": facet_body}
1403
+
1404
+ # Add a stats aggregation
1405
+ if parsed_values["stats_active"]:
1406
+ query_body.setdefault("aggregations", {})
1407
+ for field in parsed_values["stats_fields"]:
1408
+ field_script = parsed_values["field_script"]
1409
+ if field_script:
1410
+ stats_body = {"script": {"source": field_script}}
1411
+ else:
1412
+ stats_body = {"field": field}
1413
+
1414
+ query_body["aggregations"][f"{field}_stats"] = {"stats": stats_body}
1415
+
1416
+ # Add a group aggregation
1417
+ if parsed_values["group_active"]:
1418
+ query_body["collapse"] = {
1419
+ "field": parsed_values["group_field"],
1420
+ "inner_hits": {
1421
+ "name": "group",
1422
+ "_source": parsed_values["field_list"] or list(self.stored_fields.keys()),
1423
+ "size": parsed_values["group_limit"],
1424
+ "sort": parse_sort(parsed_values["group_sort"]) or [{parsed_values["group_field"]: "asc"}],
1425
+ },
1426
+ }
1427
+
1428
+ try:
1429
+ if deep_paging_id is not None and not deep_paging_id == "*":
1430
+ # Get the next page
1431
+ result = self.with_retries(
1432
+ self.datastore.client.scroll,
1433
+ scroll_id=deep_paging_id,
1434
+ **params,
1435
+ )
1436
+ else:
1437
+ # Run the query
1438
+ result = self.with_retries(
1439
+ self.datastore.client.search,
1440
+ index=self.name,
1441
+ **params,
1442
+ **query_body,
1443
+ )
1444
+
1445
+ return result
1446
+ except (
1447
+ elasticsearch.ConnectionError,
1448
+ elasticsearch.ConnectionTimeout,
1449
+ ) as error:
1450
+ raise SearchRetryException("collection: %s, query: %s, error: %s" % (self.name, query_body, str(error)))
1451
+
1452
+ except (elasticsearch.TransportError, elasticsearch.RequestError) as e:
1453
+ try:
1454
+ err_msg = e.info["error"]["root_cause"][0]["reason"] # type: ignore
1455
+ except (ValueError, KeyError, IndexError):
1456
+ err_msg = str(e)
1457
+
1458
+ raise SearchException(err_msg)
1459
+
1460
+ except Exception as error:
1461
+ raise SearchException("collection: %s, query: %s, error: %s" % (self.name, query_body, str(error)))
1462
+
1463
+ def search(
1464
+ self,
1465
+ query,
1466
+ offset=0,
1467
+ rows=None,
1468
+ sort=None,
1469
+ fl=None,
1470
+ timeout=None,
1471
+ filters=None,
1472
+ access_control=None,
1473
+ deep_paging_id=None,
1474
+ as_obj=True,
1475
+ use_archive=False,
1476
+ track_total_hits=None,
1477
+ script_fields=[],
1478
+ ):
1479
+ """This function should perform a search through the datastore and return a
1480
+ search result object that consists of the following::
1481
+
1482
+ {
1483
+ "offset": 0, # Offset in the search index
1484
+ "rows": 25, # Number of document returned per page
1485
+ "total": 123456, # Total number of documents matching the query
1486
+ "items": [ # List of dictionary where each keys are one of
1487
+ { # the field list parameter specified
1488
+ fl[0]: value,
1489
+ ...
1490
+ fl[x]: value
1491
+ }, ...]
1492
+ }
1493
+
1494
+ :param script_fields: List of (name, script) tuples for fields to be evaluated at runtime
1495
+ :param track_total_hits: Return the total matching document count
1496
+ :param use_archive: Query also the archive
1497
+ :param deep_paging_id: ID of the next page during deep paging searches
1498
+ :param as_obj: Return objects instead of dictionaries
1499
+ :param query: lucene query to search for
1500
+ :param offset: offset at which you want the results to start (paging)
1501
+ :param rows: number of items that the search function should return
1502
+ :param sort: field to sort the data with
1503
+ :param fl: list of fields to return from the search
1504
+ :param timeout: maximum time of execution
1505
+ :param filters: additional queries to run on the original query to reduce the scope
1506
+ :param access_control: access control parameters to limit the scope of the query
1507
+ :return: a search result object
1508
+ """
1509
+ if offset is None:
1510
+ offset = self.DEFAULT_OFFSET
1511
+
1512
+ if rows is None:
1513
+ rows = self.DEFAULT_ROW_SIZE
1514
+
1515
+ if sort is None:
1516
+ sort = self.DEFAULT_SORT
1517
+
1518
+ if filters is None:
1519
+ filters = []
1520
+ elif isinstance(filters, str):
1521
+ filters = [filters]
1522
+
1523
+ if access_control:
1524
+ filters.append(access_control)
1525
+
1526
+ args = [
1527
+ ("query", query),
1528
+ ("start", offset),
1529
+ ("rows", rows),
1530
+ ("sort", sort),
1531
+ ("df", self.DEFAULT_SEARCH_FIELD),
1532
+ ]
1533
+
1534
+ if fl:
1535
+ field_list = fl.split(",")
1536
+ args.append(("field_list", field_list))
1537
+ else:
1538
+ field_list = None
1539
+
1540
+ if timeout:
1541
+ args.append(("timeout", "%sms" % timeout))
1542
+
1543
+ if filters:
1544
+ args.append(("filters", filters))
1545
+
1546
+ if script_fields:
1547
+ args.append(("script_fields", script_fields))
1548
+
1549
+ result = self._search(
1550
+ args,
1551
+ deep_paging_id=deep_paging_id,
1552
+ use_archive=use_archive,
1553
+ track_total_hits=track_total_hits,
1554
+ )
1555
+
1556
+ ret_data: dict[str, Any] = {
1557
+ "offset": int(offset),
1558
+ "rows": int(rows),
1559
+ "total": int(result["hits"]["total"]["value"]),
1560
+ "items": [self._format_output(doc, field_list, as_obj=as_obj) for doc in result["hits"]["hits"]],
1561
+ }
1562
+
1563
+ new_deep_paging_id = result.get("_scroll_id", None)
1564
+
1565
+ # Check if the scroll is finished and close it
1566
+ if deep_paging_id is not None and new_deep_paging_id is None:
1567
+ self.with_retries(
1568
+ self.datastore.client.clear_scroll,
1569
+ scroll_id=[deep_paging_id],
1570
+ ignore=(404,),
1571
+ )
1572
+
1573
+ # Check if we can tell from inspection that we have finished the scroll
1574
+ if new_deep_paging_id is not None and len(ret_data["items"]) < ret_data["rows"]:
1575
+ self.with_retries(
1576
+ self.datastore.client.clear_scroll,
1577
+ scroll_id=[new_deep_paging_id],
1578
+ ignore=(404,),
1579
+ )
1580
+ new_deep_paging_id = None
1581
+
1582
+ if new_deep_paging_id is not None:
1583
+ ret_data["next_deep_paging_id"] = new_deep_paging_id
1584
+
1585
+ return ret_data
1586
+
1587
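A minimal usage sketch for `search` (the `hit_collection` instance, the query, the sort string, and the field names in `fl` are illustrative assumptions):

results = hit_collection.search(
    query="howler.analytic:*",         # Lucene query
    offset=0,
    rows=25,
    sort="timestamp desc",
    fl="howler.id,howler.analytic",    # comma-separated field list
    as_obj=False,                      # return plain dictionaries instead of model objects
)
print(results["total"])                # total number of matching documents
for item in results["items"]:          # each item carries only the requested fields
    print(item)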
+ def stream_search(
1588
+ self,
1589
+ query,
1590
+ fl=None,
1591
+ filters=None,
1592
+ access_control=None,
1593
+ item_buffer_size=200,
1594
+ as_obj=True,
1595
+ use_archive=False,
1596
+ ):
1597
+ """This function should perform a search through the datastore and stream
1598
+ all matching results as dictionaries of key/value pairs where the keys
1599
+ are the fields specified in the field list parameter.
1600
+
1601
+ >>> # noinspection PyUnresolvedReferences
1602
+ >>> {
1603
+ >>> fl[0]: value,
1604
+ >>> ...
1605
+ >>> fl[x]: value
1606
+ >>> }
1607
+
1608
+ :param use_archive: Query also the archive
1609
+ :param as_obj: Return objects instead of dictionaries
1610
+ :param query: lucene query to search for
1611
+ :param fl: list of fields to return from the search
1612
+ :param filters: additional queries to run on the original query to reduce the scope
1613
+ :param access_control: access control parameters to run the query with
1614
+ :param item_buffer_size: number of items to buffer with each search call
1615
+ :return: a generator of dictionaries containing the requested fields
1616
+ """
1617
+ if item_buffer_size > 2000 or item_buffer_size < 50:
1618
+ raise SearchException("Variable item_buffer_size must be between 50 and 2000.")
1619
+
1620
+ if filters is None:
1621
+ filters = []
1622
+ elif isinstance(filters, str):
1623
+ filters = [filters]
1624
+
1625
+ if access_control:
1626
+ filters.append(access_control)
1627
+
1628
+ if fl:
1629
+ fl = fl.split(",")
1630
+
1631
+ query_expression = {
1632
+ "bool": {
1633
+ "must": {
1634
+ "query_string": {
1635
+ "query": query,
1636
+ "default_field": self.DEFAULT_SEARCH_FIELD,
1637
+ }
1638
+ },
1639
+ "filter": [{"query_string": {"query": ff}} for ff in filters],
1640
+ }
1641
+ }
1642
+ sort = parse_sort(self.datastore.DEFAULT_SORT)
1643
+ source = fl or list(self.stored_fields.keys())
1644
+
1645
+ for value in self.scan_with_retry(
1646
+ query=query_expression,
1647
+ sort=sort,
1648
+ source=source,
1649
+ index=self.name,
1650
+ size=item_buffer_size,
1651
+ ):
1652
+ # Unpack the results, ensure the id is always set
1653
+ yield self._format_output(value, fl, as_obj=as_obj)
1654
+
1655
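A sketch of `stream_search` usage, which iterates over every matching document without manual paging (same assumed `hit_collection` instance and illustrative field names as above):

for item in hit_collection.stream_search(
    query="howler.status:open",
    fl="howler.id",
    as_obj=False,
    item_buffer_size=500,   # must be between 50 and 2000
):
    process(item)           # `process` is a placeholder for caller code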
+ def raw_eql_search(
1656
+ self,
1657
+ eql_query: str,
1658
+ fl: Optional[str] = None,
1659
+ filters: Optional[Union[list[str], str]] = None,
1660
+ rows: Optional[int] = None,
1661
+ timeout: Optional[int] = None,
1662
+ as_obj=True,
1663
+ ):
1664
+ if filters is None:
1665
+ filters = []
1666
+ elif isinstance(filters, str):
1667
+ filters = [filters]
1668
+
1669
+ parsed_filters = {
1670
+ "bool": {
1671
+ "must": {"query_string": {"query": "*:*"}},
1672
+ "filter": [{"query_string": {"query": ff}} for ff in filters],
1673
+ }
1674
+ }
1675
+
1676
+ if not fl:
1677
+ fl = "howler.id"
1678
+
1679
+ if rows is None:
1680
+ rows = 5
1681
+
1682
+ fields = [{"field": f} for f in fl.split(",")]
1683
+
1684
+ try:
1685
+ result = self.with_retries(
1686
+ self.datastore.client.eql.search,
1687
+ index=self.name,
1688
+ timestamp_field="timestamp",
1689
+ query=eql_query,
1690
+ fields=fields,
1691
+ filter=parsed_filters,
1692
+ size=rows,
1693
+ wait_for_completion_timeout=(f"{timeout}ms" if timeout is not None else None),
1694
+ )
1695
+
1696
+ ret_data: dict[str, Any] = {
1697
+ "rows": int(rows),
1698
+ "total": int(result["hits"]["total"]["value"]),
1699
+ "items": [
1700
+ self._format_output(doc, fl.split(","), as_obj=as_obj) for doc in result["hits"].get("events", [])
1701
+ ],
1702
+ "sequences": [
1703
+ [self._format_output(doc, fl.split(","), as_obj=as_obj) for doc in sequence.get("events", [])]
1704
+ for sequence in result["hits"].get("sequences", [])
1705
+ ],
1706
+ }
1707
+
1708
+ return ret_data
1709
+
1710
+ except (elasticsearch.TransportError, elasticsearch.RequestError) as e:
1711
+ try:
1712
+ err_msg = e.info["error"]["root_cause"][0]["reason"] # type: ignore
1713
+ except (ValueError, KeyError, IndexError):
1714
+ err_msg = str(e)
1715
+
1716
+ raise SearchException(err_msg)
1717
+ except Exception as error:
1718
+ raise SearchException(f"collection: {self.name}, error: {str(error)}")
1719
+
1720
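A sketch of `raw_eql_search` (the EQL query text and field names are illustrative; the method expects Elasticsearch EQL syntax rather than Lucene):

res = hit_collection.raw_eql_search(
    eql_query='any where howler.escalation == "alert"',
    fl="howler.id,howler.analytic",
    rows=10,
    as_obj=False,
)
res["items"]       # flat event matches
res["sequences"]   # grouped matches, populated for EQL sequence queries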
+ def keys(self, access_control=None):
1721
+ """This function streams the keys of all the documents of this collection.
1722
+
1723
+ :param access_control: access control parameter to limit the scope of the key scan
1724
+ :return: a generator of keys
1725
+ """
1726
+ for item in self.stream_search("id:*", fl="id", access_control=access_control):
1727
+ try:
1728
+ yield item._id
1729
+ except AttributeError:
1730
+ value = item["id"]
1731
+ if isinstance(value, list):
1732
+ for v in value:
1733
+ yield v
1734
+ else:
1735
+ yield value
1736
+
1737
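A sketch of `keys`, which simply streams document ids (assumed `hit_collection` instance as above):

# Collect every document id visible to the given access control.
all_ids = list(hit_collection.keys(access_control=None))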
+ def _validate_steps_count(self, start, end, gap):
1738
+ with warnings.catch_warnings():
1739
+ warnings.simplefilter("ignore")
1740
+
1741
+ gaps_count = None
1742
+ ret_type: Optional[type] = None
1743
+
1744
+ try:
1745
+ start = int(start)
1746
+ end = int(end)
1747
+ gap = int(gap)
1748
+
1749
+ gaps_count = int((end - start) / gap)
1750
+ ret_type = int
1751
+ except ValueError:
1752
+ pass
1753
+
1754
+ if not gaps_count:
1755
+ try:
1756
+ t_gap = gap.strip("+").strip("-")
1757
+
1758
+ parsed_start = dm(self.datastore.to_pydatemath(start)).int_timestamp
1759
+ parsed_end = dm(self.datastore.to_pydatemath(end)).int_timestamp
1760
+ parsed_gap = dm(self.datastore.to_pydatemath(f"+{t_gap}")).int_timestamp - dm("now").int_timestamp
1761
+
1762
+ gaps_count = int((parsed_end - parsed_start) / parsed_gap)
1763
+ ret_type = str
1764
+ except (DateMathException, AttributeError):
1765
+ pass
1766
+
1767
+ if gaps_count is None:
1768
+ raise SearchException(
1769
+ "Could not parse histogram ranges. Either you've mix integer and dates values or you "
1770
+ "have invalid date math values. (start='%s', end='%s', gap='%s')" % (start, end, gap)
1771
+ )
1772
+
1773
+ if gaps_count > self.MAX_FACET_LIMIT:
1774
+ raise SearchException(
1775
+ f"Histograms are limited to a maximum of {self.MAX_FACET_LIMIT} steps. "
1776
+ f"Current settings would generate {gaps_count} steps"
1777
+ )
1778
+ return ret_type
1779
+
1780
+ def count(
1781
+ self,
1782
+ query,
1783
+ access_control=None,
1784
+ ):
1785
+ """This function should perform a count operation through the datastore and return a
1786
+ count result object that consists of the following:
1787
+
1788
+ {
1789
+ "total": 123456, # Total number of documents matching the query
1790
+ }
1791
+
1792
+ :param query: lucene query to search for
1793
+ :param access_control: access control parameters to limit the scope of the query
1794
+ :return: a count result object
1795
+ """
1796
+ result = self.with_retries(self.datastore.client.count, index=self.name, q=query)
1797
+
1798
+ ret_data: dict[str, Any] = {
1799
+ "count": result["count"],
1800
+ }
1801
+
1802
+ return ret_data
1803
+
1804
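A one-line usage sketch for `count` (assumed `hit_collection` instance as above; the returned value is illustrative):

hit_collection.count("howler.status:open")   # -> {"count": 42}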
+ def histogram(
1805
+ self,
1806
+ field,
1807
+ start,
1808
+ end,
1809
+ gap,
1810
+ query="id:*",
1811
+ mincount=None,
1812
+ filters=None,
1813
+ access_control=None,
1814
+ use_archive=False,
1815
+ ):
1816
+ type_modifier = self._validate_steps_count(start, end, gap)
1817
+ start = type_modifier(start)
1818
+ end = type_modifier(end)
1819
+ gap = type_modifier(gap)
1820
+
1821
+ if mincount is None:
1822
+ mincount = 1
1823
+
1824
+ if filters is None:
1825
+ filters = []
1826
+ elif isinstance(filters, str):
1827
+ filters = [filters]
1828
+ filters.append("{field}:[{min} TO {max}]".format(field=field, min=start, max=end))
1829
+
1830
+ args = [
1831
+ ("query", query),
1832
+ ("histogram_active", True),
1833
+ ("histogram_field", field),
1834
+ (
1835
+ "histogram_type",
1836
+ "date_histogram" if isinstance(gap, str) else "histogram",
1837
+ ),
1838
+ (
1839
+ "histogram_gap",
1840
+ gap.strip("+").strip("-") if isinstance(gap, str) else gap,
1841
+ ),
1842
+ ("histogram_mincount", mincount),
1843
+ ("histogram_start", start),
1844
+ ("histogram_end", end),
1845
+ ]
1846
+
1847
+ if access_control:
1848
+ filters.append(access_control)
1849
+
1850
+ if filters:
1851
+ args.append(("filters", filters))
1852
+
1853
+ result = self._search(args, use_archive=use_archive)
1854
+
1855
+ # Convert the histogram into a dictionary
1856
+ return {
1857
+ type_modifier(row.get("key_as_string", row["key"])): row["doc_count"]
1858
+ for row in result["aggregations"]["histogram"]["buckets"]
1859
+ }
1860
+
1861
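A sketch of `histogram` in its two modes; the type of the gap decides between a date_histogram and a numeric histogram aggregation (field names and ranges are illustrative assumptions):

# Date mode: date-math start/end and a "+1h" gap produce hourly buckets.
hit_collection.histogram(
    field="timestamp",
    start="now-1d",
    end="now",
    gap="+1h",
    query="howler.status:open",
)

# Integer mode: numeric start/end/gap, e.g. ten buckets of width 10.
hit_collection.histogram(field="howler.data.score", start=0, end=100, gap=10)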
+ def facet(
1862
+ self,
1863
+ field,
1864
+ query=None,
1865
+ prefix=None,
1866
+ contains=None,
1867
+ ignore_case=False,
1868
+ sort=None,
1869
+ rows=10,
1870
+ mincount=None,
1871
+ filters=None,
1872
+ access_control=None,
1873
+ use_archive=False,
1874
+ field_script=None,
1875
+ ):
1876
+ if not query:
1877
+ query = "id:*"
1878
+
1879
+ if not mincount:
1880
+ mincount = 1
1881
+
1882
+ if filters is None:
1883
+ filters = []
1884
+ elif isinstance(filters, str):
1885
+ filters = [filters]
1886
+
1887
+ args = [
1888
+ ("query", query),
1889
+ ("facet_active", True),
1890
+ ("facet_fields", [field]),
1891
+ ("facet_mincount", mincount),
1892
+ ("rows", rows),
1893
+ ]
1894
+
1895
+ # TODO: prefix, contains, ignore_case, sort
1896
+
1897
+ if access_control:
1898
+ filters.append(access_control)
1899
+
1900
+ if filters:
1901
+ args.append(("filters", filters))
1902
+
1903
+ if field_script:
1904
+ args.append(("field_script", field_script))
1905
+
1906
+ result = self._search(args, use_archive=use_archive)
1907
+
1908
+ # Convert the facet buckets into a dictionary
1909
+ return {
1910
+ row.get("key_as_string", row["key"]): row["doc_count"] for row in result["aggregations"][field]["buckets"]
1911
+ }
1912
+
1913
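A usage sketch for `facet`, which returns the top values of a field with their document counts (assumed instance and field names as above; counts are illustrative):

hit_collection.facet("howler.analytic", query="howler.status:open", rows=5)
# -> {"Some Analytic": 120, "Another Analytic": 45, ...}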
+ def stats(
1914
+ self,
1915
+ field,
1916
+ query="id:*",
1917
+ filters=None,
1918
+ access_control=None,
1919
+ use_archive=False,
1920
+ field_script=None,
1921
+ ):
1922
+ if filters is None:
1923
+ filters = []
1924
+ elif isinstance(filters, str):
1925
+ filters = [filters]
1926
+
1927
+ args = [
1928
+ ("query", query),
1929
+ ("stats_active", True),
1930
+ ("stats_fields", [field]),
1931
+ ("rows", 0),
1932
+ ]
1933
+
1934
+ if access_control:
1935
+ filters.append(access_control)
1936
+
1937
+ if filters:
1938
+ args.append(("filters", filters))
1939
+
1940
+ if field_script:
1941
+ args.append(("field_script", field_script))
1942
+
1943
+ result = self._search(args, use_archive=use_archive)
1944
+ return result["aggregations"][f"{field}_stats"]
1945
+
1946
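A sketch for `stats`, which returns the Elasticsearch stats aggregation (count/min/max/avg/sum) for a numeric field (the field name is an illustrative assumption):

hit_collection.stats("howler.data.score", query="howler.status:open")
# -> {"count": ..., "min": ..., "max": ..., "avg": ..., "sum": ...}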
+ def grouped_search(
1947
+ self,
1948
+ group_field,
1949
+ query="id:*",
1950
+ offset=0,
1951
+ sort=None,
1952
+ group_sort=None,
1953
+ fl=None,
1954
+ limit=1,
1955
+ rows=None,
1956
+ filters=None,
1957
+ access_control=None,
1958
+ as_obj=True,
1959
+ use_archive=False,
1960
+ track_total_hits=False,
1961
+ ):
1962
+ if rows is None:
1963
+ rows = self.DEFAULT_ROW_SIZE
1964
+
1965
+ if sort is None:
1966
+ sort = self.DEFAULT_SORT
1967
+
1968
+ if group_sort is None:
1969
+ group_sort = self.DEFAULT_SORT
1970
+
1971
+ if filters is None:
1972
+ filters = []
1973
+ elif isinstance(filters, str):
1974
+ filters = [filters]
1975
+
1976
+ args = [
1977
+ ("query", query),
1978
+ ("group_active", True),
1979
+ ("group_field", group_field),
1980
+ ("group_limit", limit),
1981
+ ("group_sort", group_sort),
1982
+ ("start", offset),
1983
+ ("rows", rows),
1984
+ ("sort", sort),
1985
+ ]
1986
+
1987
+ filters.append("%s:*" % group_field)
1988
+
1989
+ if fl:
1990
+ field_list = fl.split(",")
1991
+ args.append(("field_list", field_list))
1992
+ else:
1993
+ field_list = None
1994
+
1995
+ if access_control:
1996
+ filters.append(access_control)
1997
+
1998
+ if filters:
1999
+ args.append(("filters", filters))
2000
+
2001
+ result = self._search(args, use_archive=use_archive, track_total_hits=track_total_hits)
2002
+
2003
+ return {
2004
+ "offset": offset,
2005
+ "rows": rows,
2006
+ "total": int(result["hits"]["total"]["value"]),
2007
+ "items": [
2008
+ {
2009
+ "value": collapsed["fields"][group_field][0],
2010
+ "total": int(collapsed["inner_hits"]["group"]["hits"]["total"]["value"]),
2011
+ "items": [
2012
+ self._format_output(row, field_list, as_obj=as_obj)
2013
+ for row in collapsed["inner_hits"]["group"]["hits"]["hits"]
2014
+ ],
2015
+ }
2016
+ for collapsed in result["hits"]["hits"]
2017
+ ],
2018
+ }
2019
+
2020
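A sketch for `grouped_search`, which collapses results on a field and returns up to `limit` documents per group (assumed instance and field names as above):

grouped = hit_collection.grouped_search(
    group_field="howler.analytic",
    query="howler.status:open",
    limit=3,
    fl="howler.id",
    as_obj=False,
)
for group in grouped["items"]:
    group["value"]   # the collapsed field value for this group
    group["items"]   # up to `limit` matching documents in the group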
+ @staticmethod
2021
+ def _get_odm_type(ds_type):
2022
+ try:
2023
+ return BACK_MAPPING[ds_type].__name__.lower()
2024
+ except KeyError:
2025
+ return ds_type.lower()
2026
+
2027
+ def fields(self, skip_mapping_children=False):
2028
+ """
2029
+ This function should return all the fields in the index with their types
2030
+ """
2031
+
2032
+ def flatten_fields(props):
2033
+ out = {}
2034
+ for name, value in props.items():
2035
+ if "properties" in value:
2036
+ for child, cprops in flatten_fields(value["properties"]).items():
2037
+ out[name + "." + child] = cprops
2038
+ elif "type" in value:
2039
+ out[name] = value
2040
+ else:
2041
+ raise HowlerValueError("Unknown field data " + str(props))
2042
+ return out
2043
+
2044
+ data = self.with_retries(self.datastore.client.indices.get, index=self.name)
2045
+ index_name = list(data.keys())[0]
2046
+ properties = flatten_fields(data[index_name]["mappings"].get("properties", {}))
2047
+
2048
+ if self.model_class:
2049
+ model_fields = self.model_class.flat_fields()
2050
+ else:
2051
+ model_fields = {}
2052
+
2053
+ collection_data = {}
2054
+
2055
+ for p_name, p_val in properties.items():
2056
+ if p_name.startswith("_") or "//" in p_name:
2057
+ continue
2058
+ if not self.FIELD_SANITIZER.match(p_name):
2059
+ continue
2060
+ field_model = model_fields.get(p_name, None)
2061
+
2062
+ if "." in p_name:
2063
+ parent_p_name = re.sub(r"^(.+)\..+?$", r"\1", p_name)
2064
+ if parent_p_name in model_fields and isinstance(model_fields.get(parent_p_name), Mapping):
2065
+ if parent_p_name not in collection_data:
2066
+ field_model = model_fields.get(parent_p_name, None)
2067
+ f_type = self._get_odm_type(p_val.get("analyzer", None) or p_val["type"])
2068
+
2069
+ collection_data[parent_p_name] = {
2070
+ "default": self.DEFAULT_SEARCH_FIELD in p_val.get("copy_to", []),
2071
+ "indexed": p_val.get("index", p_val.get("enabled", True)),
2072
+ "list": field_model.multivalued if field_model else False,
2073
+ "stored": field_model.store if field_model else False,
2074
+ "type": f_type,
2075
+ "description": (field_model.description if field_model else ""),
2076
+ "regex": (
2077
+ field_model.child_type.validation_regex.pattern
2078
+ if issubclass(type(field_model.child_type), ValidatedKeyword)
2079
+ or issubclass(type(field_model.child_type), IP)
2080
+ else None
2081
+ ),
2082
+ "values": (
2083
+ list(field_model.child_type.values)
2084
+ if issubclass(type(field_model.child_type), Enum)
2085
+ else None
2086
+ ),
2087
+ "deprecated_description": (field_model.deprecated_description if field_model else ""),
2088
+ }
2089
+
2090
+ if skip_mapping_children:
2091
+ continue
2092
+ else:
2093
+ continue
2094
+
2095
+ f_type = self._get_odm_type(p_val.get("analyzer", None) or p_val["type"])
2096
+ collection_data[p_name] = {
2097
+ "default": self.DEFAULT_SEARCH_FIELD in p_val.get("copy_to", []),
2098
+ "indexed": p_val.get("index", p_val.get("enabled", True)),
2099
+ "list": field_model.multivalued if field_model else False,
2100
+ "stored": field_model.store if field_model else False,
2101
+ "deprecated": field_model.deprecated if field_model else False,
2102
+ "type": f_type,
2103
+ "description": field_model.description if field_model else "",
2104
+ "regex": (
2105
+ field_model.validation_regex.pattern
2106
+ if issubclass(type(field_model), ValidatedKeyword) or issubclass(type(field_model), IP)
2107
+ else None
2108
+ ),
2109
+ "values": list(field_model.values) if issubclass(type(field_model), Enum) else None,
2110
+ "deprecated_description": (field_model.deprecated_description if field_model else ""),
2111
+ }
2112
+
2113
+ collection_data.pop("id", None)
2114
+
2115
+ return collection_data
2116
+
2117
+ def _ilm_policy_exists(self):
2118
+ try:
2119
+ self.datastore.client.ilm.get_lifecycle(name=f"{self.name}_policy")
2120
+ except elasticsearch.NotFoundError:
2121
+ return False
2122
+ else:
2123
+ return True
2124
+
2125
+ def _delete_ilm_policy(self):
2126
+ try:
2127
+ self.datastore.client.ilm.delete_lifecycle(name=f"{self.name}_policy")
2128
+ except elasticsearch.ApiError:
2129
+ return False
2130
+ else:
2131
+ return True
2132
+
2133
+ def _get_index_settings(self) -> dict:
2134
+ default_stub: dict = deepcopy(default_index)
2135
+ settings: dict = default_stub.pop("settings", {})
2136
+
2137
+ if "index" not in settings:
2138
+ settings["index"] = {}
2139
+ settings["index"]["number_of_shards"] = self.shards
2140
+ settings["index"]["number_of_replicas"] = self.replicas
2141
+
2142
+ if "mapping" not in settings["index"]:
2143
+ settings["index"]["mapping"] = {}
2144
+
2145
+ if "total_fields" not in settings["index"]["mapping"]:
2146
+ settings["index"]["mapping"]["total_fields"] = {}
2147
+
2148
+ limit = len(self.model_class.flat_fields()) + 500 if self.model_class else 1500
2149
+ if limit < 1500:
2150
+ limit = 1500
2151
+ elif limit > 1500:
2152
+ logger.warning("ODM field size is larger than 1500 - set to %s", limit)
2153
+ settings["index"]["mapping"]["total_fields"]["limit"] = limit
2154
+
2155
+ return settings
2156
+
2157
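To make the total_fields limit arithmetic above concrete (field counts are illustrative):

# 1,200 model fields -> 1,200 + 500 = 1,700  (limit raised above 1,500, warning logged)
#   800 model fields ->   800 + 500 = 1,300  -> clamped up to the 1,500 floor
#    no model class  -> limit stays at the default 1,500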
+ def _get_index_mappings(self) -> dict:
2158
+ mappings: dict = deepcopy(default_mapping)
2159
+ if self.model_class:
2160
+ mappings["properties"], mappings["dynamic_templates"] = build_mapping(self.model_class.fields().values())
2161
+ mappings["dynamic_templates"].insert(0, default_dynamic_strings)
2162
+ else:
2163
+ mappings["dynamic_templates"] = deepcopy(default_dynamic_templates)
2164
+
2165
+ if not mappings["dynamic_templates"]:
2166
+ # Setting dynamic to strict prevents documents with fields not in the properties from being added
2167
+ mappings["dynamic"] = "strict"
2168
+
2169
+ mappings["properties"]["id"] = {
2170
+ "store": True,
2171
+ "doc_values": True,
2172
+ "type": "keyword",
2173
+ }
2174
+
2175
+ mappings["properties"]["__text__"] = {
2176
+ "store": False,
2177
+ "type": "text",
2178
+ }
2179
+
2180
+ return mappings
2181
+
2182
+ def __get_possible_fields(self, field):
2183
+ field_types = [field.__name__.lower()]
2184
+ if field.__bases__[0] != _Field:
2185
+ field_types.extend(self.__get_possible_fields(field.__bases__[0]))
2186
+
2187
+ if field_type := TYPE_MAPPING.get(field.__name__, None):
2188
+ field_types.append(field_type)
2189
+
2190
+ return field_types
2191
+
2192
+ def _check_fields(self, model=None):
2193
+ if not self.validate:
2194
+ return
2195
+
2196
+ if model is None:
2197
+ if self.model_class:
2198
+ return self._check_fields(self.model_class)
2199
+ return
2200
+
2201
+ fields = self.fields()
2202
+ model = self.model_class.flat_fields(skip_mappings=True)
2203
+
2204
+ missing = set(model.keys()) - set(fields.keys())
2205
+ if missing:
2206
+ # TODO: Bump mapping limit
2207
+ try:
2208
+ self._add_fields({key: model[key] for key in missing})
2209
+ except elasticsearch.BadRequestError as err:
2210
+ handled = False
2211
+ if err.body and isinstance(err.body, dict) and "error" in err.body and "reason" in err.body["error"]:
2212
+ reason: str = err.body["error"]["reason"]
2213
+ if reason.startswith("Limit of total fields"):
2214
+ current_count = int(re.sub(r".+\[(\d+)].+", r"\1", reason))
2215
+ logger.warning(
2216
+ "Current field cap %s is too low, increasing to %s", current_count, current_count + 500
2217
+ )
2218
+ self.with_retries(
2219
+ self.datastore.client.indices.put_settings,
2220
+ settings={"index.mapping.total_fields.limit": current_count + 500},
2221
+ )
2222
+ self._add_fields({key: model[key] for key in missing})
2223
+ handled = True
2224
+ if not handled:
2225
+ raise
2226
+
2227
+ matching = set(fields.keys()) & set(model.keys())
2228
+ for field_name in matching:
2229
+ if fields[field_name]["indexed"] != model[field_name].index and model[field_name].index:
2230
+ raise HowlerRuntimeError(f"Field {field_name} should be indexed but is not.")
2231
+
2232
+ possible_field_types = self.__get_possible_fields(model[field_name].__class__)
2233
+
2234
+ if fields[field_name]["type"] not in possible_field_types:
2235
+ raise HowlerRuntimeError(
2236
+ f"Field {field_name} didn't have the expected store "
2237
+ f"type. [{fields[field_name]['type']} != "
2238
+ f"{model[field_name].__class__.__name__.lower()}]"
2239
+ )
2240
+
2241
+ def _ensure_collection(self):
2242
+ """This function should test if the collection that you are trying to access does indeed exist
2243
+ and should create it if it does not.
2244
+
2245
+ :return:
2246
+ """
2247
+ # Create HOT index
2248
+ if not self.with_retries(self.datastore.client.indices.exists, index=self.name):
2249
+ logger.debug(f"Index {self.name.upper()} does not exists. Creating it now...")
2250
+ try:
2251
+ self.with_retries(
2252
+ self.datastore.client.indices.create,
2253
+ index=self.index_name,
2254
+ mappings=self._get_index_mappings(),
2255
+ settings=self._get_index_settings(),
2256
+ )
2257
+ except elasticsearch.exceptions.RequestError as e:
2258
+ if "resource_already_exists_exception" not in str(e):
2259
+ raise
2260
+ logger.warning(f"Tried to create an index template that already exists: {self.name.upper()}")
2261
+
2262
+ self.with_retries(
2263
+ self.datastore.client.indices.put_alias,
2264
+ index=self.index_name,
2265
+ name=self.name,
2266
+ )
2267
+ elif not self.with_retries(
2268
+ self.datastore.client.indices.exists, index=self.index_name
2269
+ ) and not self.with_retries(self.datastore.client.indices.exists_alias, name=self.name):
2270
+ # Turn on write block
2271
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_block_settings)
2272
+
2273
+ # Create a copy on the result index
2274
+ self._safe_index_copy(self.datastore.client.indices.clone, self.name, self.index_name)
2275
+
2276
+ # Make the hot index the new clone
2277
+ self.with_retries(
2278
+ self.datastore.client.indices.update_aliases,
2279
+ actions=[
2280
+ {"add": {"index": self.index_name, "alias": self.name}},
2281
+ {"remove_index": {"index": self.name}},
2282
+ ],
2283
+ )
2284
+
2285
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_unblock_settings)
2286
+
2287
+ self._check_fields()
2288
+
2289
+ def _add_fields(self, missing_fields: Dict):
2290
+ no_fix = []
2291
+ properties = {}
2292
+ for name, field in missing_fields.items():
2293
+ # Figure out the path of the field in the document. If the name is set on the field, it
2294
+ # is going to be duplicated in the path from missing_fields, so drop it
2295
+ prefix = name.split(".")
2296
+ if field.name:
2297
+ prefix = prefix[:-1]
2298
+
2299
+ # Build the fields and templates for this new mapping
2300
+ sub_properties, sub_templates = build_mapping([field], prefix=prefix, allow_refuse_implicit=False)
2301
+ properties.update(sub_properties)
2302
+ if sub_templates:
2303
+ no_fix.append(name)
2304
+
2305
+ # If we have collected any fields that we can't just blindly add because they might conflict
2306
+ # with existing mappings (for example, the refuse_all_implicit_mappings rule may be in place),
2307
+ # simply raise an exception
2308
+ if no_fix:
2309
+ raise HowlerValueError(
2310
+ f"Can't update database mapping for {self.name}, " f"couldn't safely amend mapping for {no_fix}"
2311
+ )
2312
+
2313
+ # If we got this far, the missing fields have been described in properties, upload them to the
2314
+ # server, and we should be able to move on.
2315
+ for index in self.index_list_full:
2316
+ self.with_retries(self.datastore.client.indices.put_mapping, index=index, properties=properties)
2317
+
2318
+ if self.with_retries(self.datastore.client.indices.exists_template, name=self.name):
2319
+ current_template = self.with_retries(self.datastore.client.indices.get_template, name=self.name)[self.name]
2320
+ self.with_retries(
2321
+ self.datastore.client.indices.put_template,
2322
+ name=self.name,
2323
+ **recursive_update(current_template, {"mappings": {"properties": properties}}),
2324
+ )
2325
+
2326
+ def wipe(self):
2327
+ """This function should completely delete the collection
2328
+
2329
+ NEVER USE THIS!
2330
+
2331
+ :return:
2332
+ """
2333
+ logger.debug("Wipe operation started for collection: %s" % self.name.upper())
2334
+
2335
+ for index in self.index_list:
2336
+ if self.with_retries(self.datastore.client.indices.exists, index=index):
2337
+ self.with_retries(self.datastore.client.indices.delete, index=index)
2338
+
2339
+ if self.with_retries(self.datastore.client.indices.exists_template, name=self.name):
2340
+ self.with_retries(self.datastore.client.indices.delete_template, name=self.name)
2341
+
2342
+ self._ensure_collection()