howler-api 2.13.0.dev329 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of howler-api might be problematic.

Files changed (200)
  1. howler/__init__.py +0 -0
  2. howler/actions/__init__.py +167 -0
  3. howler/actions/add_label.py +111 -0
  4. howler/actions/add_to_bundle.py +159 -0
  5. howler/actions/change_field.py +76 -0
  6. howler/actions/demote.py +160 -0
  7. howler/actions/example_plugin.py +104 -0
  8. howler/actions/prioritization.py +93 -0
  9. howler/actions/promote.py +147 -0
  10. howler/actions/remove_from_bundle.py +133 -0
  11. howler/actions/remove_label.py +111 -0
  12. howler/actions/transition.py +200 -0
  13. howler/api/__init__.py +249 -0
  14. howler/api/base.py +88 -0
  15. howler/api/socket.py +114 -0
  16. howler/api/v1/__init__.py +97 -0
  17. howler/api/v1/action.py +372 -0
  18. howler/api/v1/analytic.py +748 -0
  19. howler/api/v1/auth.py +382 -0
  20. howler/api/v1/borealis.py +101 -0
  21. howler/api/v1/configs.py +55 -0
  22. howler/api/v1/dossier.py +222 -0
  23. howler/api/v1/help.py +28 -0
  24. howler/api/v1/hit.py +1181 -0
  25. howler/api/v1/notebook.py +82 -0
  26. howler/api/v1/overview.py +191 -0
  27. howler/api/v1/search.py +715 -0
  28. howler/api/v1/template.py +206 -0
  29. howler/api/v1/tool.py +183 -0
  30. howler/api/v1/user.py +414 -0
  31. howler/api/v1/utils/__init__.py +0 -0
  32. howler/api/v1/utils/etag.py +84 -0
  33. howler/api/v1/view.py +288 -0
  34. howler/app.py +235 -0
  35. howler/common/README.md +144 -0
  36. howler/common/__init__.py +0 -0
  37. howler/common/classification.py +979 -0
  38. howler/common/classification.yml +107 -0
  39. howler/common/exceptions.py +167 -0
  40. howler/common/hexdump.py +48 -0
  41. howler/common/iprange.py +171 -0
  42. howler/common/loader.py +154 -0
  43. howler/common/logging/__init__.py +241 -0
  44. howler/common/logging/audit.py +138 -0
  45. howler/common/logging/format.py +38 -0
  46. howler/common/net.py +79 -0
  47. howler/common/net_static.py +1494 -0
  48. howler/common/random_user.py +316 -0
  49. howler/common/swagger.py +117 -0
  50. howler/config.py +64 -0
  51. howler/cronjobs/__init__.py +29 -0
  52. howler/cronjobs/retention.py +61 -0
  53. howler/cronjobs/rules.py +274 -0
  54. howler/cronjobs/view_cleanup.py +88 -0
  55. howler/datastore/README.md +112 -0
  56. howler/datastore/__init__.py +0 -0
  57. howler/datastore/bulk.py +72 -0
  58. howler/datastore/collection.py +2327 -0
  59. howler/datastore/constants.py +117 -0
  60. howler/datastore/exceptions.py +41 -0
  61. howler/datastore/howler_store.py +105 -0
  62. howler/datastore/migrations/fix_process.py +41 -0
  63. howler/datastore/operations.py +130 -0
  64. howler/datastore/schemas.py +90 -0
  65. howler/datastore/store.py +231 -0
  66. howler/datastore/support/__init__.py +0 -0
  67. howler/datastore/support/build.py +214 -0
  68. howler/datastore/support/schemas.py +90 -0
  69. howler/datastore/types.py +22 -0
  70. howler/error.py +91 -0
  71. howler/external/__init__.py +0 -0
  72. howler/external/generate_mitre.py +96 -0
  73. howler/external/generate_sigma_rules.py +31 -0
  74. howler/external/generate_tlds.py +47 -0
  75. howler/external/reindex_data.py +46 -0
  76. howler/external/wipe_databases.py +58 -0
  77. howler/gunicorn_config.py +25 -0
  78. howler/healthz.py +47 -0
  79. howler/helper/__init__.py +0 -0
  80. howler/helper/azure.py +50 -0
  81. howler/helper/discover.py +59 -0
  82. howler/helper/hit.py +236 -0
  83. howler/helper/oauth.py +247 -0
  84. howler/helper/search.py +92 -0
  85. howler/helper/workflow.py +110 -0
  86. howler/helper/ws.py +378 -0
  87. howler/odm/README.md +102 -0
  88. howler/odm/__init__.py +1 -0
  89. howler/odm/base.py +1504 -0
  90. howler/odm/charter.txt +146 -0
  91. howler/odm/helper.py +416 -0
  92. howler/odm/howler_enum.py +25 -0
  93. howler/odm/models/__init__.py +0 -0
  94. howler/odm/models/action.py +33 -0
  95. howler/odm/models/analytic.py +90 -0
  96. howler/odm/models/assemblyline.py +48 -0
  97. howler/odm/models/aws.py +23 -0
  98. howler/odm/models/azure.py +16 -0
  99. howler/odm/models/cbs.py +44 -0
  100. howler/odm/models/config.py +558 -0
  101. howler/odm/models/dossier.py +33 -0
  102. howler/odm/models/ecs/__init__.py +0 -0
  103. howler/odm/models/ecs/agent.py +17 -0
  104. howler/odm/models/ecs/autonomous_system.py +16 -0
  105. howler/odm/models/ecs/client.py +149 -0
  106. howler/odm/models/ecs/cloud.py +141 -0
  107. howler/odm/models/ecs/code_signature.py +27 -0
  108. howler/odm/models/ecs/container.py +32 -0
  109. howler/odm/models/ecs/dns.py +62 -0
  110. howler/odm/models/ecs/egress.py +10 -0
  111. howler/odm/models/ecs/elf.py +74 -0
  112. howler/odm/models/ecs/email.py +122 -0
  113. howler/odm/models/ecs/error.py +14 -0
  114. howler/odm/models/ecs/event.py +140 -0
  115. howler/odm/models/ecs/faas.py +24 -0
  116. howler/odm/models/ecs/file.py +84 -0
  117. howler/odm/models/ecs/geo.py +30 -0
  118. howler/odm/models/ecs/group.py +18 -0
  119. howler/odm/models/ecs/hash.py +16 -0
  120. howler/odm/models/ecs/host.py +17 -0
  121. howler/odm/models/ecs/http.py +37 -0
  122. howler/odm/models/ecs/ingress.py +12 -0
  123. howler/odm/models/ecs/interface.py +21 -0
  124. howler/odm/models/ecs/network.py +30 -0
  125. howler/odm/models/ecs/observer.py +45 -0
  126. howler/odm/models/ecs/organization.py +12 -0
  127. howler/odm/models/ecs/os.py +21 -0
  128. howler/odm/models/ecs/pe.py +17 -0
  129. howler/odm/models/ecs/process.py +216 -0
  130. howler/odm/models/ecs/registry.py +26 -0
  131. howler/odm/models/ecs/related.py +45 -0
  132. howler/odm/models/ecs/rule.py +51 -0
  133. howler/odm/models/ecs/server.py +24 -0
  134. howler/odm/models/ecs/threat.py +247 -0
  135. howler/odm/models/ecs/tls.py +58 -0
  136. howler/odm/models/ecs/url.py +51 -0
  137. howler/odm/models/ecs/user.py +57 -0
  138. howler/odm/models/ecs/user_agent.py +20 -0
  139. howler/odm/models/ecs/vulnerability.py +41 -0
  140. howler/odm/models/gcp.py +16 -0
  141. howler/odm/models/hit.py +356 -0
  142. howler/odm/models/howler_data.py +328 -0
  143. howler/odm/models/lead.py +33 -0
  144. howler/odm/models/localized_label.py +13 -0
  145. howler/odm/models/overview.py +16 -0
  146. howler/odm/models/pivot.py +40 -0
  147. howler/odm/models/template.py +24 -0
  148. howler/odm/models/user.py +83 -0
  149. howler/odm/models/view.py +34 -0
  150. howler/odm/random_data.py +888 -0
  151. howler/odm/randomizer.py +606 -0
  152. howler/patched.py +5 -0
  153. howler/plugins/__init__.py +25 -0
  154. howler/plugins/config.py +123 -0
  155. howler/remote/__init__.py +0 -0
  156. howler/remote/datatypes/README.md +355 -0
  157. howler/remote/datatypes/__init__.py +98 -0
  158. howler/remote/datatypes/counters.py +63 -0
  159. howler/remote/datatypes/events.py +66 -0
  160. howler/remote/datatypes/hash.py +206 -0
  161. howler/remote/datatypes/lock.py +42 -0
  162. howler/remote/datatypes/queues/__init__.py +0 -0
  163. howler/remote/datatypes/queues/comms.py +59 -0
  164. howler/remote/datatypes/queues/multi.py +32 -0
  165. howler/remote/datatypes/queues/named.py +93 -0
  166. howler/remote/datatypes/queues/priority.py +215 -0
  167. howler/remote/datatypes/set.py +118 -0
  168. howler/remote/datatypes/user_quota_tracker.py +54 -0
  169. howler/security/__init__.py +253 -0
  170. howler/security/socket.py +108 -0
  171. howler/security/utils.py +185 -0
  172. howler/services/__init__.py +0 -0
  173. howler/services/action_service.py +111 -0
  174. howler/services/analytic_service.py +128 -0
  175. howler/services/auth_service.py +323 -0
  176. howler/services/config_service.py +128 -0
  177. howler/services/dossier_service.py +252 -0
  178. howler/services/event_service.py +93 -0
  179. howler/services/hit_service.py +893 -0
  180. howler/services/jwt_service.py +158 -0
  181. howler/services/lucene_service.py +286 -0
  182. howler/services/notebook_service.py +119 -0
  183. howler/services/overview_service.py +44 -0
  184. howler/services/template_service.py +45 -0
  185. howler/services/user_service.py +330 -0
  186. howler/utils/__init__.py +0 -0
  187. howler/utils/annotations.py +28 -0
  188. howler/utils/chunk.py +38 -0
  189. howler/utils/dict_utils.py +200 -0
  190. howler/utils/isotime.py +17 -0
  191. howler/utils/list_utils.py +11 -0
  192. howler/utils/lucene.py +77 -0
  193. howler/utils/path.py +27 -0
  194. howler/utils/socket_utils.py +61 -0
  195. howler/utils/str_utils.py +256 -0
  196. howler/utils/uid.py +47 -0
  197. howler_api-2.13.0.dev329.dist-info/METADATA +71 -0
  198. howler_api-2.13.0.dev329.dist-info/RECORD +200 -0
  199. howler_api-2.13.0.dev329.dist-info/WHEEL +4 -0
  200. howler_api-2.13.0.dev329.dist-info/entry_points.txt +8 -0
howler/datastore/collection.py
@@ -0,0 +1,2327 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+ import sys
7
+ import time
8
+ import typing
9
+ import warnings
10
+ from copy import deepcopy
11
+ from datetime import datetime
12
+ from os import environ
13
+ from random import random
14
+ from typing import Any, Dict, Generic, Optional, TypeVar, Union
15
+
16
+ import elasticsearch
17
+ from datemath import dm
18
+ from datemath.helpers import DateMathException
19
+
20
+ from howler import odm
21
+ from howler.common.exceptions import HowlerRuntimeError, HowlerValueError, NonRecoverableError
22
+ from howler.common.loader import APP_NAME
23
+ from howler.common.logging.format import HWL_DATE_FORMAT, HWL_LOG_FORMAT
24
+ from howler.datastore.constants import BACK_MAPPING, TYPE_MAPPING
25
+ from howler.datastore.exceptions import (
26
+ DataStoreException,
27
+ HowlerScanError,
28
+ MultiKeyError,
29
+ SearchException,
30
+ SearchRetryException,
31
+ VersionConflictException,
32
+ )
33
+ from howler.datastore.support.build import build_mapping
34
+ from howler.datastore.support.schemas import (
35
+ default_dynamic_strings,
36
+ default_dynamic_templates,
37
+ default_index,
38
+ default_mapping,
39
+ )
40
+ from howler.odm.base import (
41
+ BANNED_FIELDS,
42
+ IP,
43
+ ClassificationObject,
44
+ Enum,
45
+ Integer,
46
+ Keyword,
47
+ List,
48
+ Mapping,
49
+ Model,
50
+ ValidatedKeyword,
51
+ _Field,
52
+ )
53
+ from howler.utils.dict_utils import prune, recursive_update
54
+
55
+ if typing.TYPE_CHECKING:
56
+ from .store import ESStore
57
+
58
+
59
+ TRANSPORT_TIMEOUT = int(environ.get("HWL_DATASTORE_TRANSPORT_TIMEOUT", "10"))
60
+
61
+ logger = logging.getLogger("howler.api.datastore")
62
+ logger.setLevel(logging.INFO)
63
+ console = logging.StreamHandler()
64
+ console.setLevel(logging.INFO)
65
+ console.setFormatter(logging.Formatter(HWL_LOG_FORMAT, HWL_DATE_FORMAT))
66
+ logger.addHandler(console)
67
+
68
+ ModelType = TypeVar("ModelType", bound=Model)
69
+ write_block_settings = {"index.blocks.write": True}
70
+ write_unblock_settings = {"index.blocks.write": None}
71
+
72
+ # A token value to represent a document not existing. It's a string to match the
73
+ # type used for version values. Any string will do as long as it never matches
74
+ # a real version string.
75
+ CREATE_TOKEN = "create" # noqa: S105
76
+
77
+
78
+ def _strip_lists(model, data):
79
+ """Elasticsearch returns everything as lists, regardless of whether
80
+ we want the field to be multi-valued or not. This method uses the model's
81
+ knowledge of what should or should not have multiple values to fix the data.
82
+ """
83
+ fields = model.fields()
84
+ out = {}
85
+ for key, value in odm.flat_to_nested(data).items():
86
+ doc_type = fields.get(key, fields.get("", model))
87
+ # TODO: While we strip lists we don't want to know that the field is optional but we want to know what
88
+ # type of optional field that is. The following two lines of code change the doc_type to the
89
+ # child_type of the field. (Should model.fields() actually do that for us instead?)
90
+ if isinstance(doc_type, odm.Optional):
91
+ doc_type = doc_type.child_type
92
+
93
+ if isinstance(doc_type, odm.List):
94
+ out[key] = value
95
+ elif isinstance(doc_type, odm.Compound) or isinstance(doc_type, odm.Mapping):
96
+ out[key] = _strip_lists(doc_type.child_type, value)
97
+ elif isinstance(value, list):
98
+ out[key] = value[0]
99
+ else:
100
+ out[key] = value
101
+ return out
102
+
103
+
104
+ def sort_str(sort_dicts):
105
+ if sort_dicts is None:
106
+ return sort_dicts
107
+
108
+ sort_list = [f"{key}:{val}" for d in sort_dicts for key, val in d.items()]
109
+ return ",".join(sort_list)
110
+
111
+
112
+ def parse_sort(sort, ret_list=True):
113
+ """This function tries to do two things at once:
114
+ - convert AL sort syntax to elastic,
115
+ - convert any sorts on the key _id to _id_
116
+ """
117
+ if sort is None:
118
+ return sort
119
+
120
+ if isinstance(sort, list):
121
+ return [parse_sort(row, ret_list=False) for row in sort]
122
+ elif isinstance(sort, dict):
123
+ return {("id" if key == "_id" else key): value for key, value in sort.items()}
124
+
125
+ parts = sort.split(" ")
126
+ if len(parts) == 1:
127
+ if parts == "_id":
128
+ if ret_list:
129
+ return ["id"]
130
+ return "id"
131
+ if ret_list:
132
+ return [parts[0]]
133
+ return parts[0]
134
+ elif len(parts) == 2:
135
+ if parts[1] not in ["asc", "desc"]:
136
+ raise SearchException("Unknown sort parameter " + sort)
137
+ if parts[0] == "_id":
138
+ if ret_list:
139
+ return [{"id": parts[1]}]
140
+ return {"id": parts[1]}
141
+ if ret_list:
142
+ return [{parts[0]: parts[1]}]
143
+ return {parts[0]: parts[1]}
144
+ raise SearchException("Unknown sort parameter " + sort)
145
+
146
+
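For reference, a small usage sketch of the two helpers above (not part of the package source; the field name timestamp is an arbitrary example). parse_sort turns the "field direction" sort syntax into the structures Elasticsearch expects, remapping _id to the stored id field, and sort_str flattens that structure into the comma-separated form used for delete-by-query calls.

from howler.datastore.collection import parse_sort, sort_str

# "field direction" strings become a list of {field: direction} dicts
assert parse_sort("timestamp desc") == [{"timestamp": "desc"}]

# sorts on _id are rewritten to target the stored "id" field
assert parse_sort("_id asc") == [{"id": "asc"}]

# sort_str flattens the parsed structure into "field:direction" pairs
assert sort_str([{"timestamp": "desc"}, {"id": "asc"}]) == "timestamp:desc,id:asc"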
147
+ class ESCollection(Generic[ModelType]):
148
+ DEFAULT_OFFSET = 0
149
+ DEFAULT_ROW_SIZE = 25
150
+ DEFAULT_SEARCH_FIELD = "__text__"
151
+ DEFAULT_SORT = [{"_id": "asc"}]
152
+ FIELD_SANITIZER = re.compile("^[a-z][a-z0-9_\\-.]+$")
153
+ MAX_GROUP_LIMIT = 10
154
+ MAX_FACET_LIMIT = 100
155
+ MAX_RETRY_BACKOFF = 10
156
+ MAX_SEARCH_ROWS = 500
157
+ RETRY_NORMAL = 1
158
+ RETRY_NONE = 0
159
+ RETRY_INFINITY = -1
160
+ SCROLL_TIMEOUT = "5m"
161
+ UPDATE_SET = "SET"
162
+ UPDATE_INC = "INC"
163
+ UPDATE_DEC = "DEC"
164
+ UPDATE_MAX = "MAX"
165
+ UPDATE_MIN = "MIN"
166
+ UPDATE_APPEND = "APPEND"
167
+ UPDATE_APPEND_IF_MISSING = "APPEND_IF_MISSING"
168
+ UPDATE_REMOVE = "REMOVE"
169
+ UPDATE_DELETE = "DELETE"
170
+ UPDATE_OPERATIONS = [
171
+ UPDATE_APPEND,
172
+ UPDATE_APPEND_IF_MISSING,
173
+ UPDATE_DEC,
174
+ UPDATE_INC,
175
+ UPDATE_MAX,
176
+ UPDATE_MIN,
177
+ UPDATE_REMOVE,
178
+ UPDATE_SET,
179
+ UPDATE_DELETE,
180
+ ]
181
+ DEFAULT_SEARCH_VALUES: dict[str, typing.Any] = {
182
+ "timeout": None,
183
+ "field_list": None,
184
+ "facet_active": False,
185
+ "facet_mincount": 1,
186
+ "facet_fields": [],
187
+ "stats_active": False,
188
+ "stats_fields": [],
189
+ "field_script": None,
190
+ "filters": [],
191
+ "group_active": False,
192
+ "group_field": None,
193
+ "group_sort": None,
194
+ "group_limit": 1,
195
+ "histogram_active": False,
196
+ "histogram_field": None,
197
+ "histogram_type": None,
198
+ "histogram_gap": None,
199
+ "histogram_mincount": 1,
200
+ "histogram_start": None,
201
+ "histogram_end": None,
202
+ "start": 0,
203
+ "rows": DEFAULT_ROW_SIZE,
204
+ "query": "*",
205
+ "sort": DEFAULT_SORT,
206
+ "df": None,
207
+ "script_fields": [],
208
+ }
209
+ IGNORE_ENSURE_COLLECTION = False
210
+
211
+ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, max_attempts=10):
212
+ self.replicas = int(
213
+ environ.get(
214
+ f"ELASTIC_{name.upper()}_REPLICAS",
215
+ environ.get("ELASTIC_DEFAULT_REPLICAS", 0),
216
+ )
217
+ )
218
+ self.shards = int(environ.get(f"ELASTIC_{name.upper()}_SHARDS", environ.get("ELASTIC_DEFAULT_SHARDS", 1)))
219
+ self._index_list: list[str] = []
220
+
221
+ self.datastore = datastore
222
+ self.name = f"{APP_NAME}-{name}"
223
+ self.index_name = f"{self.name}_hot"
224
+ self.model_class = model_class
225
+ self.validate = validate
226
+ self.max_attempts = max_attempts
227
+
228
+ if not ESCollection.IGNORE_ENSURE_COLLECTION:
229
+ self._ensure_collection()
230
+ elif "pytest" not in sys.modules:
231
+ logger.warning("Skipping ensure collection! This is dangerous. Waiting five seconds before continuing.")
232
+ time.sleep(5)
233
+
234
+ self.stored_fields = {}
235
+ if model_class:
236
+ for name, field in model_class.flat_fields().items():
237
+ if field.store:
238
+ self.stored_fields[name] = field
239
+
240
+ @property
241
+ def index_list_full(self):
242
+ if not self._index_list:
243
+ self._index_list = list(self.with_retries(self.datastore.client.indices.get, index=f"{self.name}-*").keys())
244
+
245
+ return [self.index_name] + sorted(self._index_list, reverse=True)
246
+
247
+ @property
248
+ def index_list(self):
249
+ """This property contains the list of valid indexes for the current collection.
250
+
251
+ :return: list of valid indexes for this collection
252
+ """
253
+ return [self.index_name]
254
+
255
+ def scan_with_retry(
256
+ self,
257
+ query,
258
+ sort=None,
259
+ source=None,
260
+ index=None,
261
+ scroll="5m",
262
+ size=1000,
263
+ request_timeout=None,
264
+ ):
265
+ if index is None:
266
+ index = self.index_name
267
+
268
+ # initial search
269
+ resp = self.with_retries(
270
+ self.datastore.client.search,
271
+ index=index,
272
+ query=query,
273
+ scroll=scroll,
274
+ size=size,
275
+ request_timeout=request_timeout,
276
+ sort=sort,
277
+ _source=source,
278
+ )
279
+ scroll_id = resp.get("_scroll_id")
280
+
281
+ try:
282
+ while scroll_id and resp["hits"]["hits"]:
283
+ for hit in resp["hits"]["hits"]:
284
+ yield hit
285
+
286
+ # Default to 0 if the value isn't included in the response
287
+ shards_successful = resp["_shards"].get("successful", 0)
288
+ shards_skipped = resp["_shards"].get("skipped", 0)
289
+ shards_total = resp["_shards"].get("total", 0)
290
+
291
+ # check if we have any errors
292
+ if (shards_successful + shards_skipped) < shards_total:
293
+ shards_message = (
294
+ f"{scroll_id}: Scroll request has only succeeded on {shards_successful} "
295
+ f"(+{shards_skipped} skipped) shards out of {shards_total}."
296
+ )
297
+ raise HowlerScanError(shards_message)
298
+ resp = self.with_retries(self.datastore.client.scroll, scroll_id=scroll_id, scroll=scroll)
299
+ scroll_id = resp.get("_scroll_id")
300
+
301
+ finally:
302
+ if scroll_id:
303
+ resp = self.with_retries(
304
+ self.datastore.client.clear_scroll,
305
+ scroll_id=[scroll_id],
306
+ ignore=(404,),
307
+ )
308
+ if not resp.get("succeeded", False):
309
+ logger.warning(
310
+ f"Could not clear scroll ID {scroll_id}, there is potential "
311
+ "memory leak in you Elastic cluster..."
312
+ )
313
+
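As a usage sketch (illustrative only; the collection variable and the query are assumptions, not taken from this file), scan_with_retry yields raw Elasticsearch hits from a scroll and clears the scroll context once the generator is exhausted:

# `collection` is assumed to be an ESCollection instance obtained from the datastore
for hit in collection.scan_with_retry(query={"match_all": {}}, size=500):
    doc = hit["_source"]  # raw document body, before model normalization
    print(hit["_id"], len(doc))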
314
+ def with_retries(self, func, *args, raise_conflicts=False, **kwargs):
315
+ """This function performs the passed function with the given args and kwargs and reconnect if it fails
316
+
317
+ :return: return the output of the function passed
318
+ """
319
+ retries = 0
320
+ updated = 0
321
+ deleted = 0
322
+
323
+ while True:
324
+ if retries >= self.max_attempts:
325
+ raise HowlerRuntimeError(f"Maximum of {self.max_attempts} retries reached. Aborting ES connection")
326
+
327
+ try:
328
+ ret_val = func(*args, **kwargs)
329
+
330
+ if retries:
331
+ logger.info("Reconnected to elasticsearch!")
332
+
333
+ if updated:
334
+ ret_val["updated"] += updated
335
+
336
+ if deleted:
337
+ ret_val["deleted"] += deleted
338
+
339
+ return ret_val
340
+ except elasticsearch.exceptions.NotFoundError as e:
341
+ if "index_not_found_exception" in str(e):
342
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
343
+ logger.debug("The index does not exist. Trying to recreate it...")
344
+ self._ensure_collection()
345
+ self.datastore.connection_reset()
346
+ retries += 1
347
+ else:
348
+ raise
349
+
350
+ except elasticsearch.exceptions.ConflictError as ce:
351
+ if raise_conflicts:
352
+ # De-sync potential threads trying to write to the index
353
+ time.sleep(random() * 0.1) # noqa: S311
354
+ raise VersionConflictException(str(ce))
355
+ updated += ce.info.get("updated", 0)
356
+ deleted += ce.info.get("deleted", 0)
357
+
358
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
359
+ self.datastore.connection_reset()
360
+ retries += 1
361
+
362
+ except elasticsearch.exceptions.ConnectionTimeout:
363
+ logger.warning(
364
+ f"Elasticsearch connection timeout, server(s): "
365
+ f"{' | '.join(self.datastore.get_hosts(safe=True))}"
366
+ f", retrying {func.__name__}..."
367
+ )
368
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
369
+ self.datastore.connection_reset()
370
+ retries += 1
371
+
372
+ except (
373
+ SearchRetryException,
374
+ elasticsearch.exceptions.ConnectionError,
375
+ elasticsearch.exceptions.AuthenticationException,
376
+ ) as e:
377
+ if not isinstance(e, SearchRetryException):
378
+ logger.warning(
379
+ f"No connection to Elasticsearch server(s): "
380
+ f"{' | '.join(self.datastore.get_hosts(safe=True))}"
381
+ f", because [{e}] retrying {func.__name__}..."
382
+ )
383
+
384
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
385
+ self.datastore.connection_reset()
386
+ retries += 1
387
+
388
+ except elasticsearch.exceptions.TransportError as e:
389
+ err_code, msg, cause = e.args
390
+ if err_code == 503 or err_code == "503":
391
+ logger.warning(f"Looks like index {self.name} is not ready yet, retrying...")
392
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
393
+ self.datastore.connection_reset()
394
+ retries += 1
395
+ elif err_code == 429 or err_code == "429":
396
+ logger.warning(
397
+ "Elasticsearch is too busy to perform the requested " f"task on index {self.name}, retrying..."
398
+ )
399
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
400
+ self.datastore.connection_reset()
401
+ retries += 1
402
+ elif err_code == 403 or err_code == "403":
403
+ logger.warning(
404
+ "Elasticsearch cluster is preventing writing operations " f"on index {self.name}, retrying..."
405
+ )
406
+ time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
407
+ self.datastore.connection_reset()
408
+ retries += 1
409
+
410
+ else:
411
+ raise
412
+
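A brief sketch of how with_retries is used throughout this class (illustrative, not from the package): every Elasticsearch client call is funnelled through it so that transient connection errors, 429/503 responses and missing indices are retried with a capped backoff, up to max_attempts.

# `collection` is assumed to be an ESCollection instance.
# Equivalent to what commit() does internally: refresh the hot index with retry handling.
collection.with_retries(
    collection.datastore.client.indices.refresh,
    index=collection.index_name,
)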
413
+ def _get_task_results(self, task):
414
+ # This function is only used to wait for an asynchronous task to finish in a graceful manner without
415
+ # timing out the elastic client. You can create an async task for long-running operations like:
416
+ # - update_by_query
417
+ # - delete_by_query
418
+ # - reindex ...
419
+ res = None
420
+ while res is None:
421
+ try:
422
+ res = self.with_retries(
423
+ self.datastore.client.tasks.get,
424
+ task_id=task["task"],
425
+ wait_for_completion=True,
426
+ timeout="10s",
427
+ )
428
+ except elasticsearch.exceptions.TransportError as e:
429
+ err_code, msg, _ = e.args
430
+ if (err_code == 500 or err_code == "500") and msg in [
431
+ "timeout_exception",
432
+ "receive_timeout_transport_exception",
433
+ ]:
434
+ pass
435
+ else:
436
+ raise
437
+
438
+ return res.get("response", res["task"]["status"])
439
+
440
+ def _get_current_alias(self, index: str) -> typing.Optional[str]:
441
+ if self.with_retries(self.datastore.client.indices.exists_alias, name=index):
442
+ return next(
443
+ iter(self.with_retries(self.datastore.client.indices.get_alias, index=index)),
444
+ None,
445
+ )
446
+
447
+ return None
448
+
449
+ def _wait_for_status(self, index, min_status="yellow"):
450
+ status_ok = False
451
+ while not status_ok:
452
+ try:
453
+ res = self.datastore.client.cluster.health(index=index, timeout="5s", wait_for_status=min_status)
454
+ status_ok = not res["timed_out"]
455
+ except elasticsearch.exceptions.TransportError as e:
456
+ err_code, _, _ = e.args
457
+ if err_code == 408 or err_code == "408":
458
+ logger.warning(f"Waiting for index {index} to get to status {min_status}...")
459
+ else:
460
+ raise
461
+
462
+ def _safe_index_copy(self, copy_function, src, target, settings=None, min_status="yellow"):
463
+ ret = copy_function(index=src, target=target, settings=settings, request_timeout=60)
464
+ if not ret["acknowledged"]:
465
+ raise DataStoreException(f"Failed to create index {target} from {src}.")
466
+
467
+ self._wait_for_status(target, min_status=min_status)
468
+
469
+ def _delete_async(self, index, query, max_docs=None, sort=None):
470
+ deleted = 0
471
+ while True:
472
+ task = self.with_retries(
473
+ self.datastore.client.delete_by_query,
474
+ index=index,
475
+ query=query,
476
+ wait_for_completion=False,
477
+ conflicts="proceed",
478
+ sort=sort,
479
+ max_docs=max_docs,
480
+ )
481
+ res = self._get_task_results(task)
482
+
483
+ if res["version_conflicts"] == 0:
484
+ res["deleted"] += deleted
485
+ return res
486
+ else:
487
+ deleted += res["deleted"]
488
+
489
+ def _update_async(self, index, script, query, max_docs=None):
490
+ updated = 0
491
+ while True:
492
+ task = self.with_retries(
493
+ self.datastore.client.update_by_query,
494
+ index=index,
495
+ script=script,
496
+ query=query,
497
+ wait_for_completion=False,
498
+ conflicts="proceed",
499
+ max_docs=max_docs,
500
+ )
501
+ res = self._get_task_results(task)
502
+
503
+ if res["version_conflicts"] == 0:
504
+ res["updated"] += updated
505
+ return res
506
+ else:
507
+ updated += res["updated"]
508
+
509
+ def commit(self):
510
+ """This function should be overloaded to perform a commit of the index data of all the different hosts
511
+ specified in self.datastore.hosts.
512
+
513
+ :return: Should return True if the commit was successful on all hosts
514
+ """
515
+ self.with_retries(self.datastore.client.indices.refresh, index=self.index_name)
516
+ self.with_retries(self.datastore.client.indices.clear_cache, index=self.index_name)
517
+ return True
518
+
519
+ def fix_replicas(self):
520
+ """This function should be overloaded to fix the replica configuration of the index of all the different hosts
521
+ specified in self.datastore.hosts.
522
+
523
+ :return: Should return True if the fix was successful on all hosts
524
+ """
525
+ replicas = self._get_index_settings()["index"]["number_of_replicas"]
526
+ settings = {"number_of_replicas": replicas}
527
+ return self.with_retries(self.datastore.client.indices.put_settings, index=self.index_name, settings=settings)[
528
+ "acknowledged"
529
+ ]
530
+
531
+ def fix_shards(self):
532
+ """This function should be overloaded to fix the shard configuration of the index of all the different hosts
533
+ specified in self.datastore.hosts.
534
+
535
+ :return: Should return True if the fix was successful on all hosts
536
+ """
537
+ settings = self._get_index_settings()
538
+ clone_settings = {"index.number_of_replicas": 0}
539
+ clone_finish_settings = None
540
+ clone_setup_settings = None
541
+ method = None
542
+ target_node = ""
543
+ temp_name = f"{self.name}__fix_shards"
544
+
545
+ indexes_settings = self.with_retries(self.datastore.client.indices.get_settings)
546
+ current_settings = indexes_settings.get(self._get_current_alias(self.name), None)
547
+ if not current_settings:
548
+ raise DataStoreException(
549
+ "Could not get current index settings. Something is wrong and requires manual intervention..."
550
+ )
551
+
552
+ cur_replicas = int(current_settings["settings"]["index"]["number_of_replicas"])
553
+ cur_shards = int(current_settings["settings"]["index"]["number_of_shards"])
554
+ target_shards = int(settings["index"]["number_of_shards"])
555
+ clone_finish_settings = {
556
+ "index.number_of_replicas": cur_replicas,
557
+ "index.routing.allocation.require._name": None,
558
+ }
559
+
560
+ if cur_shards > target_shards:
561
+ logger.info(
562
+ f"Current shards ({cur_shards}) is bigger then target shards ({target_shards}), "
563
+ "we will be shrinking the index."
564
+ )
565
+ if cur_shards % target_shards != 0:
566
+ logger.info("The target shards is not a factor of the current shards, aborting...")
567
+ return
568
+ else:
569
+ target_node = self.with_retries(self.datastore.client.cat.nodes, format="json")[0]["name"]
570
+ clone_setup_settings = {
571
+ "index.number_of_replicas": 0,
572
+ "index.routing.allocation.require._name": target_node,
573
+ }
574
+ method = self.datastore.client.indices.shrink
575
+ elif cur_shards < target_shards:
576
+ logger.info(
577
+ f"Current shards ({cur_shards}) is smaller then target shards ({target_shards}), "
578
+ "we will be splitting the index."
579
+ )
580
+ if target_shards % cur_shards != 0:
581
+ logger.warning("The current shards is not a factor of the target shards, aborting...")
582
+ return
583
+ else:
584
+ method = self.datastore.client.indices.split
585
+ else:
586
+ logger.info(
587
+ f"Current shards ({cur_shards}) is equal to the target shards ({target_shards}), "
588
+ "only house keeping operations will be performed."
589
+ )
590
+
591
+ if method:
592
+ # Before we do anything, we should make sure the source index is in a good state
593
+ logger.info(f"Waiting for {self.name.upper()} status to be GREEN.")
594
+ self._wait_for_status(self.name, min_status="green")
595
+
596
+ # Block all indexes to be written to
597
+ logger.info("Set a datastore wide write block on Elastic.")
598
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_block_settings)
599
+
600
+ # Clone it onto a temporary index
601
+ if not self.with_retries(self.datastore.client.indices.exists, index=temp_name):
602
+ # if there are specific settings to be applied to the index, apply them
603
+ if clone_setup_settings:
604
+ logger.info(f"Rellocating index to node {target_node.upper()}.")
605
+ self.with_retries(
606
+ self.datastore.client.indices.put_settings,
607
+ index=self.index_name,
608
+ settings=clone_setup_settings,
609
+ )
610
+
611
+ # Make sure no shards are relocating
612
+ while self.datastore.client.cluster.health(index=self.index_name)["relocating_shards"] != 0:
613
+ time.sleep(1)
614
+
615
+ # Make a clone of the current index
616
+ logger.info(f"Cloning {self.index_name.upper()} into {temp_name.upper()}.")
617
+ self._safe_index_copy(
618
+ self.datastore.client.indices.clone,
619
+ self.index_name,
620
+ temp_name,
621
+ settings=clone_settings,
622
+ min_status="green",
623
+ )
624
+
625
+ # Make 100% sure temporary index is ready
626
+ logger.info(f"Waiting for {temp_name.upper()} status to be GREEN.")
627
+ self._wait_for_status(temp_name, "green")
628
+
629
+ # Make sure temporary index is the alias if not already
630
+ if self._get_current_alias(self.name) != temp_name:
631
+ logger.info(
632
+ f"Make {temp_name.upper()} the current alias for {self.name.upper()} "
633
+ f"and delete {self.index_name.upper()}."
634
+ )
635
+ # Make the hot index the temporary index while deleting the original index
636
+ alias_actions = [
637
+ {"add": {"index": temp_name, "alias": self.name}},
638
+ {"remove_index": {"index": self.index_name}},
639
+ ]
640
+ self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
641
+
642
+ # Make sure the original index is deleted
643
+ if self.with_retries(self.datastore.client.indices.exists, index=self.index_name):
644
+ logger.info(f"Delete extra {self.index_name.upper()} index.")
645
+ self.with_retries(self.datastore.client.indices.delete, index=self.index_name)
646
+
647
+ # Shrink/split the temporary index into the original index
648
+ logger.info(f"Perform shard fix operation from {temp_name.upper()} to {self.index_name.upper()}.")
649
+ self._safe_index_copy(method, temp_name, self.index_name, settings=settings)
650
+
651
+ # Make the original index the new alias
652
+ logger.info(
653
+ f"Make {self.index_name.upper()} the current alias for {self.name.upper()} "
654
+ f"and delete {temp_name.upper()}."
655
+ )
656
+ alias_actions = [
657
+ {"add": {"index": self.index_name, "alias": self.name}},
658
+ {"remove_index": {"index": temp_name}},
659
+ ]
660
+ self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
661
+
662
+ # Restore writes
663
+ logger.debug("Restore datastore wide write operation on Elastic.")
664
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_unblock_settings)
665
+
666
+ # Restore normal routing and replicas
667
+ logger.debug(f"Restore original routing table for {self.name.upper()}.")
668
+ self.with_retries(
669
+ self.datastore.client.indices.put_settings,
670
+ index=self.name,
671
+ settings=clone_finish_settings,
672
+ )
673
+
674
+ def reindex(self):
675
+ """This function should be overloaded to perform a reindex of all the data of the different hosts
676
+ specified in self.datastore.hosts.
677
+
678
+ :return: Should return True if the reindex was successful on all hosts
679
+ """
680
+ for index in self.index_list:
681
+ new_name = f"{index}__reindex"
682
+ index_data = None
683
+ if self.with_retries(self.datastore.client.indices.exists, index=index) and not self.with_retries(
684
+ self.datastore.client.indices.exists, index=new_name
685
+ ):
686
+ # Get information about the index to reindex
687
+ index_data = self.with_retries(self.datastore.client.indices.get, index=index)[index]
688
+
689
+ # Create reindex target
690
+ self.with_retries(
691
+ self.datastore.client.indices.create,
692
+ index=new_name,
693
+ mappings=self._get_index_mappings(),
694
+ settings=self._get_index_settings(),
695
+ )
696
+
697
+ # For all aliases related to the index, add a new alias to the reindex index
698
+ for alias, alias_data in index_data["aliases"].items():
699
+ # Make the reindex index the new write index if the original index was
700
+ if alias_data.get("is_write_index", True):
701
+ alias_actions = [
702
+ {
703
+ "add": {
704
+ "index": new_name,
705
+ "alias": alias,
706
+ "is_write_index": True,
707
+ }
708
+ },
709
+ {
710
+ "add": {
711
+ "index": index,
712
+ "alias": alias,
713
+ "is_write_index": False,
714
+ }
715
+ },
716
+ ]
717
+ else:
718
+ alias_actions = [{"add": {"index": new_name, "alias": alias}}]
719
+
720
+ self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
721
+
722
+ # Reindex data into target
723
+ r_task = self.with_retries(
724
+ self.datastore.client.reindex,
725
+ source={"index": index},
726
+ dest={"index": new_name},
727
+ wait_for_completion=False,
728
+ )
729
+ self._get_task_results(r_task)
730
+
731
+ if self.with_retries(self.datastore.client.indices.exists, index=new_name):
732
+ if index_data is None:
733
+ index_data = self.with_retries(self.datastore.client.indices.get, index=index)[index]
734
+
735
+ # Commit reindexed data
736
+ self.with_retries(self.datastore.client.indices.refresh, index=new_name)
737
+ self.with_retries(self.datastore.client.indices.clear_cache, index=new_name)
738
+
739
+ # Delete old index
740
+ if self.with_retries(self.datastore.client.indices.exists, index=index):
741
+ self.with_retries(self.datastore.client.indices.delete, index=index)
742
+
743
+ # Block write to the index
744
+ self.with_retries(
745
+ self.datastore.client.indices.put_settings,
746
+ settings=write_block_settings,
747
+ )
748
+
749
+ # Rename reindexed index
750
+ try:
751
+ self._safe_index_copy(
752
+ self.datastore.client.indices.clone,
753
+ new_name,
754
+ index,
755
+ settings=self._get_index_settings(),
756
+ )
757
+
758
+ # Restore original aliases for the index
759
+ for alias, alias_data in index_data["aliases"].items():
760
+ # Make the reindex index the new write index if the original index was
761
+ if alias_data.get("is_write_index", True):
762
+ alias_actions = [
763
+ {
764
+ "add": {
765
+ "index": index,
766
+ "alias": alias,
767
+ "is_write_index": True,
768
+ }
769
+ },
770
+ {"remove_index": {"index": new_name}},
771
+ ]
772
+ self.with_retries(
773
+ self.datastore.client.indices.update_aliases,
774
+ actions=alias_actions,
775
+ )
776
+
777
+ # Delete the reindex target if it still exists
778
+ if self.with_retries(self.datastore.client.indices.exists, index=new_name):
779
+ self.with_retries(self.datastore.client.indices.delete, index=new_name)
780
+ finally:
781
+ # Unblock write to the index
782
+ self.with_retries(
783
+ self.datastore.client.indices.put_settings,
784
+ settings=write_unblock_settings,
785
+ )
786
+
787
+ return True
788
+
789
+ def multiget(self, key_list, as_dictionary=True, as_obj=True, error_on_missing=True):
790
+ """Get a list of documents from the datastore and make sure they are normalized using
791
+ the model class
792
+
793
+ :param error_on_missing: Should it raise a key error when keys are missing
794
+ :param as_dictionary: Return a dictionary of items or a list
795
+ :param as_obj: Return objects or not
796
+ :param key_list: list of keys of documents to get
797
+ :return: list of instances of the model class
798
+ """
799
+
800
+ def add_to_output(data_output, data_id):
801
+ if "__non_doc_raw__" in data_output:
802
+ if as_dictionary:
803
+ out[data_id] = data_output["__non_doc_raw__"]
804
+ else:
805
+ out.append(data_output["__non_doc_raw__"]) # type: ignore
806
+ else:
807
+ data_output.pop("id", None)
808
+ if as_dictionary:
809
+ out[data_id] = self.normalize(data_output, as_obj=as_obj)
810
+ else:
811
+ out.append(self.normalize(data_output, as_obj=as_obj)) # type: ignore
812
+
813
+ out: Union[dict[str, Any], list[Any]]
814
+ if as_dictionary:
815
+ out = {}
816
+ else:
817
+ out = []
818
+
819
+ if key_list:
820
+ data = self.with_retries(self.datastore.client.mget, ids=key_list, index=self.name)
821
+
822
+ for row in data.get("docs", []):
823
+ if "found" in row and not row["found"]:
824
+ continue
825
+
826
+ try:
827
+ key_list.remove(row["_id"])
828
+ add_to_output(row["_source"], row["_id"])
829
+ except ValueError:
830
+ logger.exception(f'MGet returned multiple documents for id: {row["_id"]}')
831
+
832
+ if key_list and error_on_missing:
833
+ raise MultiKeyError(key_list, out)
834
+
835
+ return out
836
+
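A short multiget sketch (illustrative; the document IDs are placeholders): with as_dictionary=True the result maps each found ID to its normalized document, and MultiKeyError is raised for missing IDs unless error_on_missing=False.

from howler.datastore.exceptions import MultiKeyError

# `collection` is assumed to be an ESCollection instance; the IDs are placeholders
try:
    docs = collection.multiget(["id-1", "id-2"], as_dictionary=True)
except MultiKeyError:
    # some IDs were not found; pass error_on_missing=False to get partial results instead
    docs = collection.multiget(["id-1", "id-2"], as_dictionary=True, error_on_missing=False)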
837
+ def normalize(self, data, as_obj=True) -> Union[ModelType, dict[str, Any], None]:
838
+ """Normalize the data using the model class
839
+
840
+ :param as_obj: Return an object instead of a dictionary
841
+ :param data: data to normalize
842
+ :return: instance of the model class
843
+ """
844
+ if as_obj and data is not None and self.model_class and not isinstance(data, self.model_class):
845
+ return self.model_class(data)
846
+
847
+ if isinstance(data, dict):
848
+ data = {k: v for k, v in data.items() if k not in BANNED_FIELDS}
849
+
850
+ return data
851
+
852
+ def exists(self, key):
853
+ """Check if a document exists in the datastore.
854
+
855
+ :param key: key of the document to get from the datastore
856
+ :return: true/false depending if the document exists or not
857
+ """
858
+ return self.with_retries(self.datastore.client.exists, index=self.name, id=key, _source=False)
859
+
860
+ def _get(self, key, retries, version=False):
861
+ """Versioned get-save for atomic update has two paths:
862
+ 1. Document doesn't exist at all. Create token will be returned for version.
863
+ This way only the first query to try and create the document will succeed.
864
+ 2. Document exists in hot. A version string with the info needed to do a versioned save is returned.
865
+
866
+ The create token is needed to differentiate between "I'm saving a new
867
+ document non-atomic (version=None)" and "I'm saving a new document
868
+ atomically (version=CREATE_TOKEN)".
869
+ """
870
+
871
+ def normalize_output(data_output):
872
+ if "__non_doc_raw__" in data_output:
873
+ return data_output["__non_doc_raw__"]
874
+ data_output.pop("id", None)
875
+ return data_output
876
+
877
+ if retries is None:
878
+ retries = self.RETRY_NONE
879
+
880
+ done = False
881
+ while not done:
882
+ try:
883
+ doc = self.with_retries(self.datastore.client.get, index=self.name, id=key)
884
+ if version:
885
+ return (
886
+ normalize_output(doc["_source"]),
887
+ f"{doc['_seq_no']}---{doc['_primary_term']}",
888
+ )
889
+ return normalize_output(doc["_source"])
890
+ except elasticsearch.exceptions.NotFoundError:
891
+ pass
892
+
893
+ if retries > 0:
894
+ time.sleep(0.05)
895
+ retries -= 1
896
+ elif retries < 0:
897
+ time.sleep(0.05)
898
+ else:
899
+ done = True
900
+
901
+ if version:
902
+ return None, CREATE_TOKEN
903
+
904
+ return None
905
+
906
+ def get(self, key, as_obj=True, version=False):
907
+ """Get a document from the datastore, retry a few times if not found and normalize the
908
+ document with the model provided with the collection.
909
+
910
+ This is the normal way to get data of the system.
911
+
912
+ :param archive_access: Temporary sets access value to archive during this call
913
+ :param as_obj: Should the data be returned as an ODM object
914
+ :param key: key of the document to get from the datastore
915
+ :param version: should the version number be returned by the call
916
+ :return: an instance of the model class loaded with the document data
917
+ """
918
+ data = self._get(key, self.RETRY_NORMAL, version=version)
919
+ if version:
920
+ data, version = data
921
+ return self.normalize(data, as_obj=as_obj), version
922
+ return self.normalize(data, as_obj=as_obj)
923
+
924
+ def get_if_exists(self, key, as_obj=True, version=False):
925
+ """Get a document from the datastore but do not retry if not found.
926
+
927
+ Use this mostly in caching scenarios, because an eventually consistent database may
928
+ report a document as missing even though it exists.
929
+
930
+ :param archive_access: Temporary sets access value to archive during this call
931
+ :param as_obj: Should the data be returned as an ODM object
932
+ :param key: key of the document to get from the datastore
933
+ :param version: should the version number be returned by the call
934
+ :return: an instance of the model class loaded with the document data
935
+ """
936
+ data = self._get(key, self.RETRY_NONE, version=version)
937
+ if version:
938
+ data, version = data
939
+ return self.normalize(data, as_obj=as_obj), version
940
+ return self.normalize(data, as_obj=as_obj)
941
+
942
+ def require(
943
+ self, key, as_obj=True, version=False
944
+ ) -> Union[
945
+ tuple[Optional[Union[dict[str, Any], ModelType]], str],
946
+ Optional[Union[dict[str, Any], ModelType]],
947
+ ]:
948
+ """Get a document from the datastore and retry forever because we know for sure
949
+ that this document should exist. If it does not right now, this will wait for the
950
+ document to show up in the datastore.
951
+
952
+ :param archive_access: Temporary sets access value to archive during this call
953
+ :param as_obj: Should the data be returned as an ODM object
954
+ :param key: key of the document to get from the datastore
955
+ :param version: should the version number be returned by the call
956
+ :return: an instance of the model class loaded with the document data
957
+ """
958
+ data = self._get(key, self.RETRY_INFINITY, version=version)
959
+ if version:
960
+ data, version = data
961
+ return self.normalize(data, as_obj=as_obj), version
962
+ return self.normalize(data, as_obj=as_obj)
963
+
964
+ def save(self, key, data, version=None):
965
+ """Save to document to the datastore using the key as its document id.
966
+
967
+ The document data will be normalized before being saved in the datastore.
968
+
969
+ :param key: ID of the document to save
970
+ :param data: raw data or instance of the model class to save as the document
971
+ :param version: version of the document to save over, if the version check fails this will raise an exception
972
+ :return: True if the document was saved properly
973
+ """
974
+ if " " in key:
975
+ raise DataStoreException("You are not allowed to use spaces in datastore keys.")
976
+
977
+ data = self.normalize(data)
978
+
979
+ if self.model_class:
980
+ saved_data = data.as_primitives(hidden_fields=True)
981
+ else:
982
+ if not isinstance(data, dict):
983
+ saved_data = {"__non_doc_raw__": data}
984
+ else:
985
+ saved_data = deepcopy(data)
986
+
987
+ saved_data["id"] = key
988
+ operation = "index"
989
+ seq_no = None
990
+ primary_term = None
991
+
992
+ if version == CREATE_TOKEN:
993
+ operation = "create"
994
+ elif version:
995
+ seq_no, primary_term = version.split("---")
996
+
997
+ try:
998
+ self.with_retries(
999
+ self.datastore.client.index,
1000
+ index=self.name,
1001
+ id=key,
1002
+ document=json.dumps(saved_data),
1003
+ op_type=operation,
1004
+ if_seq_no=seq_no,
1005
+ if_primary_term=primary_term,
1006
+ raise_conflicts=True,
1007
+ )
1008
+ except elasticsearch.BadRequestError as e:
1009
+ raise NonRecoverableError(
1010
+ f"When saving document {key} to elasticsearch, an exception occurred:\n{repr(e)}\n\n"
1011
+ f"Data: {json.dumps(saved_data)}"
1012
+ ) from e
1013
+
1014
+ return True
1015
+
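To make the version-token behaviour concrete, here is a hedged sketch of an optimistic-concurrency write built on get_if_exists() and save() (illustrative only; the key and document body are placeholders and assume a collection whose model accepts them). The version returned by a get is either CREATE_TOKEN for a missing document or a "seq_no---primary_term" pair, and save() raises VersionConflictException when another writer wins the race.

from howler.datastore.exceptions import VersionConflictException

# `collection` is assumed to be an ESCollection instance; "some-key" is a placeholder
data, version = collection.get_if_exists("some-key", as_obj=False, version=True)
if data is None:
    data = {"counter": 0}  # hypothetical document body

data["counter"] = data.get("counter", 0) + 1

try:
    collection.save("some-key", data, version=version)
except VersionConflictException:
    # the document changed (or was created) since it was read; re-read and retry
    pass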
1016
+ def delete(self, key):
1017
+ """This function should delete the underlying document referenced by the key.
1018
+ It should return true if the document was in fact properly deleted.
1019
+
1020
+ :param key: id of the document to delete
1021
+ :return: True if the delete was successful
1022
+ """
1023
+ try:
1024
+ info = self.with_retries(self.datastore.client.delete, id=key, index=self.name)
1025
+ return info["result"] == "deleted"
1026
+ except elasticsearch.NotFoundError:
1027
+ return False
1028
+
1029
+ def delete_by_query(self, query, workers=20, sort=None, max_docs=None):
1030
+ """This function should delete the underlying documents referenced by the query.
1031
+ It should return true if the documents were in fact properly deleted.
1032
+
1033
+ :param query: Query of the documents to delete
1034
+ :param workers: Number of workers used for deletion if basic concurrency delete is used
1035
+ :return: True if the delete was successful
1036
+ """
1037
+ query = {"bool": {"must": {"query_string": {"query": query}}}}
1038
+ info = self._delete_async(self.name, query=query, sort=sort_str(parse_sort(sort)), max_docs=max_docs)
1039
+ return info.get("deleted", 0) != 0
1040
+
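A small delete_by_query sketch (illustrative; the Lucene query and field names are assumptions, not taken from this file): the query string is wrapped in a query_string clause and executed as an asynchronous delete-by-query with version conflicts retried.

# `collection` is assumed to be an ESCollection instance; the field names are hypothetical
deleted_any = collection.delete_by_query(
    'howler.status:"resolved" AND event.created:[* TO now-90d]',
    max_docs=10_000,
)
print("at least one document deleted:", deleted_any)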
1041
+ def _create_scripts_from_operations(self, operations):
1042
+ op_sources = []
1043
+ op_params = {}
1044
+ val_id = 0
1045
+ for op, doc_key, value in operations:
1046
+ if op == self.UPDATE_SET:
1047
+ op_sources.append(f"ctx._source.{doc_key} = params.value{val_id}")
1048
+ op_params[f"value{val_id}"] = value
1049
+ elif op == self.UPDATE_DELETE:
1050
+ op_sources.append(f"ctx._source.{doc_key}.remove(params.value{val_id})")
1051
+ op_params[f"value{val_id}"] = value
1052
+ elif op == self.UPDATE_APPEND:
1053
+ op_sources.append(f"ctx._source.{doc_key}.add(params.value{val_id})")
1054
+ op_params[f"value{val_id}"] = value
1055
+ elif op == self.UPDATE_APPEND_IF_MISSING:
1056
+ script = (
1057
+ f"if (ctx._source.{doc_key}.indexOf(params.value{val_id}) == -1) "
1058
+ f"{{ctx._source.{doc_key}.add(params.value{val_id})}}"
1059
+ )
1060
+ op_sources.append(script)
1061
+ op_params[f"value{val_id}"] = value
1062
+ elif op == self.UPDATE_REMOVE:
1063
+ script = (
1064
+ f"if (ctx._source.{doc_key}.indexOf(params.value{val_id}) != -1) "
1065
+ f"{{ctx._source.{doc_key}.remove(ctx._source.{doc_key}.indexOf(params.value{val_id}))}}"
1066
+ )
1067
+ op_sources.append(script)
1068
+ op_params[f"value{val_id}"] = value
1069
+ elif op == self.UPDATE_INC:
1070
+ op_sources.append(f"ctx._source.{doc_key} += params.value{val_id}")
1071
+ op_params[f"value{val_id}"] = value
1072
+ elif op == self.UPDATE_DEC:
1073
+ op_sources.append(f"ctx._source.{doc_key} -= params.value{val_id}")
1074
+ op_params[f"value{val_id}"] = value
1075
+ elif op == self.UPDATE_MAX:
1076
+ script = (
1077
+ f"if (ctx._source.{doc_key} == null || "
1078
+ f"ctx._source.{doc_key}.compareTo(params.value{val_id}) < 0) "
1079
+ f"{{ctx._source.{doc_key} = params.value{val_id}}}"
1080
+ )
1081
+ op_sources.append(script)
1082
+ op_params[f"value{val_id}"] = value
1083
+ elif op == self.UPDATE_MIN:
1084
+ script = (
1085
+ f"if (ctx._source.{doc_key} == null || "
1086
+ f"ctx._source.{doc_key}.compareTo(params.value{val_id}) > 0) "
1087
+ f"{{ctx._source.{doc_key} = params.value{val_id}}}"
1088
+ )
1089
+ op_sources.append(script)
1090
+ op_params[f"value{val_id}"] = value
1091
+
1092
+ val_id += 1
1093
+
1094
+ joined_sources = """;\n""".join(op_sources)
1095
+
1096
+ return {
1097
+ "lang": "painless",
1098
+ "source": joined_sources.replace("};\n", "}\n"),
1099
+ "params": op_params,
1100
+ }
1101
+
1102
+ def _validate_operations(self, operations):
1103
+ """Validate the different operations received for a partial update
1104
+
1105
+ TODO: When the field is of type Mapping, the validation/check only works for depth 1. A full recursive
1106
+ solution is needed to support multi-depth cases.
1107
+
1108
+ :param operations: list of operation tuples
1109
+ :raises: DataStoreException if an operation is not valid
1110
+ """
1111
+ if self.model_class:
1112
+ fields = self.model_class.flat_fields(show_compound=True)
1113
+ if "classification in fields":
1114
+ fields.update(
1115
+ {
1116
+ "__access_lvl__": Integer(),
1117
+ "__access_req__": List(Keyword()),
1118
+ "__access_grp1__": List(Keyword()),
1119
+ "__access_grp2__": List(Keyword()),
1120
+ }
1121
+ )
1122
+ else:
1123
+ fields = None
1124
+
1125
+ ret_ops = []
1126
+ for op, doc_key, value in operations:
1127
+ if op not in self.UPDATE_OPERATIONS:
1128
+ raise DataStoreException(f"Not a valid Update Operation: {op}")
1129
+
1130
+ if fields is not None:
1131
+ prev_key = None
1132
+ if doc_key not in fields:
1133
+ if "." in doc_key:
1134
+ prev_key = doc_key[: doc_key.rindex(".")]
1135
+ if prev_key in fields and not isinstance(fields[prev_key], Mapping):
1136
+ raise DataStoreException(f"Invalid field for model: {prev_key}")
1137
+ else:
1138
+ raise DataStoreException(f"Invalid field for model: {doc_key}")
1139
+
1140
+ if prev_key:
1141
+ field = fields[prev_key].child_type
1142
+ else:
1143
+ field = fields[doc_key]
1144
+
1145
+ if op in [
1146
+ self.UPDATE_APPEND,
1147
+ self.UPDATE_APPEND_IF_MISSING,
1148
+ self.UPDATE_REMOVE,
1149
+ ]:
1150
+ try:
1151
+ value = field.check(value)
1152
+ except (ValueError, TypeError, AttributeError):
1153
+ raise DataStoreException(f"Invalid value for field {doc_key}: {value}")
1154
+
1155
+ elif op in [self.UPDATE_SET, self.UPDATE_DEC, self.UPDATE_INC]:
1156
+ try:
1157
+ value = field.check(value)
1158
+ except (ValueError, TypeError):
1159
+ raise DataStoreException(f"Invalid value for field {doc_key}: {value}")
1160
+
1161
+ if isinstance(value, Model):
1162
+ value = value.as_primitives()
1163
+ elif isinstance(value, datetime):
1164
+ value = value.isoformat()
1165
+ elif isinstance(value, ClassificationObject):
1166
+ value = str(value)
1167
+
1168
+ ret_ops.append((op, doc_key, value))
1169
+
1170
+ return ret_ops
1171
+
1172
+ def update(self, key, operations, version=None):
1173
+ """This function performs an atomic update on some fields from the
1174
+ underlying document referenced by the id using a list of operations.
1175
+
1176
+ Operations supported by the update function are the following:
1177
+ INTEGER ONLY: Increase and decrease value
1178
+ LISTS ONLY: Append and remove items
1179
+ ALL TYPES: Set value
1180
+
1181
+ :param key: ID of the document to modify
1182
+ :param operations: List of tuples of operations, e.g. [(SET, document_key, operation_value), ...]
1183
+ :return: True if the update was successful
1184
+ """
1185
+ operations = self._validate_operations(operations)
1186
+ script = self._create_scripts_from_operations(operations)
1187
+ seq_no = None
1188
+ primary_term = None
1189
+ if version:
1190
+ seq_no, primary_term = version.split("---")
1191
+
1192
+ try:
1193
+ res = self.with_retries(
1194
+ self.datastore.client.update,
1195
+ index=self.name,
1196
+ id=key,
1197
+ script=script,
1198
+ if_seq_no=seq_no,
1199
+ if_primary_term=primary_term,
1200
+ raise_conflicts=seq_no and primary_term,
1201
+ )
1202
+ return (
1203
+ res["result"] == "updated",
1204
+ f"{res['_seq_no']}---{res['_primary_term']}",
1205
+ )
1206
+ except elasticsearch.NotFoundError as e:
1207
+ logger.warning("Update - elasticsearch.NotFoundError: %s %s", e.message, e.info)
1208
+ except elasticsearch.BadRequestError as e:
1209
+ logger.warning("Update - elasticsearch.BadRequestError: %s %s", e.message, e.info)
1210
+ return False
1211
+ except VersionConflictException as e:
1212
+ logger.warning("Update - elasticsearch.ConflictError: %s", e.message)
1213
+ raise
1214
+ except Exception as e:
1215
+ logger.warning("Update - Generic Exception: %s", str(e))
1216
+ return False
1217
+
1218
+ return False
1219
+
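To illustrate the operation tuples accepted by update() (a sketch only; the document ID and field names are hypothetical and must exist in the collection's model): every tuple is validated against the model and compiled into a single painless script.

from howler.datastore.collection import ESCollection

# `collection` is assumed to be an ESCollection instance; ID and field names are hypothetical
operations = [
    (ESCollection.UPDATE_SET, "howler.status", "resolved"),
    (ESCollection.UPDATE_APPEND_IF_MISSING, "howler.labels.generic", "triaged"),
    (ESCollection.UPDATE_INC, "howler.reliability", 1),
]
result = collection.update("doc-id", operations)
# On success, update() returns a (True, "<seq_no>---<primary_term>") tuple; on failure it returns False.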
1220
+ def update_by_query(self, query, operations, filters=None, access_control=None, max_docs=None):
1221
+ """This function performs an atomic update on some fields from the
1222
+ underlying documents matching the query and the filters using a list of operations.
1223
+
1224
+ Operations supported by the update function are the following:
1225
+ INTEGER ONLY: Increase and decrease value
1226
+ LISTS ONLY: Append and remove items
1227
+ ALL TYPES: Set value
1228
+
1229
+ :param access_control:
1230
+ :param filters: Filter queries to reduce the data
1231
+ :param query: Query to find the matching documents
1232
+ :param operations: List of tuples of operations, e.g. [(SET, document_key, operation_value), ...]
1233
+ :return: The number of updated documents, or False if the update failed
1234
+ """
1235
+ operations = self._validate_operations(operations)
1236
+ if filters is None:
1237
+ filters = []
1238
+
1239
+ if access_control:
1240
+ filters.append(access_control)
1241
+
1242
+ script = self._create_scripts_from_operations(operations)
1243
+
1244
+ try:
1245
+ res = self._update_async(
1246
+ self.name,
1247
+ script=script,
1248
+ query={
1249
+ "bool": {
1250
+ "must": {"query_string": {"query": query}},
1251
+ "filter": [{"query_string": {"query": ff}} for ff in filters],
1252
+ }
1253
+ },
1254
+ max_docs=max_docs,
1255
+ )
1256
+ except Exception:
1257
+ return False
1258
+
1259
+ return res["updated"]
1260
+
1261
+ def _format_output(self, result, fields=None, as_obj=True):
1262
+ # Getting search document data
1263
+ extra_fields = result.get("fields", {})
1264
+ source_data = result.pop("_source", None)
1265
+
1266
+ if source_data is not None:
1267
+ for f in BANNED_FIELDS:
1268
+ source_data.pop(f, None)
1269
+
1270
+ item_id = result["_id"]
1271
+
1272
+ if self.model_class:
1273
+ if not fields:
1274
+ fields = list(self.stored_fields.keys())
1275
+ fields.append("id")
1276
+ elif isinstance(fields, str):
1277
+ fields = fields.split(",")
1278
+
1279
+ extra_fields = _strip_lists(self.model_class, extra_fields)
1280
+ if as_obj:
1281
+ if "_index" in fields and "_index" in result:
1282
+ extra_fields["_index"] = result["_index"]
1283
+ if "*" in fields:
1284
+ fields = None
1285
+ return self.model_class(source_data, mask=fields, docid=item_id, extra_fields=extra_fields)
1286
+ else:
1287
+ source_data = recursive_update(source_data, extra_fields, allow_recursion=False)
1288
+ if "id" in fields:
1289
+ source_data["id"] = item_id
1290
+ if "_index" in fields and "_index" in result:
1291
+ source_data["_index"] = result["_index"]
1292
+
1293
+ if isinstance(fields, str):
1294
+ fields = [fields]
1295
+
1296
+ if fields is None or "*" in fields or "id" in fields:
1297
+ source_data["id"] = [item_id]
1298
+
1299
+ if fields is None or "*" in fields:
1300
+ return source_data
1301
+
1302
+ return prune(source_data, fields, self.stored_fields, mapping_class=Mapping)
1303
+
1304
+ def _search(self, args=None, deep_paging_id=None, use_archive=False, track_total_hits=None):
1305
+ if args is None:
1306
+ args = []
1307
+
1308
+ params = {}
1309
+ if deep_paging_id is not None:
1310
+ params = {"scroll": self.SCROLL_TIMEOUT}
1311
+ elif track_total_hits:
1312
+ params["track_total_hits"] = track_total_hits
1313
+
1314
+ parsed_values = deepcopy(self.DEFAULT_SEARCH_VALUES)
1315
+
1316
+ # TODO: we should validate values for max rows, group length, history length...
1317
+ for key, value in args:
1318
+ if key not in parsed_values:
1319
+ all_args = "; ".join("%s=%s" % (field_name, field_value) for field_name, field_value in args)
1320
+ raise HowlerValueError("Unknown query argument: %s %s of [%s]" % (key, value, all_args))
1321
+
1322
+ parsed_values[key] = value
1323
+
1324
+ # This is our minimal query, the following sections will fill it out
1325
+ # with whatever extra options the search has been given.
1326
+ query_body = {
1327
+ "query": {
1328
+ "bool": {
1329
+ "must": {"query_string": {"query": parsed_values["query"]}},
1330
+ "filter": [{"query_string": {"query": ff}} for ff in parsed_values["filters"]],
1331
+ }
1332
+ },
1333
+ "from_": parsed_values["start"],
1334
+ "size": parsed_values["rows"],
1335
+ "sort": parse_sort(parsed_values["sort"]),
1336
+ "_source": parsed_values["field_list"] or list(self.stored_fields.keys()),
1337
+ }
1338
+
1339
+ if parsed_values["script_fields"]:
1340
+ fields = {}
1341
+ for f_name, f_script in parsed_values["script_fields"]:
1342
+ fields[f_name] = {"script": {"lang": "painless", "source": f_script}}
1343
+ query_body["script_fields"] = fields
1344
+
1345
+ if parsed_values["df"]:
1346
+ query_body["query"]["bool"]["must"]["query_string"]["default_field"] = parsed_values["df"]
1347
+
1348
+ # Time limit for the query
1349
+ if parsed_values["timeout"]:
1350
+ query_body["timeout"] = parsed_values["timeout"]
1351
+
1352
+ # Add a histogram aggregation
1353
+ if parsed_values["histogram_active"]:
1354
+ query_body.setdefault("aggregations", {})
1355
+ if parsed_values["histogram_type"] == "date_histogram":
1356
+ interval_type = "fixed_interval"
1357
+ else:
1358
+ interval_type = "interval"
1359
+ query_body["aggregations"]["histogram"] = {
1360
+ parsed_values["histogram_type"]: {
1361
+ "field": parsed_values["histogram_field"],
1362
+ interval_type: parsed_values["histogram_gap"],
1363
+ "min_doc_count": parsed_values["histogram_mincount"],
1364
+ "extended_bounds": {
1365
+ "min": parsed_values["histogram_start"],
1366
+ "max": parsed_values["histogram_end"],
1367
+ },
1368
+ }
1369
+ }
1370
+
1371
+ # Add a facet aggregation
1372
+ if parsed_values["facet_active"]:
1373
+ query_body.setdefault("aggregations", {})
1374
+ for field in parsed_values["facet_fields"]:
1375
+ field_script = parsed_values["field_script"]
1376
+ if field_script:
1377
+ facet_body = {
1378
+ "script": {"source": field_script},
1379
+ "min_doc_count": parsed_values["facet_mincount"],
1380
+ }
1381
+ else:
1382
+ facet_body = {
1383
+ "field": field,
1384
+ "min_doc_count": parsed_values["facet_mincount"],
1385
+ "size": parsed_values["rows"],
1386
+ }
1387
+ query_body["aggregations"][field] = {"terms": facet_body}
1388
+
1389
+ # Add a stats aggregation
1390
+ if parsed_values["stats_active"]:
1391
+ query_body.setdefault("aggregations", {})
1392
+ for field in parsed_values["stats_fields"]:
1393
+ field_script = parsed_values["field_script"]
1394
+ if field_script:
1395
+ stats_body = {"script": {"source": field_script}}
1396
+ else:
1397
+ stats_body = {"field": field}
1398
+
1399
+ query_body["aggregations"][f"{field}_stats"] = {"stats": stats_body}
1400
+
1401
+ # Add a group aggregation
1402
+ if parsed_values["group_active"]:
1403
+ query_body["collapse"] = {
1404
+ "field": parsed_values["group_field"],
1405
+ "inner_hits": {
1406
+ "name": "group",
1407
+ "_source": parsed_values["field_list"] or list(self.stored_fields.keys()),
1408
+ "size": parsed_values["group_limit"],
1409
+ "sort": parse_sort(parsed_values["group_sort"]) or [{parsed_values["group_field"]: "asc"}],
1410
+ },
1411
+ }
1412
+
1413
+ try:
1414
+ if deep_paging_id is not None and not deep_paging_id == "*":
1415
+ # Get the next page
1416
+ result = self.with_retries(
1417
+ self.datastore.client.scroll,
1418
+ scroll_id=deep_paging_id,
1419
+ **params,
1420
+ )
1421
+ else:
1422
+ # Run the query
1423
+ result = self.with_retries(
1424
+ self.datastore.client.search,
1425
+ index=self.name,
1426
+ **params,
1427
+ **query_body,
1428
+ )
1429
+
1430
+ return result
1431
+ except (
1432
+ elasticsearch.ConnectionError,
1433
+ elasticsearch.ConnectionTimeout,
1434
+ ) as error:
1435
+ raise SearchRetryException("collection: %s, query: %s, error: %s" % (self.name, query_body, str(error)))
1436
+
1437
+ except (elasticsearch.TransportError, elasticsearch.RequestError) as e:
1438
+ try:
1439
+ err_msg = e.info["error"]["root_cause"][0]["reason"] # type: ignore
1440
+ except (ValueError, KeyError, IndexError):
1441
+ err_msg = str(e)
1442
+
1443
+ raise SearchException(err_msg)
1444
+
1445
+ except Exception as error:
1446
+ raise SearchException("collection: %s, query: %s, error: %s" % (self.name, query_body, str(error)))
1447
+
1448
+ def search(
1449
+ self,
1450
+ query,
1451
+ offset=0,
1452
+ rows=None,
1453
+ sort=None,
1454
+ fl=None,
1455
+ timeout=None,
1456
+ filters=None,
1457
+ access_control=None,
1458
+ deep_paging_id=None,
1459
+ as_obj=True,
1460
+ use_archive=False,
1461
+ track_total_hits=None,
1462
+ script_fields=[],
1463
+ ):
1464
+ """This function should perform a search through the datastore and return a
1465
+ search result object that consists of the following::
1466
+
1467
+ {
1468
+ "offset": 0, # Offset in the search index
1469
+ "rows": 25, # Number of document returned per page
1470
+ "total": 123456, # Total number of documents matching the query
1471
+ "items": [ # List of dictionary where each keys are one of
1472
+ { # the field list parameter specified
1473
+ fl[0]: value,
1474
+ ...
1475
+ fl[x]: value
1476
+ }, ...]
1477
+ }
1478
+
1479
+ :param script_fields: List of (name, script) tuples for fields to be evaluated at runtime
1480
+ :param track_total_hits: Return the total matching document count
1481
+ :param use_archive: Also query the archive
1482
+ :param deep_paging_id: ID of the next page during deep paging searches
1483
+ :param as_obj: Return objects instead of dictionaries
1484
+ :param query: lucene query to search for
1485
+ :param offset: offset at which the results should start (paging)
1486
+ :param rows: number of items that the search function should return
1487
+ :param sort: field to sort the data with
1488
+ :param fl: list of fields to return from the search
1489
+ :param timeout: maximum time of execution
1490
+ :param filters: additional queries to run on the original query to reduce the scope
1491
+ :param access_control: access control parameters to limit the scope of the query
1492
+ :return: a search result object
1493
+ """
1494
+ if offset is None:
1495
+ offset = self.DEFAULT_OFFSET
1496
+
1497
+ if rows is None:
1498
+ rows = self.DEFAULT_ROW_SIZE
1499
+
1500
+ if sort is None:
1501
+ sort = self.DEFAULT_SORT
1502
+
1503
+ if filters is None:
1504
+ filters = []
1505
+ elif isinstance(filters, str):
1506
+ filters = [filters]
1507
+
1508
+ if access_control:
1509
+ filters.append(access_control)
1510
+
1511
+ args = [
1512
+ ("query", query),
1513
+ ("start", offset),
1514
+ ("rows", rows),
1515
+ ("sort", sort),
1516
+ ("df", self.DEFAULT_SEARCH_FIELD),
1517
+ ]
1518
+
1519
+ if fl:
1520
+ field_list = fl.split(",")
1521
+ args.append(("field_list", field_list))
1522
+ else:
1523
+ field_list = None
1524
+
1525
+ if timeout:
1526
+ args.append(("timeout", "%sms" % timeout))
1527
+
1528
+ if filters:
1529
+ args.append(("filters", filters))
1530
+
1531
+ if script_fields:
1532
+ args.append(("script_fields", script_fields))
1533
+
1534
+ result = self._search(
1535
+ args,
1536
+ deep_paging_id=deep_paging_id,
1537
+ use_archive=use_archive,
1538
+ track_total_hits=track_total_hits,
1539
+ )
1540
+
1541
+ ret_data: dict[str, Any] = {
1542
+ "offset": int(offset),
1543
+ "rows": int(rows),
1544
+ "total": int(result["hits"]["total"]["value"]),
1545
+ "items": [self._format_output(doc, field_list, as_obj=as_obj) for doc in result["hits"]["hits"]],
1546
+ }
1547
+
1548
+ new_deep_paging_id = result.get("_scroll_id", None)
1549
+
1550
+ # Check if the scroll is finished and close it
1551
+ if deep_paging_id is not None and new_deep_paging_id is None:
1552
+ self.with_retries(
1553
+ self.datastore.client.clear_scroll,
1554
+ scroll_id=[deep_paging_id],
1555
+ ignore=(404,),
1556
+ )
1557
+
1558
+ # Check if we can tell from inspection that we have finished the scroll
1559
+ if new_deep_paging_id is not None and len(ret_data["items"]) < ret_data["rows"]:
1560
+ self.with_retries(
1561
+ self.datastore.client.clear_scroll,
1562
+ scroll_id=[new_deep_paging_id],
1563
+ ignore=(404,),
1564
+ )
1565
+ new_deep_paging_id = None
1566
+
1567
+ if new_deep_paging_id is not None:
1568
+ ret_data["next_deep_paging_id"] = new_deep_paging_id
1569
+
1570
+ return ret_data
1571
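A minimal usage sketch for search(), assuming a collection instance is available; the handle and every query and field value below are illustrative, not taken from the package:

    # Hypothetical: "collection" is an instance of this class
    results = collection.search(
        "howler.analytic:example",    # lucene query
        offset=0,
        rows=25,
        fl="howler.id,howler.analytic",
        filters=["howler.escalation:alert"],
        as_obj=False,                 # return plain dictionaries
    )
    print(results["total"])           # total number of matching documents
    for item in results["items"]:     # one dict per returned document
        print(item)
    # When deep paging is active, "next_deep_paging_id" is included in the
    # result until the scroll is exhausted.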
+
1572
+ def stream_search(
1573
+ self,
1574
+ query,
1575
+ fl=None,
1576
+ filters=None,
1577
+ access_control=None,
1578
+ item_buffer_size=200,
1579
+ as_obj=True,
1580
+ use_archive=False,
1581
+ ):
1582
+ """This function should perform a search through the datastore and stream
1583
+ all matching results as dictionaries of key/value pairs where each key
1584
+ is one of the fields specified in the field list parameter.
1585
+
1586
+ >>> # noinspection PyUnresolvedReferences
1587
+ >>> {
1588
+ >>> fl[0]: value,
1589
+ >>> ...
1590
+ >>> fl[x]: value
1591
+ >>> }
1592
+
1593
+ :param use_archive: Also query the archive
1594
+ :param as_obj: Return objects instead of dictionaries
1595
+ :param query: lucene query to search for
1596
+ :param fl: list of fields to return from the search
1597
+ :param filters: additional queries to run on the original query to reduce the scope
1598
+ :param access_control: access control parameters to run the query with
1599
+ :param item_buffer_size: number of items to buffer with each search call
1600
+ :return: a generator of dictionary of field list results
1601
+ """
1602
+ if item_buffer_size > 2000 or item_buffer_size < 50:
1603
+ raise SearchException("Variable item_buffer_size must be between 50 and 2000.")
1604
+
1605
+ if filters is None:
1606
+ filters = []
1607
+ elif isinstance(filters, str):
1608
+ filters = [filters]
1609
+
1610
+ if access_control:
1611
+ filters.append(access_control)
1612
+
1613
+ if fl:
1614
+ fl = fl.split(",")
1615
+
1616
+ query_expression = {
1617
+ "bool": {
1618
+ "must": {
1619
+ "query_string": {
1620
+ "query": query,
1621
+ "default_field": self.DEFAULT_SEARCH_FIELD,
1622
+ }
1623
+ },
1624
+ "filter": [{"query_string": {"query": ff}} for ff in filters],
1625
+ }
1626
+ }
1627
+ sort = parse_sort(self.datastore.DEFAULT_SORT)
1628
+ source = fl or list(self.stored_fields.keys())
1629
+
1630
+ for value in self.scan_with_retry(
1631
+ query=query_expression,
1632
+ sort=sort,
1633
+ source=source,
1634
+ index=self.name,
1635
+ size=item_buffer_size,
1636
+ ):
1637
+ # Unpack the results, ensure the id is always set
1638
+ yield self._format_output(value, fl, as_obj=as_obj)
1639
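A sketch of streaming results, under the same assumptions as the earlier examples (the collection handle and field names are illustrative):

    # Stream every matching document as a plain dictionary
    for doc in collection.stream_search(
        "howler.analytic:example",
        fl="howler.id",
        item_buffer_size=200,   # must stay between 50 and 2000
        as_obj=False,
    ):
        print(doc)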
+
1640
+ def raw_eql_search(
1641
+ self,
1642
+ eql_query: str,
1643
+ fl: Optional[str] = None,
1644
+ filters: Optional[Union[list[str], str]] = None,
1645
+ rows: Optional[int] = None,
1646
+ timeout: Optional[int] = None,
1647
+ as_obj=True,
1648
+ ):
1649
+ if filters is None:
1650
+ filters = []
1651
+ elif isinstance(filters, str):
1652
+ filters = [filters]
1653
+
1654
+ parsed_filters = {
1655
+ "bool": {
1656
+ "must": {"query_string": {"query": "*:*"}},
1657
+ "filter": [{"query_string": {"query": ff}} for ff in filters],
1658
+ }
1659
+ }
1660
+
1661
+ if not fl:
1662
+ fl = "howler.id"
1663
+
1664
+ if rows is None:
1665
+ rows = 5
1666
+
1667
+ fields = [{"field": f} for f in fl.split(",")]
1668
+
1669
+ try:
1670
+ result = self.with_retries(
1671
+ self.datastore.client.eql.search,
1672
+ index=self.name,
1673
+ timestamp_field="timestamp",
1674
+ query=eql_query,
1675
+ fields=fields,
1676
+ filter=parsed_filters,
1677
+ size=rows,
1678
+ wait_for_completion_timeout=(f"{timeout}ms" if timeout is not None else None),
1679
+ )
1680
+
1681
+ ret_data: dict[str, Any] = {
1682
+ "rows": int(rows),
1683
+ "total": int(result["hits"]["total"]["value"]),
1684
+ "items": [
1685
+ self._format_output(doc, fl.split(","), as_obj=as_obj) for doc in result["hits"].get("events", [])
1686
+ ],
1687
+ "sequences": [
1688
+ [self._format_output(doc, fl.split(","), as_obj=as_obj) for doc in sequence.get("events", [])]
1689
+ for sequence in result["hits"].get("sequences", [])
1690
+ ],
1691
+ }
1692
+
1693
+ return ret_data
1694
+
1695
+ except (elasticsearch.TransportError, elasticsearch.RequestError) as e:
1696
+ try:
1697
+ err_msg = e.info["error"]["root_cause"][0]["reason"] # type: ignore
1698
+ except (ValueError, KeyError, IndexError):
1699
+ err_msg = str(e)
1700
+
1701
+ raise SearchException(err_msg)
1702
+ except Exception as error:
1703
+ raise SearchException(f"collection: {self.name}, error: {str(error)}")
1704
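A sketch of an EQL call, with an illustrative query and field list (assumptions, not values from the package):

    res = collection.raw_eql_search(
        "any where howler.score > 0",   # EQL query, evaluated against "timestamp"
        fl="howler.id,howler.score",
        rows=10,
        timeout=5000,                   # ms, forwarded as wait_for_completion_timeout
        as_obj=False,
    )
    # "items" holds single events, "sequences" holds lists of events
    # returned by sequence queries.
    print(res["total"], len(res["items"]), len(res["sequences"]))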
+
1705
+ def keys(self, access_control=None):
1706
+ """This function streams the keys of all the documents of this collection.
1707
+
1708
+ :param access_control: access control parameter to limit the scope of the key scan
1709
+ :return: a generator of keys
1710
+ """
1711
+ for item in self.stream_search("id:*", fl="id", access_control=access_control):
1712
+ try:
1713
+ yield item._id
1714
+ except AttributeError:
1715
+ value = item["id"]
1716
+ if isinstance(value, list):
1717
+ for v in value:
1718
+ yield v
1719
+ else:
1720
+ yield value
1721
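A one-line sketch of the key scan; the collection handle and access-control string are illustrative:

    for doc_id in collection.keys(access_control="__access_lvl_1__"):
        print(doc_id)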
+
1722
+ def _validate_steps_count(self, start, end, gap):
1723
+ with warnings.catch_warnings():
1724
+ warnings.simplefilter("ignore")
1725
+
1726
+ gaps_count = None
1727
+ ret_type: Optional[type] = None
1728
+
1729
+ try:
1730
+ start = int(start)
1731
+ end = int(end)
1732
+ gap = int(gap)
1733
+
1734
+ gaps_count = int((end - start) / gap)
1735
+ ret_type = int
1736
+ except ValueError:
1737
+ pass
1738
+
1739
+ if not gaps_count:
1740
+ try:
1741
+ t_gap = gap.strip("+").strip("-")
1742
+
1743
+ parsed_start = dm(self.datastore.to_pydatemath(start)).int_timestamp
1744
+ parsed_end = dm(self.datastore.to_pydatemath(end)).int_timestamp
1745
+ parsed_gap = dm(self.datastore.to_pydatemath(f"+{t_gap}")).int_timestamp - dm("now").int_timestamp
1746
+
1747
+ gaps_count = int((parsed_end - parsed_start) / parsed_gap)
1748
+ ret_type = str
1749
+ except (DateMathException, AttributeError):
1750
+ pass
1751
+
1752
+ if gaps_count is None:
1753
+ raise SearchException(
1754
+ "Could not parse histogram ranges. Either you've mix integer and dates values or you "
1755
+ "have invalid date math values. (start='%s', end='%s', gap='%s')" % (start, end, gap)
1756
+ )
1757
+
1758
+ if gaps_count > self.MAX_FACET_LIMIT:
1759
+ raise SearchException(
1760
+ f"Histograms are limited to a maximum of {self.MAX_FACET_LIMIT} steps. "
1761
+ f"Current settings would generate {gaps_count} steps"
1762
+ )
1763
+ return ret_type
1764
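A worked example of the step-count validation above (values are illustrative):

    # Integer ranges: start=0, end=1000, gap=10 -> (1000 - 0) / 10 = 100 steps, ret_type=int
    # Date-math ranges: start="now-1d", end="now", gap="+1h"
    #   -> 86400 s / 3600 s = 24 steps, ret_type=str
    # Any count above self.MAX_FACET_LIMIT raises SearchException.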
+
1765
+ def count(
1766
+ self,
1767
+ query,
1768
+ access_control=None,
1769
+ ):
1770
+ """This function should perform a count operation through the datastore and return a
1771
+ count result object that consists of the following:
1772
+
1773
+ {
1774
+ "total": 123456, # Total number of documents matching the query
1775
+ }
1776
+
1777
+ :param query: lucene query to search for
1778
+ :param access_control: access control parameters to limit the scope of the query
1779
+ :return: a count result object
1780
+ """
1781
+ result = self.with_retries(self.datastore.client.count, index=self.name, q=query)
1782
+
1783
+ ret_data: dict[str, Any] = {
1784
+ "count": result["count"],
1785
+ }
1786
+
1787
+ return ret_data
1788
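A minimal sketch of count(); the collection handle and query are illustrative:

    total = collection.count("howler.analytic:example")["count"]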
+
1789
+ def histogram(
1790
+ self,
1791
+ field,
1792
+ start,
1793
+ end,
1794
+ gap,
1795
+ query="id:*",
1796
+ mincount=None,
1797
+ filters=None,
1798
+ access_control=None,
1799
+ use_archive=False,
1800
+ ):
1801
+ type_modifier = self._validate_steps_count(start, end, gap)
1802
+ start = type_modifier(start)
1803
+ end = type_modifier(end)
1804
+ gap = type_modifier(gap)
1805
+
1806
+ if mincount is None:
1807
+ mincount = 1
1808
+
1809
+ if filters is None:
1810
+ filters = []
1811
+ elif isinstance(filters, str):
1812
+ filters = [filters]
1813
+ filters.append("{field}:[{min} TO {max}]".format(field=field, min=start, max=end))
1814
+
1815
+ args = [
1816
+ ("query", query),
1817
+ ("histogram_active", True),
1818
+ ("histogram_field", field),
1819
+ (
1820
+ "histogram_type",
1821
+ "date_histogram" if isinstance(gap, str) else "histogram",
1822
+ ),
1823
+ (
1824
+ "histogram_gap",
1825
+ gap.strip("+").strip("-") if isinstance(gap, str) else gap,
1826
+ ),
1827
+ ("histogram_mincount", mincount),
1828
+ ("histogram_start", start),
1829
+ ("histogram_end", end),
1830
+ ]
1831
+
1832
+ if access_control:
1833
+ filters.append(access_control)
1834
+
1835
+ if filters:
1836
+ args.append(("filters", filters))
1837
+
1838
+ result = self._search(args, use_archive=use_archive)
1839
+
1840
+ # Convert the histogram into a dictionary
1841
+ return {
1842
+ type_modifier(row.get("key_as_string", row["key"])): row["doc_count"]
1843
+ for row in result["aggregations"]["histogram"]["buckets"]
1844
+ }
1845
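A sketch of a date histogram over the last day in one-hour buckets; the field name and query are illustrative, and start/end/gap must be either all integers or all date-math strings:

    buckets = collection.histogram(
        "timestamp",
        start="now-1d",
        end="now",
        gap="+1h",
        query="howler.analytic:example",
        mincount=1,
    )
    # Returns {bucket_key: doc_count, ...}, keyed by key_as_string for dates.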
+
1846
+ def facet(
1847
+ self,
1848
+ field,
1849
+ query=None,
1850
+ prefix=None,
1851
+ contains=None,
1852
+ ignore_case=False,
1853
+ sort=None,
1854
+ rows=10,
1855
+ mincount=None,
1856
+ filters=None,
1857
+ access_control=None,
1858
+ use_archive=False,
1859
+ field_script=None,
1860
+ ):
1861
+ if not query:
1862
+ query = "id:*"
1863
+
1864
+ if not mincount:
1865
+ mincount = 1
1866
+
1867
+ if filters is None:
1868
+ filters = []
1869
+ elif isinstance(filters, str):
1870
+ filters = [filters]
1871
+
1872
+ args = [
1873
+ ("query", query),
1874
+ ("facet_active", True),
1875
+ ("facet_fields", [field]),
1876
+ ("facet_mincount", mincount),
1877
+ ("rows", rows),
1878
+ ]
1879
+
1880
+ # TODO: prefix, contains, ignore_case, sort
1881
+
1882
+ if access_control:
1883
+ filters.append(access_control)
1884
+
1885
+ if filters:
1886
+ args.append(("filters", filters))
1887
+
1888
+ if field_script:
1889
+ args.append(("field_script", field_script))
1890
+
1891
+ result = self._search(args, use_archive=use_archive)
1892
+
1893
+ # Convert the facet results into a dictionary
1894
+ return {
1895
+ row.get("key_as_string", row["key"]): row["doc_count"] for row in result["aggregations"][field]["buckets"]
1896
+ }
1897
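A sketch of a facet call returning the top terms of a field and their document counts (field name and query are illustrative):

    counts = collection.facet(
        "howler.status",
        query="howler.analytic:example",
        rows=10,
        mincount=1,
    )
    # e.g. {"open": 40, "resolved": 2}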
+
1898
+ def stats(
1899
+ self,
1900
+ field,
1901
+ query="id:*",
1902
+ filters=None,
1903
+ access_control=None,
1904
+ use_archive=False,
1905
+ field_script=None,
1906
+ ):
1907
+ if filters is None:
1908
+ filters = []
1909
+ elif isinstance(filters, str):
1910
+ filters = [filters]
1911
+
1912
+ args = [
1913
+ ("query", query),
1914
+ ("stats_active", True),
1915
+ ("stats_fields", [field]),
1916
+ ("rows", 0),
1917
+ ]
1918
+
1919
+ if access_control:
1920
+ filters.append(access_control)
1921
+
1922
+ if filters:
1923
+ args.append(("filters", filters))
1924
+
1925
+ if field_script:
1926
+ args.append(("field_script", field_script))
1927
+
1928
+ result = self._search(args, use_archive=use_archive)
1929
+ return result["aggregations"][f"{field}_stats"]
1930
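A sketch of stats(), which surfaces the Elasticsearch stats aggregation for a numeric field (field name and query are illustrative):

    summary = collection.stats("howler.score", query="howler.analytic:example")
    # summary carries count/min/max/avg/sum for the field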
+
1931
+ def grouped_search(
1932
+ self,
1933
+ group_field,
1934
+ query="id:*",
1935
+ offset=0,
1936
+ sort=None,
1937
+ group_sort=None,
1938
+ fl=None,
1939
+ limit=1,
1940
+ rows=None,
1941
+ filters=None,
1942
+ access_control=None,
1943
+ as_obj=True,
1944
+ use_archive=False,
1945
+ track_total_hits=False,
1946
+ ):
1947
+ if rows is None:
1948
+ rows = self.DEFAULT_ROW_SIZE
1949
+
1950
+ if sort is None:
1951
+ sort = self.DEFAULT_SORT
1952
+
1953
+ if group_sort is None:
1954
+ group_sort = self.DEFAULT_SORT
1955
+
1956
+ if filters is None:
1957
+ filters = []
1958
+ elif isinstance(filters, str):
1959
+ filters = [filters]
1960
+
1961
+ args = [
1962
+ ("query", query),
1963
+ ("group_active", True),
1964
+ ("group_field", group_field),
1965
+ ("group_limit", limit),
1966
+ ("group_sort", group_sort),
1967
+ ("start", offset),
1968
+ ("rows", rows),
1969
+ ("sort", sort),
1970
+ ]
1971
+
1972
+ filters.append("%s:*" % group_field)
1973
+
1974
+ if fl:
1975
+ field_list = fl.split(",")
1976
+ args.append(("field_list", field_list))
1977
+ else:
1978
+ field_list = None
1979
+
1980
+ if access_control:
1981
+ filters.append(access_control)
1982
+
1983
+ if filters:
1984
+ args.append(("filters", filters))
1985
+
1986
+ result = self._search(args, use_archive=use_archive, track_total_hits=track_total_hits)
1987
+
1988
+ return {
1989
+ "offset": offset,
1990
+ "rows": rows,
1991
+ "total": int(result["hits"]["total"]["value"]),
1992
+ "items": [
1993
+ {
1994
+ "value": collapsed["fields"][group_field][0],
1995
+ "total": int(collapsed["inner_hits"]["group"]["hits"]["total"]["value"]),
1996
+ "items": [
1997
+ self._format_output(row, field_list, as_obj=as_obj)
1998
+ for row in collapsed["inner_hits"]["group"]["hits"]["hits"]
1999
+ ],
2000
+ }
2001
+ for collapsed in result["hits"]["hits"]
2002
+ ],
2003
+ }
2004
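A sketch of grouped_search(), collapsing hits on a field and keeping a limited number of inner hits per group (all names and values are illustrative):

    grouped = collection.grouped_search(
        "howler.analytic",
        query="howler.id:*",
        limit=1,            # inner hits kept per group
        rows=25,
        fl="howler.id",
        as_obj=False,
    )
    for group in grouped["items"]:
        print(group["value"], group["total"], group["items"])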
+
2005
+ @staticmethod
2006
+ def _get_odm_type(ds_type):
2007
+ try:
2008
+ return BACK_MAPPING[ds_type].__name__.lower()
2009
+ except KeyError:
2010
+ return ds_type.lower()
2011
+
2012
+ def fields(self, skip_mapping_children=False):
2013
+ """
2014
+ This function should return all the fields in the index with their types
2015
+ """
2016
+
2017
+ def flatten_fields(props):
2018
+ out = {}
2019
+ for name, value in props.items():
2020
+ if "properties" in value:
2021
+ for child, cprops in flatten_fields(value["properties"]).items():
2022
+ out[name + "." + child] = cprops
2023
+ elif "type" in value:
2024
+ out[name] = value
2025
+ else:
2026
+ raise HowlerValueError("Unknown field data " + str(props))
2027
+ return out
2028
+
2029
+ data = self.with_retries(self.datastore.client.indices.get, index=self.name)
2030
+ index_name = list(data.keys())[0]
2031
+ properties = flatten_fields(data[index_name]["mappings"].get("properties", {}))
2032
+
2033
+ if self.model_class:
2034
+ model_fields = self.model_class.flat_fields()
2035
+ else:
2036
+ model_fields = {}
2037
+
2038
+ collection_data = {}
2039
+
2040
+ for p_name, p_val in properties.items():
2041
+ if p_name.startswith("_") or "//" in p_name:
2042
+ continue
2043
+ if not self.FIELD_SANITIZER.match(p_name):
2044
+ continue
2045
+ field_model = model_fields.get(p_name, None)
2046
+
2047
+ if "." in p_name:
2048
+ parent_p_name = re.sub(r"^(.+)\..+?$", r"\1", p_name)
2049
+ if parent_p_name in model_fields and isinstance(model_fields.get(parent_p_name), Mapping):
2050
+ if parent_p_name not in collection_data:
2051
+ field_model = model_fields.get(parent_p_name, None)
2052
+ f_type = self._get_odm_type(p_val.get("analyzer", None) or p_val["type"])
2053
+
2054
+ collection_data[parent_p_name] = {
2055
+ "default": self.DEFAULT_SEARCH_FIELD in p_val.get("copy_to", []),
2056
+ "indexed": p_val.get("index", p_val.get("enabled", True)),
2057
+ "list": field_model.multivalued if field_model else False,
2058
+ "stored": field_model.store if field_model else False,
2059
+ "type": f_type,
2060
+ "description": (field_model.description if field_model else ""),
2061
+ "regex": (
2062
+ field_model.child_type.validation_regex.pattern
2063
+ if issubclass(type(field_model.child_type), ValidatedKeyword)
2064
+ or issubclass(type(field_model.child_type), IP)
2065
+ else None
2066
+ ),
2067
+ "values": (
2068
+ list(field_model.child_type.values)
2069
+ if issubclass(type(field_model.child_type), Enum)
2070
+ else None
2071
+ ),
2072
+ "deprecated_description": (field_model.deprecated_description if field_model else ""),
2073
+ }
2074
+
2075
+ if skip_mapping_children:
2076
+ continue
2077
+ else:
2078
+ continue
2079
+
2080
+ f_type = self._get_odm_type(p_val.get("analyzer", None) or p_val["type"])
2081
+ collection_data[p_name] = {
2082
+ "default": self.DEFAULT_SEARCH_FIELD in p_val.get("copy_to", []),
2083
+ "indexed": p_val.get("index", p_val.get("enabled", True)),
2084
+ "list": field_model.multivalued if field_model else False,
2085
+ "stored": field_model.store if field_model else False,
2086
+ "deprecated": field_model.deprecated if field_model else False,
2087
+ "type": f_type,
2088
+ "description": field_model.description if field_model else "",
2089
+ "regex": (
2090
+ field_model.validation_regex.pattern
2091
+ if issubclass(type(field_model), ValidatedKeyword) or issubclass(type(field_model), IP)
2092
+ else None
2093
+ ),
2094
+ "values": list(field_model.values) if issubclass(type(field_model), Enum) else None,
2095
+ "deprecated_description": (field_model.deprecated_description if field_model else ""),
2096
+ }
2097
+
2098
+ collection_data.pop("id", None)
2099
+
2100
+ return collection_data
2101
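A sketch of inspecting the schema returned by fields(); the collection handle is an assumption:

    for name, info in collection.fields().items():
        print(name, info["type"], "indexed" if info["indexed"] else "unindexed")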
+
2102
+ def _ilm_policy_exists(self):
2103
+ try:
2104
+ self.datastore.client.ilm.get_lifecycle(name=f"{self.name}_policy")
2105
+ except elasticsearch.NotFoundError:
2106
+ return False
2107
+ else:
2108
+ return True
2109
+
2110
+ def _delete_ilm_policy(self):
2111
+ try:
2112
+ self.datastore.client.ilm.delete_lifecycle(name=f"{self.name}_policy")
2113
+ except elasticsearch.ApiError:
2114
+ return False
2115
+ else:
2116
+ return True
2117
+
2118
+ def _get_index_settings(self) -> dict:
2119
+ default_stub: dict = deepcopy(default_index)
2120
+ settings: dict = default_stub.pop("settings", {})
2121
+
2122
+ if "index" not in settings:
2123
+ settings["index"] = {}
2124
+ settings["index"]["number_of_shards"] = self.shards
2125
+ settings["index"]["number_of_replicas"] = self.replicas
2126
+
2127
+ if "mapping" not in settings["index"]:
2128
+ settings["index"]["mapping"] = {}
2129
+
2130
+ if "total_fields" not in settings["index"]["mapping"]:
2131
+ settings["index"]["mapping"]["total_fields"] = {}
2132
+
2133
+ limit = len(self.model_class.flat_fields()) + 500 if self.model_class else 1500
2134
+ if limit < 1500:
2135
+ limit = 1500
2136
+ elif limit > 1500:
2137
+ logger.warning("ODM field size is larger than 1500 - set to %s", limit)
2138
+ settings["index"]["mapping"]["total_fields"]["limit"] = limit
2139
+
2140
+ return settings
2141
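A worked example of the total-fields limit computed above (field counts are illustrative):

    # Model with 600 flat fields:  600 + 500 = 1100 -> raised to the 1500 floor
    # Model with 2200 flat fields: 2200 + 500 = 2700 -> kept, and a warning is logged
    # No model class:              limit defaults to 1500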
+
2142
+ def _get_index_mappings(self) -> dict:
2143
+ mappings: dict = deepcopy(default_mapping)
2144
+ if self.model_class:
2145
+ mappings["properties"], mappings["dynamic_templates"] = build_mapping(self.model_class.fields().values())
2146
+ mappings["dynamic_templates"].insert(0, default_dynamic_strings)
2147
+ else:
2148
+ mappings["dynamic_templates"] = deepcopy(default_dynamic_templates)
2149
+
2150
+ if not mappings["dynamic_templates"]:
2151
+ # Setting dynamic to strict prevents documents with fields not in the properties from being added
2152
+ mappings["dynamic"] = "strict"
2153
+
2154
+ mappings["properties"]["id"] = {
2155
+ "store": True,
2156
+ "doc_values": True,
2157
+ "type": "keyword",
2158
+ }
2159
+
2160
+ mappings["properties"]["__text__"] = {
2161
+ "store": False,
2162
+ "type": "text",
2163
+ }
2164
+
2165
+ return mappings
2166
+
2167
+ def __get_possible_fields(self, field):
2168
+ field_types = [field.__name__.lower()]
2169
+ if field.__bases__[0] != _Field:
2170
+ field_types.extend(self.__get_possible_fields(field.__bases__[0]))
2171
+
2172
+ if field_type := TYPE_MAPPING.get(field.__name__, None):
2173
+ field_types.append(field_type)
2174
+
2175
+ return field_types
2176
+
2177
+ def _check_fields(self, model=None):
2178
+ if not self.validate:
2179
+ return
2180
+
2181
+ if model is None:
2182
+ if self.model_class:
2183
+ return self._check_fields(self.model_class)
2184
+ return
2185
+
2186
+ fields = self.fields()
2187
+ model = self.model_class.flat_fields(skip_mappings=True)
2188
+
2189
+ missing = set(model.keys()) - set(fields.keys())
2190
+ if missing:
2191
+ # TODO: Bump mapping limit
2192
+ try:
2193
+ self._add_fields({key: model[key] for key in missing})
2194
+ except elasticsearch.BadRequestError as err:
2195
+ handled = False
2196
+ if err.body and isinstance(err.body, dict) and "error" in err.body and "reason" in err.body["error"]:
2197
+ reason: str = err.body["error"]["reason"]
2198
+ if reason.startswith("Limit of total fields"):
2199
+ current_count = int(re.sub(r".+\[(\d+)].+", r"\1", reason))
2200
+ logger.warning(
2201
+ "Current field cap %s is too low, increasing to %s", current_count, current_count + 500
2202
+ )
2203
+ self.with_retries(
2204
+ self.datastore.client.indices.put_settings,
2205
+ settings={"index.mapping.total_fields.limit": current_count + 500},
2206
+ )
2207
+ self._add_fields({key: model[key] for key in missing})
2208
+ handled = True
2209
+ if not handled:
2210
+ raise
2211
+
2212
+ matching = set(fields.keys()) & set(model.keys())
2213
+ for field_name in matching:
2214
+ if fields[field_name]["indexed"] != model[field_name].index and model[field_name].index:
2215
+ raise HowlerRuntimeError(f"Field {field_name} should be indexed but is not.")
2216
+
2217
+ possible_field_types = self.__get_possible_fields(model[field_name].__class__)
2218
+
2219
+ if fields[field_name]["type"] not in possible_field_types:
2220
+ raise HowlerRuntimeError(
2221
+ f"Field {field_name} didn't have the expected store "
2222
+ f"type. [{fields[field_name]['type']} != "
2223
+ f"{model[field_name].__class__.__name__.lower()}]"
2224
+ )
2225
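A worked example of the field-cap bump above; the Elasticsearch error wording is paraphrased and the numbers are illustrative:

    # BadRequestError reason: "Limit of total fields [1500] ... has been exceeded"
    # -> current_count = 1500, limit raised to 2000 via put_settings,
    #    then the missing fields are added again.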
+
2226
+ def _ensure_collection(self):
2227
+ """This function should test if the collection that you are trying to access does indeed exist
2228
+ and should create it if it does not.
2229
+
2230
+ :return:
2231
+ """
2232
+ # Create HOT index
2233
+ if not self.with_retries(self.datastore.client.indices.exists, index=self.name):
2234
+ logger.debug(f"Index {self.name.upper()} does not exists. Creating it now...")
2235
+ try:
2236
+ self.with_retries(
2237
+ self.datastore.client.indices.create,
2238
+ index=self.index_name,
2239
+ mappings=self._get_index_mappings(),
2240
+ settings=self._get_index_settings(),
2241
+ )
2242
+ except elasticsearch.exceptions.RequestError as e:
2243
+ if "resource_already_exists_exception" not in str(e):
2244
+ raise
2245
+ logger.warning(f"Tried to create an index template that already exists: {self.name.upper()}")
2246
+
2247
+ self.with_retries(
2248
+ self.datastore.client.indices.put_alias,
2249
+ index=self.index_name,
2250
+ name=self.name,
2251
+ )
2252
+ elif not self.with_retries(
2253
+ self.datastore.client.indices.exists, index=self.index_name
2254
+ ) and not self.with_retries(self.datastore.client.indices.exists_alias, name=self.name):
2255
+ # Turn on write block
2256
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_block_settings)
2257
+
2258
+ # Create a copy on the result index
2259
+ self._safe_index_copy(self.datastore.client.indices.clone, self.name, self.index_name)
2260
+
2261
+ # Make the hot index the new clone
2262
+ self.with_retries(
2263
+ self.datastore.client.indices.update_aliases,
2264
+ actions=[
2265
+ {"add": {"index": self.index_name, "alias": self.name}},
2266
+ {"remove_index": {"index": self.name}},
2267
+ ],
2268
+ )
2269
+
2270
+ self.with_retries(self.datastore.client.indices.put_settings, settings=write_unblock_settings)
2271
+
2272
+ self._check_fields()
2273
+
2274
+ def _add_fields(self, missing_fields: Dict):
2275
+ no_fix = []
2276
+ properties = {}
2277
+ for name, field in missing_fields.items():
2278
+ # Figure out the path of the field in the document, if the name is set in the field, it
2279
+ # is going to be duplicated in the path from missing_fields, so drop it
2280
+ prefix = name.split(".")
2281
+ if field.name:
2282
+ prefix = prefix[:-1]
2283
+
2284
+ # Build the fields and templates for this new mapping
2285
+ sub_properties, sub_templates = build_mapping([field], prefix=prefix, allow_refuse_implicit=False)
2286
+ properties.update(sub_properties)
2287
+ if sub_templates:
2288
+ no_fix.append(name)
2289
+
2290
+ # If we have collected any fields that we can't blindly add because they might
2291
+ # conflict with existing mappings (the refuse_all_implicit_mappings rule may be
2292
+ # in place), simply raise an exception
2293
+ if no_fix:
2294
+ raise HowlerValueError(
2295
+ f"Can't update database mapping for {self.name}, " f"couldn't safely amend mapping for {no_fix}"
2296
+ )
2297
+
2298
+ # If we got this far, the missing fields have been described in properties, upload them to the
2299
+ # server, and we should be able to move on.
2300
+ for index in self.index_list_full:
2301
+ self.with_retries(self.datastore.client.indices.put_mapping, index=index, properties=properties)
2302
+
2303
+ if self.with_retries(self.datastore.client.indices.exists_template, name=self.name):
2304
+ current_template = self.with_retries(self.datastore.client.indices.get_template, name=self.name)[self.name]
2305
+ self.with_retries(
2306
+ self.datastore.client.indices.put_template,
2307
+ name=self.name,
2308
+ **recursive_update(current_template, {"mappings": {"properties": properties}}),
2309
+ )
2310
+
2311
+ def wipe(self):
2312
+ """This function should completely delete the collection
2313
+
2314
+ NEVER USE THIS!
2315
+
2316
+ :return:
2317
+ """
2318
+ logger.debug("Wipe operation started for collection: %s" % self.name.upper())
2319
+
2320
+ for index in self.index_list:
2321
+ if self.with_retries(self.datastore.client.indices.exists, index=index):
2322
+ self.with_retries(self.datastore.client.indices.delete, index=index)
2323
+
2324
+ if self.with_retries(self.datastore.client.indices.exists_template, name=self.name):
2325
+ self.with_retries(self.datastore.client.indices.delete_template, name=self.name)
2326
+
2327
+ self._ensure_collection()