howler-api 2.13.0.dev329__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of howler-api might be problematic.
- howler/__init__.py +0 -0
- howler/actions/__init__.py +167 -0
- howler/actions/add_label.py +111 -0
- howler/actions/add_to_bundle.py +159 -0
- howler/actions/change_field.py +76 -0
- howler/actions/demote.py +160 -0
- howler/actions/example_plugin.py +104 -0
- howler/actions/prioritization.py +93 -0
- howler/actions/promote.py +147 -0
- howler/actions/remove_from_bundle.py +133 -0
- howler/actions/remove_label.py +111 -0
- howler/actions/transition.py +200 -0
- howler/api/__init__.py +249 -0
- howler/api/base.py +88 -0
- howler/api/socket.py +114 -0
- howler/api/v1/__init__.py +97 -0
- howler/api/v1/action.py +372 -0
- howler/api/v1/analytic.py +748 -0
- howler/api/v1/auth.py +382 -0
- howler/api/v1/borealis.py +101 -0
- howler/api/v1/configs.py +55 -0
- howler/api/v1/dossier.py +222 -0
- howler/api/v1/help.py +28 -0
- howler/api/v1/hit.py +1181 -0
- howler/api/v1/notebook.py +82 -0
- howler/api/v1/overview.py +191 -0
- howler/api/v1/search.py +715 -0
- howler/api/v1/template.py +206 -0
- howler/api/v1/tool.py +183 -0
- howler/api/v1/user.py +414 -0
- howler/api/v1/utils/__init__.py +0 -0
- howler/api/v1/utils/etag.py +84 -0
- howler/api/v1/view.py +288 -0
- howler/app.py +235 -0
- howler/common/README.md +144 -0
- howler/common/__init__.py +0 -0
- howler/common/classification.py +979 -0
- howler/common/classification.yml +107 -0
- howler/common/exceptions.py +167 -0
- howler/common/hexdump.py +48 -0
- howler/common/iprange.py +171 -0
- howler/common/loader.py +154 -0
- howler/common/logging/__init__.py +241 -0
- howler/common/logging/audit.py +138 -0
- howler/common/logging/format.py +38 -0
- howler/common/net.py +79 -0
- howler/common/net_static.py +1494 -0
- howler/common/random_user.py +316 -0
- howler/common/swagger.py +117 -0
- howler/config.py +64 -0
- howler/cronjobs/__init__.py +29 -0
- howler/cronjobs/retention.py +61 -0
- howler/cronjobs/rules.py +274 -0
- howler/cronjobs/view_cleanup.py +88 -0
- howler/datastore/README.md +112 -0
- howler/datastore/__init__.py +0 -0
- howler/datastore/bulk.py +72 -0
- howler/datastore/collection.py +2327 -0
- howler/datastore/constants.py +117 -0
- howler/datastore/exceptions.py +41 -0
- howler/datastore/howler_store.py +105 -0
- howler/datastore/migrations/fix_process.py +41 -0
- howler/datastore/operations.py +130 -0
- howler/datastore/schemas.py +90 -0
- howler/datastore/store.py +231 -0
- howler/datastore/support/__init__.py +0 -0
- howler/datastore/support/build.py +214 -0
- howler/datastore/support/schemas.py +90 -0
- howler/datastore/types.py +22 -0
- howler/error.py +91 -0
- howler/external/__init__.py +0 -0
- howler/external/generate_mitre.py +96 -0
- howler/external/generate_sigma_rules.py +31 -0
- howler/external/generate_tlds.py +47 -0
- howler/external/reindex_data.py +46 -0
- howler/external/wipe_databases.py +58 -0
- howler/gunicorn_config.py +25 -0
- howler/healthz.py +47 -0
- howler/helper/__init__.py +0 -0
- howler/helper/azure.py +50 -0
- howler/helper/discover.py +59 -0
- howler/helper/hit.py +236 -0
- howler/helper/oauth.py +247 -0
- howler/helper/search.py +92 -0
- howler/helper/workflow.py +110 -0
- howler/helper/ws.py +378 -0
- howler/odm/README.md +102 -0
- howler/odm/__init__.py +1 -0
- howler/odm/base.py +1504 -0
- howler/odm/charter.txt +146 -0
- howler/odm/helper.py +416 -0
- howler/odm/howler_enum.py +25 -0
- howler/odm/models/__init__.py +0 -0
- howler/odm/models/action.py +33 -0
- howler/odm/models/analytic.py +90 -0
- howler/odm/models/assemblyline.py +48 -0
- howler/odm/models/aws.py +23 -0
- howler/odm/models/azure.py +16 -0
- howler/odm/models/cbs.py +44 -0
- howler/odm/models/config.py +558 -0
- howler/odm/models/dossier.py +33 -0
- howler/odm/models/ecs/__init__.py +0 -0
- howler/odm/models/ecs/agent.py +17 -0
- howler/odm/models/ecs/autonomous_system.py +16 -0
- howler/odm/models/ecs/client.py +149 -0
- howler/odm/models/ecs/cloud.py +141 -0
- howler/odm/models/ecs/code_signature.py +27 -0
- howler/odm/models/ecs/container.py +32 -0
- howler/odm/models/ecs/dns.py +62 -0
- howler/odm/models/ecs/egress.py +10 -0
- howler/odm/models/ecs/elf.py +74 -0
- howler/odm/models/ecs/email.py +122 -0
- howler/odm/models/ecs/error.py +14 -0
- howler/odm/models/ecs/event.py +140 -0
- howler/odm/models/ecs/faas.py +24 -0
- howler/odm/models/ecs/file.py +84 -0
- howler/odm/models/ecs/geo.py +30 -0
- howler/odm/models/ecs/group.py +18 -0
- howler/odm/models/ecs/hash.py +16 -0
- howler/odm/models/ecs/host.py +17 -0
- howler/odm/models/ecs/http.py +37 -0
- howler/odm/models/ecs/ingress.py +12 -0
- howler/odm/models/ecs/interface.py +21 -0
- howler/odm/models/ecs/network.py +30 -0
- howler/odm/models/ecs/observer.py +45 -0
- howler/odm/models/ecs/organization.py +12 -0
- howler/odm/models/ecs/os.py +21 -0
- howler/odm/models/ecs/pe.py +17 -0
- howler/odm/models/ecs/process.py +216 -0
- howler/odm/models/ecs/registry.py +26 -0
- howler/odm/models/ecs/related.py +45 -0
- howler/odm/models/ecs/rule.py +51 -0
- howler/odm/models/ecs/server.py +24 -0
- howler/odm/models/ecs/threat.py +247 -0
- howler/odm/models/ecs/tls.py +58 -0
- howler/odm/models/ecs/url.py +51 -0
- howler/odm/models/ecs/user.py +57 -0
- howler/odm/models/ecs/user_agent.py +20 -0
- howler/odm/models/ecs/vulnerability.py +41 -0
- howler/odm/models/gcp.py +16 -0
- howler/odm/models/hit.py +356 -0
- howler/odm/models/howler_data.py +328 -0
- howler/odm/models/lead.py +33 -0
- howler/odm/models/localized_label.py +13 -0
- howler/odm/models/overview.py +16 -0
- howler/odm/models/pivot.py +40 -0
- howler/odm/models/template.py +24 -0
- howler/odm/models/user.py +83 -0
- howler/odm/models/view.py +34 -0
- howler/odm/random_data.py +888 -0
- howler/odm/randomizer.py +606 -0
- howler/patched.py +5 -0
- howler/plugins/__init__.py +25 -0
- howler/plugins/config.py +123 -0
- howler/remote/__init__.py +0 -0
- howler/remote/datatypes/README.md +355 -0
- howler/remote/datatypes/__init__.py +98 -0
- howler/remote/datatypes/counters.py +63 -0
- howler/remote/datatypes/events.py +66 -0
- howler/remote/datatypes/hash.py +206 -0
- howler/remote/datatypes/lock.py +42 -0
- howler/remote/datatypes/queues/__init__.py +0 -0
- howler/remote/datatypes/queues/comms.py +59 -0
- howler/remote/datatypes/queues/multi.py +32 -0
- howler/remote/datatypes/queues/named.py +93 -0
- howler/remote/datatypes/queues/priority.py +215 -0
- howler/remote/datatypes/set.py +118 -0
- howler/remote/datatypes/user_quota_tracker.py +54 -0
- howler/security/__init__.py +253 -0
- howler/security/socket.py +108 -0
- howler/security/utils.py +185 -0
- howler/services/__init__.py +0 -0
- howler/services/action_service.py +111 -0
- howler/services/analytic_service.py +128 -0
- howler/services/auth_service.py +323 -0
- howler/services/config_service.py +128 -0
- howler/services/dossier_service.py +252 -0
- howler/services/event_service.py +93 -0
- howler/services/hit_service.py +893 -0
- howler/services/jwt_service.py +158 -0
- howler/services/lucene_service.py +286 -0
- howler/services/notebook_service.py +119 -0
- howler/services/overview_service.py +44 -0
- howler/services/template_service.py +45 -0
- howler/services/user_service.py +330 -0
- howler/utils/__init__.py +0 -0
- howler/utils/annotations.py +28 -0
- howler/utils/chunk.py +38 -0
- howler/utils/dict_utils.py +200 -0
- howler/utils/isotime.py +17 -0
- howler/utils/list_utils.py +11 -0
- howler/utils/lucene.py +77 -0
- howler/utils/path.py +27 -0
- howler/utils/socket_utils.py +61 -0
- howler/utils/str_utils.py +256 -0
- howler/utils/uid.py +47 -0
- howler_api-2.13.0.dev329.dist-info/METADATA +71 -0
- howler_api-2.13.0.dev329.dist-info/RECORD +200 -0
- howler_api-2.13.0.dev329.dist-info/WHEEL +4 -0
- howler_api-2.13.0.dev329.dist-info/entry_points.txt +8 -0
|
@@ -0,0 +1,2327 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
import typing
|
|
9
|
+
import warnings
|
|
10
|
+
from copy import deepcopy
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from os import environ
|
|
13
|
+
from random import random
|
|
14
|
+
from typing import Any, Dict, Generic, Optional, TypeVar, Union
|
|
15
|
+
|
|
16
|
+
import elasticsearch
|
|
17
|
+
from datemath import dm
|
|
18
|
+
from datemath.helpers import DateMathException
|
|
19
|
+
|
|
20
|
+
from howler import odm
|
|
21
|
+
from howler.common.exceptions import HowlerRuntimeError, HowlerValueError, NonRecoverableError
|
|
22
|
+
from howler.common.loader import APP_NAME
|
|
23
|
+
from howler.common.logging.format import HWL_DATE_FORMAT, HWL_LOG_FORMAT
|
|
24
|
+
from howler.datastore.constants import BACK_MAPPING, TYPE_MAPPING
|
|
25
|
+
from howler.datastore.exceptions import (
|
|
26
|
+
DataStoreException,
|
|
27
|
+
HowlerScanError,
|
|
28
|
+
MultiKeyError,
|
|
29
|
+
SearchException,
|
|
30
|
+
SearchRetryException,
|
|
31
|
+
VersionConflictException,
|
|
32
|
+
)
|
|
33
|
+
from howler.datastore.support.build import build_mapping
|
|
34
|
+
from howler.datastore.support.schemas import (
|
|
35
|
+
default_dynamic_strings,
|
|
36
|
+
default_dynamic_templates,
|
|
37
|
+
default_index,
|
|
38
|
+
default_mapping,
|
|
39
|
+
)
|
|
40
|
+
from howler.odm.base import (
|
|
41
|
+
BANNED_FIELDS,
|
|
42
|
+
IP,
|
|
43
|
+
ClassificationObject,
|
|
44
|
+
Enum,
|
|
45
|
+
Integer,
|
|
46
|
+
Keyword,
|
|
47
|
+
List,
|
|
48
|
+
Mapping,
|
|
49
|
+
Model,
|
|
50
|
+
ValidatedKeyword,
|
|
51
|
+
_Field,
|
|
52
|
+
)
|
|
53
|
+
from howler.utils.dict_utils import prune, recursive_update
|
|
54
|
+
|
|
55
|
+
if typing.TYPE_CHECKING:
|
|
56
|
+
from .store import ESStore
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
TRANSPORT_TIMEOUT = int(environ.get("HWL_DATASTORE_TRANSPORT_TIMEOUT", "10"))
|
|
60
|
+
|
|
61
|
+
logger = logging.getLogger("howler.api.datastore")
|
|
62
|
+
logger.setLevel(logging.INFO)
|
|
63
|
+
console = logging.StreamHandler()
|
|
64
|
+
console.setLevel(logging.INFO)
|
|
65
|
+
console.setFormatter(logging.Formatter(HWL_LOG_FORMAT, HWL_DATE_FORMAT))
|
|
66
|
+
logger.addHandler(console)
|
|
67
|
+
|
|
68
|
+
ModelType = TypeVar("ModelType", bound=Model)
|
|
69
|
+
write_block_settings = {"index.blocks.write": True}
|
|
70
|
+
write_unblock_settings = {"index.blocks.write": None}
|
|
71
|
+
|
|
72
|
+
# A token value to represent a document not existing. Its a string to match the
|
|
73
|
+
# type used for version values. Any string will do as long as it never matches
|
|
74
|
+
# a real version string.
|
|
75
|
+
CREATE_TOKEN = "create" # noqa: S105
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _strip_lists(model, data):
|
|
79
|
+
"""Elasticsearch returns everything as lists, regardless of whether
|
|
80
|
+
we want the field to be multi-valued or not. This method uses the model's
|
|
81
|
+
knowledge of what should or should not have multiple values to fix the data.
|
|
82
|
+
"""
|
|
83
|
+
fields = model.fields()
|
|
84
|
+
out = {}
|
|
85
|
+
for key, value in odm.flat_to_nested(data).items():
|
|
86
|
+
doc_type = fields.get(key, fields.get("", model))
|
|
87
|
+
# TODO: While we strip lists we don't want to know that the field is optional but we want to know what
|
|
88
|
+
# type of optional field that is. The following two lines of code change the doc_type to the
|
|
89
|
+
# child_type of the field. (Should model.fields() actually do that for us instead?)
|
|
90
|
+
if isinstance(doc_type, odm.Optional):
|
|
91
|
+
doc_type = doc_type.child_type
|
|
92
|
+
|
|
93
|
+
if isinstance(doc_type, odm.List):
|
|
94
|
+
out[key] = value
|
|
95
|
+
elif isinstance(doc_type, odm.Compound) or isinstance(doc_type, odm.Mapping):
|
|
96
|
+
out[key] = _strip_lists(doc_type.child_type, value)
|
|
97
|
+
elif isinstance(value, list):
|
|
98
|
+
out[key] = value[0]
|
|
99
|
+
else:
|
|
100
|
+
out[key] = value
|
|
101
|
+
return out
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def sort_str(sort_dicts):
|
|
105
|
+
if sort_dicts is None:
|
|
106
|
+
return sort_dicts
|
|
107
|
+
|
|
108
|
+
sort_list = [f"{key}:{val}" for d in sort_dicts for key, val in d.items()]
|
|
109
|
+
return ",".join(sort_list)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def parse_sort(sort, ret_list=True):
|
|
113
|
+
"""This function tries to do two things at once:
|
|
114
|
+
- convert AL sort syntax to elastic,
|
|
115
|
+
- convert any sorts on the key _id to _id_
|
|
116
|
+
"""
|
|
117
|
+
if sort is None:
|
|
118
|
+
return sort
|
|
119
|
+
|
|
120
|
+
if isinstance(sort, list):
|
|
121
|
+
return [parse_sort(row, ret_list=False) for row in sort]
|
|
122
|
+
elif isinstance(sort, dict):
|
|
123
|
+
return {("id" if key == "_id" else key): value for key, value in sort.items()}
|
|
124
|
+
|
|
125
|
+
parts = sort.split(" ")
|
|
126
|
+
if len(parts) == 1:
|
|
127
|
+
if parts == "_id":
|
|
128
|
+
if ret_list:
|
|
129
|
+
return ["id"]
|
|
130
|
+
return "id"
|
|
131
|
+
if ret_list:
|
|
132
|
+
return [parts]
|
|
133
|
+
return parts
|
|
134
|
+
elif len(parts) == 2:
|
|
135
|
+
if parts[1] not in ["asc", "desc"]:
|
|
136
|
+
raise SearchException("Unknown sort parameter " + sort)
|
|
137
|
+
if parts[0] == "_id":
|
|
138
|
+
if ret_list:
|
|
139
|
+
return [{"id": parts[1]}]
|
|
140
|
+
return {"id": parts[1]}
|
|
141
|
+
if ret_list:
|
|
142
|
+
return [{parts[0]: parts[1]}]
|
|
143
|
+
return {parts[0]: parts[1]}
|
|
144
|
+
raise SearchException("Unknown sort parameter " + sort)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class ESCollection(Generic[ModelType]):
|
|
148
|
+
DEFAULT_OFFSET = 0
|
|
149
|
+
DEFAULT_ROW_SIZE = 25
|
|
150
|
+
DEFAULT_SEARCH_FIELD = "__text__"
|
|
151
|
+
DEFAULT_SORT = [{"_id": "asc"}]
|
|
152
|
+
FIELD_SANITIZER = re.compile("^[a-z][a-z0-9_\\-.]+$")
|
|
153
|
+
MAX_GROUP_LIMIT = 10
|
|
154
|
+
MAX_FACET_LIMIT = 100
|
|
155
|
+
MAX_RETRY_BACKOFF = 10
|
|
156
|
+
MAX_SEARCH_ROWS = 500
|
|
157
|
+
RETRY_NORMAL = 1
|
|
158
|
+
RETRY_NONE = 0
|
|
159
|
+
RETRY_INFINITY = -1
|
|
160
|
+
SCROLL_TIMEOUT = "5m"
|
|
161
|
+
UPDATE_SET = "SET"
|
|
162
|
+
UPDATE_INC = "INC"
|
|
163
|
+
UPDATE_DEC = "DEC"
|
|
164
|
+
UPDATE_MAX = "MAX"
|
|
165
|
+
UPDATE_MIN = "MIN"
|
|
166
|
+
UPDATE_APPEND = "APPEND"
|
|
167
|
+
UPDATE_APPEND_IF_MISSING = "APPEND_IF_MISSING"
|
|
168
|
+
UPDATE_REMOVE = "REMOVE"
|
|
169
|
+
UPDATE_DELETE = "DELETE"
|
|
170
|
+
UPDATE_OPERATIONS = [
|
|
171
|
+
UPDATE_APPEND,
|
|
172
|
+
UPDATE_APPEND_IF_MISSING,
|
|
173
|
+
UPDATE_DEC,
|
|
174
|
+
UPDATE_INC,
|
|
175
|
+
UPDATE_MAX,
|
|
176
|
+
UPDATE_MIN,
|
|
177
|
+
UPDATE_REMOVE,
|
|
178
|
+
UPDATE_SET,
|
|
179
|
+
UPDATE_DELETE,
|
|
180
|
+
]
|
|
181
|
+
DEFAULT_SEARCH_VALUES: dict[str, typing.Any] = {
|
|
182
|
+
"timeout": None,
|
|
183
|
+
"field_list": None,
|
|
184
|
+
"facet_active": False,
|
|
185
|
+
"facet_mincount": 1,
|
|
186
|
+
"facet_fields": [],
|
|
187
|
+
"stats_active": False,
|
|
188
|
+
"stats_fields": [],
|
|
189
|
+
"field_script": None,
|
|
190
|
+
"filters": [],
|
|
191
|
+
"group_active": False,
|
|
192
|
+
"group_field": None,
|
|
193
|
+
"group_sort": None,
|
|
194
|
+
"group_limit": 1,
|
|
195
|
+
"histogram_active": False,
|
|
196
|
+
"histogram_field": None,
|
|
197
|
+
"histogram_type": None,
|
|
198
|
+
"histogram_gap": None,
|
|
199
|
+
"histogram_mincount": 1,
|
|
200
|
+
"histogram_start": None,
|
|
201
|
+
"histogram_end": None,
|
|
202
|
+
"start": 0,
|
|
203
|
+
"rows": DEFAULT_ROW_SIZE,
|
|
204
|
+
"query": "*",
|
|
205
|
+
"sort": DEFAULT_SORT,
|
|
206
|
+
"df": None,
|
|
207
|
+
"script_fields": [],
|
|
208
|
+
}
|
|
209
|
+
IGNORE_ENSURE_COLLECTION = False
|
|
210
|
+
|
|
211
|
+
def __init__(self, datastore: ESStore, name, model_class=None, validate=True, max_attempts=10):
|
|
212
|
+
self.replicas = int(
|
|
213
|
+
environ.get(
|
|
214
|
+
f"ELASTIC_{name.upper()}_REPLICAS",
|
|
215
|
+
environ.get("ELASTIC_DEFAULT_REPLICAS", 0),
|
|
216
|
+
)
|
|
217
|
+
)
|
|
218
|
+
self.shards = int(environ.get(f"ELASTIC_{name.upper()}_SHARDS", environ.get("ELASTIC_DEFAULT_SHARDS", 1)))
|
|
219
|
+
self._index_list: list[str] = []
|
|
220
|
+
|
|
221
|
+
self.datastore = datastore
|
|
222
|
+
self.name = f"{APP_NAME}-{name}"
|
|
223
|
+
self.index_name = f"{self.name}_hot"
|
|
224
|
+
self.model_class = model_class
|
|
225
|
+
self.validate = validate
|
|
226
|
+
self.max_attempts = max_attempts
|
|
227
|
+
|
|
228
|
+
if not ESCollection.IGNORE_ENSURE_COLLECTION:
|
|
229
|
+
self._ensure_collection()
|
|
230
|
+
elif "pytest" not in sys.modules:
|
|
231
|
+
logger.warning("Skipping ensure collection! This is dangerous. Waiting five seconds before continuing.")
|
|
232
|
+
time.sleep(5)
|
|
233
|
+
|
|
234
|
+
self.stored_fields = {}
|
|
235
|
+
if model_class:
|
|
236
|
+
for name, field in model_class.flat_fields().items():
|
|
237
|
+
if field.store:
|
|
238
|
+
self.stored_fields[name] = field
|
|
239
|
+
|
|
240
|
+
@property
|
|
241
|
+
def index_list_full(self):
|
|
242
|
+
if not self._index_list:
|
|
243
|
+
self._index_list = list(self.with_retries(self.datastore.client.indices.get, index=f"{self.name}-*").keys())
|
|
244
|
+
|
|
245
|
+
return [self.index_name] + sorted(self._index_list, reverse=True)
|
|
246
|
+
|
|
247
|
+
@property
|
|
248
|
+
def index_list(self):
|
|
249
|
+
"""This property contains the list of valid indexes for the current collection.
|
|
250
|
+
|
|
251
|
+
:return: list of valid indexes for this collection
|
|
252
|
+
"""
|
|
253
|
+
return [self.index_name]
|
|
254
|
+
|
|
255
|
+
def scan_with_retry(
|
|
256
|
+
self,
|
|
257
|
+
query,
|
|
258
|
+
sort=None,
|
|
259
|
+
source=None,
|
|
260
|
+
index=None,
|
|
261
|
+
scroll="5m",
|
|
262
|
+
size=1000,
|
|
263
|
+
request_timeout=None,
|
|
264
|
+
):
|
|
265
|
+
if index is None:
|
|
266
|
+
index = self.index_name
|
|
267
|
+
|
|
268
|
+
# initial search
|
|
269
|
+
resp = self.with_retries(
|
|
270
|
+
self.datastore.client.search,
|
|
271
|
+
index=index,
|
|
272
|
+
query=query,
|
|
273
|
+
scroll=scroll,
|
|
274
|
+
size=size,
|
|
275
|
+
request_timeout=request_timeout,
|
|
276
|
+
sort=sort,
|
|
277
|
+
_source=source,
|
|
278
|
+
)
|
|
279
|
+
scroll_id = resp.get("_scroll_id")
|
|
280
|
+
|
|
281
|
+
try:
|
|
282
|
+
while scroll_id and resp["hits"]["hits"]:
|
|
283
|
+
for hit in resp["hits"]["hits"]:
|
|
284
|
+
yield hit
|
|
285
|
+
|
|
286
|
+
# Default to 0 if the value isn't included in the response
|
|
287
|
+
shards_successful = resp["_shards"].get("successful", 0)
|
|
288
|
+
shards_skipped = resp["_shards"].get("skipped", 0)
|
|
289
|
+
shards_total = resp["_shards"].get("total", 0)
|
|
290
|
+
|
|
291
|
+
# check if we have any errors
|
|
292
|
+
if (shards_successful + shards_skipped) < shards_total:
|
|
293
|
+
shards_message = (
|
|
294
|
+
f"{scroll_id}: Scroll request has only succeeded on {shards_successful} "
|
|
295
|
+
f"(+{shards_skipped} skipped) shards out of {shards_total}."
|
|
296
|
+
)
|
|
297
|
+
raise HowlerScanError(shards_message)
|
|
298
|
+
resp = self.with_retries(self.datastore.client.scroll, scroll_id=scroll_id, scroll=scroll)
|
|
299
|
+
scroll_id = resp.get("_scroll_id")
|
|
300
|
+
|
|
301
|
+
finally:
|
|
302
|
+
if scroll_id:
|
|
303
|
+
resp = self.with_retries(
|
|
304
|
+
self.datastore.client.clear_scroll,
|
|
305
|
+
scroll_id=[scroll_id],
|
|
306
|
+
ignore=(404,),
|
|
307
|
+
)
|
|
308
|
+
if not resp.get("succeeded", False):
|
|
309
|
+
logger.warning(
|
|
310
|
+
f"Could not clear scroll ID {scroll_id}, there is potential "
|
|
311
|
+
"memory leak in you Elastic cluster..."
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
def with_retries(self, func, *args, raise_conflicts=False, **kwargs):
|
|
315
|
+
"""This function performs the passed function with the given args and kwargs and reconnect if it fails
|
|
316
|
+
|
|
317
|
+
:return: return the output of the function passed
|
|
318
|
+
"""
|
|
319
|
+
retries = 0
|
|
320
|
+
updated = 0
|
|
321
|
+
deleted = 0
|
|
322
|
+
|
|
323
|
+
while True:
|
|
324
|
+
if retries >= self.max_attempts:
|
|
325
|
+
raise HowlerRuntimeError(f"Maximum of {self.max_attempts} retries reached. Aborting ES connection")
|
|
326
|
+
|
|
327
|
+
try:
|
|
328
|
+
ret_val = func(*args, **kwargs)
|
|
329
|
+
|
|
330
|
+
if retries:
|
|
331
|
+
logger.info("Reconnected to elasticsearch!")
|
|
332
|
+
|
|
333
|
+
if updated:
|
|
334
|
+
ret_val["updated"] += updated
|
|
335
|
+
|
|
336
|
+
if deleted:
|
|
337
|
+
ret_val["deleted"] += deleted
|
|
338
|
+
|
|
339
|
+
return ret_val
|
|
340
|
+
except elasticsearch.exceptions.NotFoundError as e:
|
|
341
|
+
if "index_not_found_exception" in str(e):
|
|
342
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
343
|
+
logger.debug("The index does not exist. Trying to recreate it...")
|
|
344
|
+
self._ensure_collection()
|
|
345
|
+
self.datastore.connection_reset()
|
|
346
|
+
retries += 1
|
|
347
|
+
else:
|
|
348
|
+
raise
|
|
349
|
+
|
|
350
|
+
except elasticsearch.exceptions.ConflictError as ce:
|
|
351
|
+
if raise_conflicts:
|
|
352
|
+
# De-sync potential treads trying to write to the index
|
|
353
|
+
time.sleep(random() * 0.1) # noqa: S311
|
|
354
|
+
raise VersionConflictException(str(ce))
|
|
355
|
+
updated += ce.info.get("updated", 0)
|
|
356
|
+
deleted += ce.info.get("deleted", 0)
|
|
357
|
+
|
|
358
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
359
|
+
self.datastore.connection_reset()
|
|
360
|
+
retries += 1
|
|
361
|
+
|
|
362
|
+
except elasticsearch.exceptions.ConnectionTimeout:
|
|
363
|
+
logger.warning(
|
|
364
|
+
f"Elasticsearch connection timeout, server(s): "
|
|
365
|
+
f"{' | '.join(self.datastore.get_hosts(safe=True))}"
|
|
366
|
+
f", retrying {func.__name__}..."
|
|
367
|
+
)
|
|
368
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
369
|
+
self.datastore.connection_reset()
|
|
370
|
+
retries += 1
|
|
371
|
+
|
|
372
|
+
except (
|
|
373
|
+
SearchRetryException,
|
|
374
|
+
elasticsearch.exceptions.ConnectionError,
|
|
375
|
+
elasticsearch.exceptions.AuthenticationException,
|
|
376
|
+
) as e:
|
|
377
|
+
if not isinstance(e, SearchRetryException):
|
|
378
|
+
logger.warning(
|
|
379
|
+
f"No connection to Elasticsearch server(s): "
|
|
380
|
+
f"{' | '.join(self.datastore.get_hosts(safe=True))}"
|
|
381
|
+
f", because [{e}] retrying {func.__name__}..."
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
385
|
+
self.datastore.connection_reset()
|
|
386
|
+
retries += 1
|
|
387
|
+
|
|
388
|
+
except elasticsearch.exceptions.TransportError as e:
|
|
389
|
+
err_code, msg, cause = e.args
|
|
390
|
+
if err_code == 503 or err_code == "503":
|
|
391
|
+
logger.warning(f"Looks like index {self.name} is not ready yet, retrying...")
|
|
392
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
393
|
+
self.datastore.connection_reset()
|
|
394
|
+
retries += 1
|
|
395
|
+
elif err_code == 429 or err_code == "429":
|
|
396
|
+
logger.warning(
|
|
397
|
+
"Elasticsearch is too busy to perform the requested " f"task on index {self.name}, retrying..."
|
|
398
|
+
)
|
|
399
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
400
|
+
self.datastore.connection_reset()
|
|
401
|
+
retries += 1
|
|
402
|
+
elif err_code == 403 or err_code == "403":
|
|
403
|
+
logger.warning(
|
|
404
|
+
"Elasticsearch cluster is preventing writing operations " f"on index {self.name}, retrying..."
|
|
405
|
+
)
|
|
406
|
+
time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
|
|
407
|
+
self.datastore.connection_reset()
|
|
408
|
+
retries += 1
|
|
409
|
+
|
|
410
|
+
else:
|
|
411
|
+
raise
|
|
412
|
+
|
|
413
|
+
def _get_task_results(self, task):
|
|
414
|
+
# This function is only used to wait for a asynchronous task to finish in a graceful manner without
|
|
415
|
+
# timing out the elastic client. You can create an async task for long running operation like:
|
|
416
|
+
# - update_by_query
|
|
417
|
+
# - delete_by_query
|
|
418
|
+
# - reindex ...
|
|
419
|
+
res = None
|
|
420
|
+
while res is None:
|
|
421
|
+
try:
|
|
422
|
+
res = self.with_retries(
|
|
423
|
+
self.datastore.client.tasks.get,
|
|
424
|
+
task_id=task["task"],
|
|
425
|
+
wait_for_completion=True,
|
|
426
|
+
timeout="10s",
|
|
427
|
+
)
|
|
428
|
+
except elasticsearch.exceptions.TransportError as e:
|
|
429
|
+
err_code, msg, _ = e.args
|
|
430
|
+
if (err_code == 500 or err_code == "500") and msg in [
|
|
431
|
+
"timeout_exception",
|
|
432
|
+
"receive_timeout_transport_exception",
|
|
433
|
+
]:
|
|
434
|
+
pass
|
|
435
|
+
else:
|
|
436
|
+
raise
|
|
437
|
+
|
|
438
|
+
return res.get("response", res["task"]["status"])
|
|
439
|
+
|
|
440
|
+
def _get_current_alias(self, index: str) -> typing.Optional[str]:
|
|
441
|
+
if self.with_retries(self.datastore.client.indices.exists_alias, name=index):
|
|
442
|
+
return next(
|
|
443
|
+
iter(self.with_retries(self.datastore.client.indices.get_alias, index=index)),
|
|
444
|
+
None,
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
return None
|
|
448
|
+
|
|
449
|
+
def _wait_for_status(self, index, min_status="yellow"):
|
|
450
|
+
status_ok = False
|
|
451
|
+
while not status_ok:
|
|
452
|
+
try:
|
|
453
|
+
res = self.datastore.client.cluster.health(index=index, timeout="5s", wait_for_status=min_status)
|
|
454
|
+
status_ok = not res["timed_out"]
|
|
455
|
+
except elasticsearch.exceptions.TransportError as e:
|
|
456
|
+
err_code, _, _ = e.args
|
|
457
|
+
if err_code == 408 or err_code == "408":
|
|
458
|
+
logger.warning(f"Waiting for index {index} to get to status {min_status}...")
|
|
459
|
+
else:
|
|
460
|
+
raise
|
|
461
|
+
|
|
462
|
+
def _safe_index_copy(self, copy_function, src, target, settings=None, min_status="yellow"):
|
|
463
|
+
ret = copy_function(index=src, target=target, settings=settings, request_timeout=60)
|
|
464
|
+
if not ret["acknowledged"]:
|
|
465
|
+
raise DataStoreException(f"Failed to create index {target} from {src}.")
|
|
466
|
+
|
|
467
|
+
self._wait_for_status(target, min_status=min_status)
|
|
468
|
+
|
|
469
|
+
def _delete_async(self, index, query, max_docs=None, sort=None):
|
|
470
|
+
deleted = 0
|
|
471
|
+
while True:
|
|
472
|
+
task = self.with_retries(
|
|
473
|
+
self.datastore.client.delete_by_query,
|
|
474
|
+
index=index,
|
|
475
|
+
query=query,
|
|
476
|
+
wait_for_completion=False,
|
|
477
|
+
conflicts="proceed",
|
|
478
|
+
sort=sort,
|
|
479
|
+
max_docs=max_docs,
|
|
480
|
+
)
|
|
481
|
+
res = self._get_task_results(task)
|
|
482
|
+
|
|
483
|
+
if res["version_conflicts"] == 0:
|
|
484
|
+
res["deleted"] += deleted
|
|
485
|
+
return res
|
|
486
|
+
else:
|
|
487
|
+
deleted += res["deleted"]
|
|
488
|
+
|
|
489
|
+
def _update_async(self, index, script, query, max_docs=None):
|
|
490
|
+
updated = 0
|
|
491
|
+
while True:
|
|
492
|
+
task = self.with_retries(
|
|
493
|
+
self.datastore.client.update_by_query,
|
|
494
|
+
index=index,
|
|
495
|
+
script=script,
|
|
496
|
+
query=query,
|
|
497
|
+
wait_for_completion=False,
|
|
498
|
+
conflicts="proceed",
|
|
499
|
+
max_docs=max_docs,
|
|
500
|
+
)
|
|
501
|
+
res = self._get_task_results(task)
|
|
502
|
+
|
|
503
|
+
if res["version_conflicts"] == 0:
|
|
504
|
+
res["updated"] += updated
|
|
505
|
+
return res
|
|
506
|
+
else:
|
|
507
|
+
updated += res["updated"]
|
|
508
|
+
|
|
509
|
+
def commit(self):
|
|
510
|
+
"""This function should be overloaded to perform a commit of the index data of all the different hosts
|
|
511
|
+
specified in self.datastore.hosts.
|
|
512
|
+
|
|
513
|
+
:return: Should return True of the commit was successful on all hosts
|
|
514
|
+
"""
|
|
515
|
+
self.with_retries(self.datastore.client.indices.refresh, index=self.index_name)
|
|
516
|
+
self.with_retries(self.datastore.client.indices.clear_cache, index=self.index_name)
|
|
517
|
+
return True
|
|
518
|
+
|
|
519
|
+
def fix_replicas(self):
|
|
520
|
+
"""This function should be overloaded to fix the replica configuration of the index of all the different hosts
|
|
521
|
+
specified in self.datastore.hosts.
|
|
522
|
+
|
|
523
|
+
:return: Should return True of the fix was successful on all hosts
|
|
524
|
+
"""
|
|
525
|
+
replicas = self._get_index_settings()["index"]["number_of_replicas"]
|
|
526
|
+
settings = {"number_of_replicas": replicas}
|
|
527
|
+
return self.with_retries(self.datastore.client.indices.put_settings, index=self.index_name, settings=settings)[
|
|
528
|
+
"acknowledged"
|
|
529
|
+
]
|
|
530
|
+
|
|
531
|
+
def fix_shards(self):
|
|
532
|
+
"""This function should be overloaded to fix the shard configuration of the index of all the different hosts
|
|
533
|
+
specified in self.datastore.hosts.
|
|
534
|
+
|
|
535
|
+
:return: Should return True of the fix was successful on all hosts
|
|
536
|
+
"""
|
|
537
|
+
settings = self._get_index_settings()
|
|
538
|
+
clone_settings = {"index.number_of_replicas": 0}
|
|
539
|
+
clone_finish_settings = None
|
|
540
|
+
clone_setup_settings = None
|
|
541
|
+
method = None
|
|
542
|
+
target_node = ""
|
|
543
|
+
temp_name = f"{self.name}__fix_shards"
|
|
544
|
+
|
|
545
|
+
indexes_settings = self.with_retries(self.datastore.client.indices.get_settings)
|
|
546
|
+
current_settings = indexes_settings.get(self._get_current_alias(self.name), None)
|
|
547
|
+
if not current_settings:
|
|
548
|
+
raise DataStoreException(
|
|
549
|
+
"Could not get current index settings. Something is wrong and requires manual intervention..."
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
cur_replicas = int(current_settings["settings"]["index"]["number_of_replicas"])
|
|
553
|
+
cur_shards = int(current_settings["settings"]["index"]["number_of_shards"])
|
|
554
|
+
target_shards = int(settings["index"]["number_of_shards"])
|
|
555
|
+
clone_finish_settings = {
|
|
556
|
+
"index.number_of_replicas": cur_replicas,
|
|
557
|
+
"index.routing.allocation.require._name": None,
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
if cur_shards > target_shards:
|
|
561
|
+
logger.info(
|
|
562
|
+
f"Current shards ({cur_shards}) is bigger then target shards ({target_shards}), "
|
|
563
|
+
"we will be shrinking the index."
|
|
564
|
+
)
|
|
565
|
+
if cur_shards % target_shards != 0:
|
|
566
|
+
logger.info("The target shards is not a factor of the current shards, aborting...")
|
|
567
|
+
return
|
|
568
|
+
else:
|
|
569
|
+
target_node = self.with_retries(self.datastore.client.cat.nodes, format="json")[0]["name"]
|
|
570
|
+
clone_setup_settings = {
|
|
571
|
+
"index.number_of_replicas": 0,
|
|
572
|
+
"index.routing.allocation.require._name": target_node,
|
|
573
|
+
}
|
|
574
|
+
method = self.datastore.client.indices.shrink
|
|
575
|
+
elif cur_shards < target_shards:
|
|
576
|
+
logger.info(
|
|
577
|
+
f"Current shards ({cur_shards}) is smaller then target shards ({target_shards}), "
|
|
578
|
+
"we will be splitting the index."
|
|
579
|
+
)
|
|
580
|
+
if target_shards % cur_shards != 0:
|
|
581
|
+
logger.warning("The current shards is not a factor of the target shards, aborting...")
|
|
582
|
+
return
|
|
583
|
+
else:
|
|
584
|
+
method = self.datastore.client.indices.split
|
|
585
|
+
else:
|
|
586
|
+
logger.info(
|
|
587
|
+
f"Current shards ({cur_shards}) is equal to the target shards ({target_shards}), "
|
|
588
|
+
"only house keeping operations will be performed."
|
|
589
|
+
)
|
|
590
|
+
|
|
591
|
+
if method:
|
|
592
|
+
# Before we do anything, we should make sure the source index is in a good state
|
|
593
|
+
logger.info(f"Waiting for {self.name.upper()} status to be GREEN.")
|
|
594
|
+
self._wait_for_status(self.name, min_status="green")
|
|
595
|
+
|
|
596
|
+
# Block all indexes to be written to
|
|
597
|
+
logger.info("Set a datastore wide write block on Elastic.")
|
|
598
|
+
self.with_retries(self.datastore.client.indices.put_settings, settings=write_block_settings)
|
|
599
|
+
|
|
600
|
+
# Clone it onto a temporary index
|
|
601
|
+
if not self.with_retries(self.datastore.client.indices.exists, index=temp_name):
|
|
602
|
+
# if there are specific settings to be applied to the index, apply them
|
|
603
|
+
if clone_setup_settings:
|
|
604
|
+
logger.info(f"Rellocating index to node {target_node.upper()}.")
|
|
605
|
+
self.with_retries(
|
|
606
|
+
self.datastore.client.indices.put_settings,
|
|
607
|
+
index=self.index_name,
|
|
608
|
+
settings=clone_setup_settings,
|
|
609
|
+
)
|
|
610
|
+
|
|
611
|
+
# Make sure no shard are relocating
|
|
612
|
+
while self.datastore.client.cluster.health(index=self.index_name)["relocating_shards"] != 0:
|
|
613
|
+
time.sleep(1)
|
|
614
|
+
|
|
615
|
+
# Make a clone of the current index
|
|
616
|
+
logger.info(f"Cloning {self.index_name.upper()} into {temp_name.upper()}.")
|
|
617
|
+
self._safe_index_copy(
|
|
618
|
+
self.datastore.client.indices.clone,
|
|
619
|
+
self.index_name,
|
|
620
|
+
temp_name,
|
|
621
|
+
settings=clone_settings,
|
|
622
|
+
min_status="green",
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
# Make 100% sure temporary index is ready
|
|
626
|
+
logger.info(f"Waiting for {temp_name.upper()} status to be GREEN.")
|
|
627
|
+
self._wait_for_status(temp_name, "green")
|
|
628
|
+
|
|
629
|
+
# Make sure temporary index is the alias if not already
|
|
630
|
+
if self._get_current_alias(self.name) != temp_name:
|
|
631
|
+
logger.info(
|
|
632
|
+
f"Make {temp_name.upper()} the current alias for {self.name.upper()} "
|
|
633
|
+
f"and delete {self.index_name.upper()}."
|
|
634
|
+
)
|
|
635
|
+
# Make the hot index the temporary index while deleting the original index
|
|
636
|
+
alias_actions = [
|
|
637
|
+
{"add": {"index": temp_name, "alias": self.name}},
|
|
638
|
+
{"remove_index": {"index": self.index_name}},
|
|
639
|
+
]
|
|
640
|
+
self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
|
|
641
|
+
|
|
642
|
+
# Make sure the original index is deleted
|
|
643
|
+
if self.with_retries(self.datastore.client.indices.exists, index=self.index_name):
|
|
644
|
+
logger.info(f"Delete extra {self.index_name.upper()} index.")
|
|
645
|
+
self.with_retries(self.datastore.client.indices.delete, index=self.index_name)
|
|
646
|
+
|
|
647
|
+
# Shrink/split the temporary index into the original index
|
|
648
|
+
logger.info(f"Perform shard fix operation from {temp_name.upper()} to {self.index_name.upper()}.")
|
|
649
|
+
self._safe_index_copy(method, temp_name, self.index_name, settings=settings)
|
|
650
|
+
|
|
651
|
+
# Make the original index the new alias
|
|
652
|
+
logger.info(
|
|
653
|
+
f"Make {self.index_name.upper()} the current alias for {self.name.upper()} "
|
|
654
|
+
f"and delete {temp_name.upper()}."
|
|
655
|
+
)
|
|
656
|
+
alias_actions = [
|
|
657
|
+
{"add": {"index": self.index_name, "alias": self.name}},
|
|
658
|
+
{"remove_index": {"index": temp_name}},
|
|
659
|
+
]
|
|
660
|
+
self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
|
|
661
|
+
|
|
662
|
+
# Restore writes
|
|
663
|
+
logger.debug("Restore datastore wide write operation on Elastic.")
|
|
664
|
+
self.with_retries(self.datastore.client.indices.put_settings, settings=write_unblock_settings)
|
|
665
|
+
|
|
666
|
+
# Restore normal routing and replicas
|
|
667
|
+
logger.debug(f"Restore original routing table for {self.name.upper()}.")
|
|
668
|
+
self.with_retries(
|
|
669
|
+
self.datastore.client.indices.put_settings,
|
|
670
|
+
index=self.name,
|
|
671
|
+
settings=clone_finish_settings,
|
|
672
|
+
)
|
|
673
|
+
|
|
674
|
+
def reindex(self):
|
|
675
|
+
"""This function should be overloaded to perform a reindex of all the data of the different hosts
|
|
676
|
+
specified in self.datastore.hosts.
|
|
677
|
+
|
|
678
|
+
:return: Should return True of the commit was successful on all hosts
|
|
679
|
+
"""
|
|
680
|
+
for index in self.index_list:
|
|
681
|
+
new_name = f"{index}__reindex"
|
|
682
|
+
index_data = None
|
|
683
|
+
if self.with_retries(self.datastore.client.indices.exists, index=index) and not self.with_retries(
|
|
684
|
+
self.datastore.client.indices.exists, index=new_name
|
|
685
|
+
):
|
|
686
|
+
# Get information about the index to reindex
|
|
687
|
+
index_data = self.with_retries(self.datastore.client.indices.get, index=index)[index]
|
|
688
|
+
|
|
689
|
+
# Create reindex target
|
|
690
|
+
self.with_retries(
|
|
691
|
+
self.datastore.client.indices.create,
|
|
692
|
+
index=new_name,
|
|
693
|
+
mappings=self._get_index_mappings(),
|
|
694
|
+
settings=self._get_index_settings(),
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
# For all aliases related to the index, add a new alias to the reindex index
|
|
698
|
+
for alias, alias_data in index_data["aliases"].items():
|
|
699
|
+
# Make the reindex index the new write index if the original index was
|
|
700
|
+
if alias_data.get("is_write_index", True):
|
|
701
|
+
alias_actions = [
|
|
702
|
+
{
|
|
703
|
+
"add": {
|
|
704
|
+
"index": new_name,
|
|
705
|
+
"alias": alias,
|
|
706
|
+
"is_write_index": True,
|
|
707
|
+
}
|
|
708
|
+
},
|
|
709
|
+
{
|
|
710
|
+
"add": {
|
|
711
|
+
"index": index,
|
|
712
|
+
"alias": alias,
|
|
713
|
+
"is_write_index": False,
|
|
714
|
+
}
|
|
715
|
+
},
|
|
716
|
+
]
|
|
717
|
+
else:
|
|
718
|
+
alias_actions = [{"add": {"index": new_name, "alias": alias}}]
|
|
719
|
+
|
|
720
|
+
self.with_retries(self.datastore.client.indices.update_aliases, actions=alias_actions)
|
|
721
|
+
|
|
722
|
+
# Reindex data into target
|
|
723
|
+
r_task = self.with_retries(
|
|
724
|
+
self.datastore.client.reindex,
|
|
725
|
+
source={"index": index},
|
|
726
|
+
dest={"index": new_name},
|
|
727
|
+
wait_for_completion=False,
|
|
728
|
+
)
|
|
729
|
+
self._get_task_results(r_task)
|
|
730
|
+
|
|
731
|
+
if self.with_retries(self.datastore.client.indices.exists, index=new_name):
|
|
732
|
+
if index_data is None:
|
|
733
|
+
index_data = self.with_retries(self.datastore.client.indices.get, index=index)[index]
|
|
734
|
+
|
|
735
|
+
# Commit reindexed data
|
|
736
|
+
self.with_retries(self.datastore.client.indices.refresh, index=new_name)
|
|
737
|
+
self.with_retries(self.datastore.client.indices.clear_cache, index=new_name)
|
|
738
|
+
|
|
739
|
+
# Delete old index
|
|
740
|
+
if self.with_retries(self.datastore.client.indices.exists, index=index):
|
|
741
|
+
self.with_retries(self.datastore.client.indices.delete, index=index)
|
|
742
|
+
|
|
743
|
+
# Block write to the index
|
|
744
|
+
self.with_retries(
|
|
745
|
+
self.datastore.client.indices.put_settings,
|
|
746
|
+
settings=write_block_settings,
|
|
747
|
+
)
|
|
748
|
+
|
|
749
|
+
# Rename reindexed index
|
|
750
|
+
try:
|
|
751
|
+
self._safe_index_copy(
|
|
752
|
+
self.datastore.client.indices.clone,
|
|
753
|
+
new_name,
|
|
754
|
+
index,
|
|
755
|
+
settings=self._get_index_settings(),
|
|
756
|
+
)
|
|
757
|
+
|
|
758
|
+
# Restore original aliases for the index
|
|
759
|
+
for alias, alias_data in index_data["aliases"].items():
|
|
760
|
+
# Make the reindex index the new write index if the original index was
|
|
761
|
+
if alias_data.get("is_write_index", True):
|
|
762
|
+
alias_actions = [
|
|
763
|
+
{
|
|
764
|
+
"add": {
|
|
765
|
+
"index": index,
|
|
766
|
+
"alias": alias,
|
|
767
|
+
"is_write_index": True,
|
|
768
|
+
}
|
|
769
|
+
},
|
|
770
|
+
{"remove_index": {"index": new_name}},
|
|
771
|
+
]
|
|
772
|
+
self.with_retries(
|
|
773
|
+
self.datastore.client.indices.update_aliases,
|
|
774
|
+
actions=alias_actions,
|
|
775
|
+
)
|
|
776
|
+
|
|
777
|
+
# Delete the reindex target if it still exists
|
|
778
|
+
if self.with_retries(self.datastore.client.indices.exists, index=new_name):
|
|
779
|
+
self.with_retries(self.datastore.client.indices.delete, index=new_name)
|
|
780
|
+
finally:
|
|
781
|
+
# Unblock write to the index
|
|
782
|
+
self.with_retries(
|
|
783
|
+
self.datastore.client.indices.put_settings,
|
|
784
|
+
settings=write_unblock_settings,
|
|
785
|
+
)
|
|
786
|
+
|
|
787
|
+
return True
|
|
788
|
+
|
|
789
|
+
def multiget(self, key_list, as_dictionary=True, as_obj=True, error_on_missing=True):
|
|
790
|
+
"""Get a list of documents from the datastore and make sure they are normalized using
|
|
791
|
+
the model class
|
|
792
|
+
|
|
793
|
+
:param error_on_missing: Should it raise a key error when keys are missing
|
|
794
|
+
:param as_dictionary: Return a disctionary of items or a list
|
|
795
|
+
:param as_obj: Return objects or not
|
|
796
|
+
:param key_list: list of keys of documents to get
|
|
797
|
+
:return: list of instances of the model class
|
|
798
|
+
"""
|
|
799
|
+
|
|
800
|
+
def add_to_output(data_output, data_id):
|
|
801
|
+
if "__non_doc_raw__" in data_output:
|
|
802
|
+
if as_dictionary:
|
|
803
|
+
out[data_id] = data_output["__non_doc_raw__"]
|
|
804
|
+
else:
|
|
805
|
+
out.append(data_output["__non_doc_raw__"]) # type: ignore
|
|
806
|
+
else:
|
|
807
|
+
data_output.pop("id", None)
|
|
808
|
+
if as_dictionary:
|
|
809
|
+
out[data_id] = self.normalize(data_output, as_obj=as_obj)
|
|
810
|
+
else:
|
|
811
|
+
out.append(self.normalize(data_output, as_obj=as_obj)) # type: ignore
|
|
812
|
+
|
|
813
|
+
out: Union[dict[str, Any], list[Any]]
|
|
814
|
+
if as_dictionary:
|
|
815
|
+
out = {}
|
|
816
|
+
else:
|
|
817
|
+
out = []
|
|
818
|
+
|
|
819
|
+
if key_list:
|
|
820
|
+
data = self.with_retries(self.datastore.client.mget, ids=key_list, index=self.name)
|
|
821
|
+
|
|
822
|
+
for row in data.get("docs", []):
|
|
823
|
+
if "found" in row and not row["found"]:
|
|
824
|
+
continue
|
|
825
|
+
|
|
826
|
+
try:
|
|
827
|
+
key_list.remove(row["_id"])
|
|
828
|
+
add_to_output(row["_source"], row["_id"])
|
|
829
|
+
except ValueError:
|
|
830
|
+
logger.exception(f'MGet returned multiple documents for id: {row["_id"]}')
|
|
831
|
+
|
|
832
|
+
if key_list and error_on_missing:
|
|
833
|
+
raise MultiKeyError(key_list, out)
|
|
834
|
+
|
|
835
|
+
return out
|
|
836
|
+
|
|
837
|
+
def normalize(self, data, as_obj=True) -> Union[ModelType, dict[str, Any], None]:
|
|
838
|
+
"""Normalize the data using the model class
|
|
839
|
+
|
|
840
|
+
:param as_obj: Return an object instead of a dictionary
|
|
841
|
+
:param data: data to normalize
|
|
842
|
+
:return: instance of the model class
|
|
843
|
+
"""
|
|
844
|
+
if as_obj and data is not None and self.model_class and not isinstance(data, self.model_class):
|
|
845
|
+
return self.model_class(data)
|
|
846
|
+
|
|
847
|
+
if isinstance(data, dict):
|
|
848
|
+
data = {k: v for k, v in data.items() if k not in BANNED_FIELDS}
|
|
849
|
+
|
|
850
|
+
return data
|
|
851
|
+
|
|
852
|
+
def exists(self, key):
|
|
853
|
+
"""Check if a document exists in the datastore.
|
|
854
|
+
|
|
855
|
+
:param key: key of the document to get from the datastore
|
|
856
|
+
:return: true/false depending if the document exists or not
|
|
857
|
+
"""
|
|
858
|
+
return self.with_retries(self.datastore.client.exists, index=self.name, id=key, _source=False)
|
|
859
|
+
|
|
860
|
+
def _get(self, key, retries, version=False):
|
|
861
|
+
"""Versioned get-save for atomic update has two paths:
|
|
862
|
+
1. Document doesn't exist at all. Create token will be returned for version.
|
|
863
|
+
This way only the first query to try and create the document will succeed.
|
|
864
|
+
2. Document exists in hot. A version string with the info needed to do a versioned save is returned.
|
|
865
|
+
|
|
866
|
+
The create token is needed to differentiate between "I'm saving a new
|
|
867
|
+
document non-atomic (version=None)" and "I'm saving a new document
|
|
868
|
+
atomically (version=CREATE_TOKEN)".
|
|
869
|
+
"""
|
|
870
|
+
|
|
871
|
+
def normalize_output(data_output):
|
|
872
|
+
if "__non_doc_raw__" in data_output:
|
|
873
|
+
return data_output["__non_doc_raw__"]
|
|
874
|
+
data_output.pop("id", None)
|
|
875
|
+
return data_output
|
|
876
|
+
|
|
877
|
+
if retries is None:
|
|
878
|
+
retries = self.RETRY_NONE
|
|
879
|
+
|
|
880
|
+
done = False
|
|
881
|
+
while not done:
|
|
882
|
+
try:
|
|
883
|
+
doc = self.with_retries(self.datastore.client.get, index=self.name, id=key)
|
|
884
|
+
if version:
|
|
885
|
+
return (
|
|
886
|
+
normalize_output(doc["_source"]),
|
|
887
|
+
f"{doc['_seq_no']}---{doc['_primary_term']}",
|
|
888
|
+
)
|
|
889
|
+
return normalize_output(doc["_source"])
|
|
890
|
+
except elasticsearch.exceptions.NotFoundError:
|
|
891
|
+
pass
|
|
892
|
+
|
|
893
|
+
if retries > 0:
|
|
894
|
+
time.sleep(0.05)
|
|
895
|
+
retries -= 1
|
|
896
|
+
elif retries < 0:
|
|
897
|
+
time.sleep(0.05)
|
|
898
|
+
else:
|
|
899
|
+
done = True
|
|
900
|
+
|
|
901
|
+
if version:
|
|
902
|
+
return None, CREATE_TOKEN
|
|
903
|
+
|
|
904
|
+
return None
|
|
905
|
+
|
|
906
|
+
def get(self, key, as_obj=True, version=False):
|
|
907
|
+
"""Get a document from the datastore, retry a few times if not found and normalize the
|
|
908
|
+
document with the model provided with the collection.
|
|
909
|
+
|
|
910
|
+
This is the normal way to get data of the system.
|
|
911
|
+
|
|
912
|
+
:param archive_access: Temporary sets access value to archive during this call
|
|
913
|
+
:param as_obj: Should the data be returned as an ODM object
|
|
914
|
+
:param key: key of the document to get from the datastore
|
|
915
|
+
:param version: should the version number be returned by the call
|
|
916
|
+
:return: an instance of the model class loaded with the document data
|
|
917
|
+
"""
|
|
918
|
+
data = self._get(key, self.RETRY_NORMAL, version=version)
|
|
919
|
+
if version:
|
|
920
|
+
data, version = data
|
|
921
|
+
return self.normalize(data, as_obj=as_obj), version
|
|
922
|
+
return self.normalize(data, as_obj=as_obj)
|
|
923
|
+
|
|
924
|
+
def get_if_exists(self, key, as_obj=True, version=False):
|
|
925
|
+
"""Get a document from the datastore but do not retry if not found.
|
|
926
|
+
|
|
927
|
+
Use this more in caching scenarios because eventually consistent database may lead
|
|
928
|
+
to have document reported as missing even if they exist.
|
|
929
|
+
|
|
930
|
+
:param archive_access: Temporary sets access value to archive during this call
|
|
931
|
+
:param as_obj: Should the data be returned as an ODM object
|
|
932
|
+
:param key: key of the document to get from the datastore
|
|
933
|
+
:param version: should the version number be returned by the call
|
|
934
|
+
:return: an instance of the model class loaded with the document data
|
|
935
|
+
"""
|
|
936
|
+
data = self._get(key, self.RETRY_NONE, version=version)
|
|
937
|
+
if version:
|
|
938
|
+
data, version = data
|
|
939
|
+
return self.normalize(data, as_obj=as_obj), version
|
|
940
|
+
return self.normalize(data, as_obj=as_obj)
|
|
941
|
+
|
|
942
|
+
def require(
|
|
943
|
+
self, key, as_obj=True, version=False
|
|
944
|
+
) -> Union[
|
|
945
|
+
tuple[Optional[Union[dict[str, Any], ModelType]], str],
|
|
946
|
+
Optional[Union[dict[str, Any], ModelType]],
|
|
947
|
+
]:
|
|
948
|
+
"""Get a document from the datastore and retry forever because we know for sure
|
|
949
|
+
that this document should exist. If it does not right now, this will wait for the
|
|
950
|
+
document to show up in the datastore.
|
|
951
|
+
|
|
952
|
+
:param archive_access: Temporary sets access value to archive during this call
|
|
953
|
+
:param as_obj: Should the data be returned as an ODM object
|
|
954
|
+
:param key: key of the document to get from the datastore
|
|
955
|
+
:param version: should the version number be returned by the call
|
|
956
|
+
:return: an instance of the model class loaded with the document data
|
|
957
|
+
"""
|
|
958
|
+
data = self._get(key, self.RETRY_INFINITY, version=version)
|
|
959
|
+
if version:
|
|
960
|
+
data, version = data
|
|
961
|
+
return self.normalize(data, as_obj=as_obj), version
|
|
962
|
+
return self.normalize(data, as_obj=as_obj)
|
|
963
|
+
|
|
964
|
+
def save(self, key, data, version=None):
|
|
965
|
+
"""Save to document to the datastore using the key as its document id.
|
|
966
|
+
|
|
967
|
+
The document data will be normalized before being saved in the datastore.
|
|
968
|
+
|
|
969
|
+
:param key: ID of the document to save
|
|
970
|
+
:param data: raw data or instance of the model class to save as the document
|
|
971
|
+
:param version: version of the document to save over, if the version check fails this will raise an exception
|
|
972
|
+
:return: True if the document was saved properly
|
|
973
|
+
"""
|
|
974
|
+
if " " in key:
|
|
975
|
+
raise DataStoreException("You are not allowed to use spaces in datastore keys.")
|
|
976
|
+
|
|
977
|
+
data = self.normalize(data)
|
|
978
|
+
|
|
979
|
+
if self.model_class:
|
|
980
|
+
saved_data = data.as_primitives(hidden_fields=True)
|
|
981
|
+
else:
|
|
982
|
+
if not isinstance(data, dict):
|
|
983
|
+
saved_data = {"__non_doc_raw__": data}
|
|
984
|
+
else:
|
|
985
|
+
saved_data = deepcopy(data)
|
|
986
|
+
|
|
987
|
+
saved_data["id"] = key
|
|
988
|
+
operation = "index"
|
|
989
|
+
seq_no = None
|
|
990
|
+
primary_term = None
|
|
991
|
+
|
|
992
|
+
if version == CREATE_TOKEN:
|
|
993
|
+
operation = "create"
|
|
994
|
+
elif version:
|
|
995
|
+
seq_no, primary_term = version.split("---")
|
|
996
|
+
|
|
997
|
+
try:
|
|
998
|
+
self.with_retries(
|
|
999
|
+
self.datastore.client.index,
|
|
1000
|
+
index=self.name,
|
|
1001
|
+
id=key,
|
|
1002
|
+
document=json.dumps(saved_data),
|
|
1003
|
+
op_type=operation,
|
|
1004
|
+
if_seq_no=seq_no,
|
|
1005
|
+
if_primary_term=primary_term,
|
|
1006
|
+
raise_conflicts=True,
|
|
1007
|
+
)
|
|
1008
|
+
except elasticsearch.BadRequestError as e:
|
|
1009
|
+
raise NonRecoverableError(
|
|
1010
|
+
f"When saving document {key} to elasticsearch, an exception occurred:\n{repr(e)}\n\n"
|
|
1011
|
+
f"Data: {json.dumps(saved_data)}"
|
|
1012
|
+
) from e
|
|
1013
|
+
|
|
1014
|
+
return True
|
|
1015
|
+
|
|
1016
|
+
def delete(self, key):
|
|
1017
|
+
"""This function should delete the underlying document referenced by the key.
|
|
1018
|
+
It should return true if the document was in fact properly deleted.
|
|
1019
|
+
|
|
1020
|
+
:param key: id of the document to delete
|
|
1021
|
+
:return: True is delete successful
|
|
1022
|
+
"""
|
|
1023
|
+
try:
|
|
1024
|
+
info = self.with_retries(self.datastore.client.delete, id=key, index=self.name)
|
|
1025
|
+
return info["result"] == "deleted"
|
|
1026
|
+
except elasticsearch.NotFoundError:
|
|
1027
|
+
return False
|
|
1028
|
+
|
|
1029
|
+
def delete_by_query(self, query, workers=20, sort=None, max_docs=None):
|
|
1030
|
+
"""This function should delete the underlying documents referenced by the query.
|
|
1031
|
+
It should return true if the documents were in fact properly deleted.
|
|
1032
|
+
|
|
1033
|
+
:param query: Query of the documents to download
|
|
1034
|
+
:param workers: Number of workers used for deletion if basic currency delete is used
|
|
1035
|
+
:return: True is delete successful
|
|
1036
|
+
"""
|
|
1037
|
+
query = {"bool": {"must": {"query_string": {"query": query}}}}
|
|
1038
|
+
info = self._delete_async(self.name, query=query, sort=sort_str(parse_sort(sort)), max_docs=max_docs)
|
|
1039
|
+
return info.get("deleted", 0) != 0
|
|
1040
|
+
|
|
1041
|
+
def _create_scripts_from_operations(self, operations):
|
|
1042
|
+
op_sources = []
|
|
1043
|
+
op_params = {}
|
|
1044
|
+
val_id = 0
|
|
1045
|
+
for op, doc_key, value in operations:
|
|
1046
|
+
if op == self.UPDATE_SET:
|
|
1047
|
+
op_sources.append(f"ctx._source.{doc_key} = params.value{val_id}")
|
|
1048
|
+
op_params[f"value{val_id}"] = value
|
|
1049
|
+
elif op == self.UPDATE_DELETE:
|
|
1050
|
+
op_sources.append(f"ctx._source.{doc_key}.remove(params.value{val_id})")
|
|
1051
|
+
op_params[f"value{val_id}"] = value
|
|
1052
|
+
elif op == self.UPDATE_APPEND:
|
|
1053
|
+
op_sources.append(f"ctx._source.{doc_key}.add(params.value{val_id})")
|
|
1054
|
+
op_params[f"value{val_id}"] = value
|
|
1055
|
+
elif op == self.UPDATE_APPEND_IF_MISSING:
|
|
1056
|
+
script = (
|
|
1057
|
+
f"if (ctx._source.{doc_key}.indexOf(params.value{val_id}) == -1) "
|
|
1058
|
+
f"{{ctx._source.{doc_key}.add(params.value{val_id})}}"
|
|
1059
|
+
)
|
|
1060
|
+
op_sources.append(script)
|
|
1061
|
+
op_params[f"value{val_id}"] = value
|
|
1062
|
+
elif op == self.UPDATE_REMOVE:
|
|
1063
|
+
script = (
|
|
1064
|
+
f"if (ctx._source.{doc_key}.indexOf(params.value{val_id}) != -1) "
|
|
1065
|
+
f"{{ctx._source.{doc_key}.remove(ctx._source.{doc_key}.indexOf(params.value{val_id}))}}"
|
|
1066
|
+
)
|
|
1067
|
+
op_sources.append(script)
|
|
1068
|
+
op_params[f"value{val_id}"] = value
|
|
1069
|
+
elif op == self.UPDATE_INC:
|
|
1070
|
+
op_sources.append(f"ctx._source.{doc_key} += params.value{val_id}")
|
|
1071
|
+
op_params[f"value{val_id}"] = value
|
|
1072
|
+
elif op == self.UPDATE_DEC:
|
|
1073
|
+
op_sources.append(f"ctx._source.{doc_key} -= params.value{val_id}")
|
|
1074
|
+
op_params[f"value{val_id}"] = value
|
|
1075
|
+
elif op == self.UPDATE_MAX:
|
|
1076
|
+
script = (
|
|
1077
|
+
f"if (ctx._source.{doc_key} == null || "
|
|
1078
|
+
f"ctx._source.{doc_key}.compareTo(params.value{val_id}) < 0) "
|
|
1079
|
+
f"{{ctx._source.{doc_key} = params.value{val_id}}}"
|
|
1080
|
+
)
|
|
1081
|
+
op_sources.append(script)
|
|
1082
|
+
op_params[f"value{val_id}"] = value
|
|
1083
|
+
elif op == self.UPDATE_MIN:
|
|
1084
|
+
script = (
|
|
1085
|
+
f"if (ctx._source.{doc_key} == null || "
|
|
1086
|
+
f"ctx._source.{doc_key}.compareTo(params.value{val_id}) > 0) "
|
|
1087
|
+
f"{{ctx._source.{doc_key} = params.value{val_id}}}"
|
|
1088
|
+
)
|
|
1089
|
+
op_sources.append(script)
|
|
1090
|
+
op_params[f"value{val_id}"] = value
|
|
1091
|
+
|
|
1092
|
+
val_id += 1
|
|
1093
|
+
|
|
1094
|
+
joined_sources = """;\n""".join(op_sources)
|
|
1095
|
+
|
|
1096
|
+
return {
|
|
1097
|
+
"lang": "painless",
|
|
1098
|
+
"source": joined_sources.replace("};\n", "}\n"),
|
|
1099
|
+
"params": op_params,
|
|
1100
|
+
}
|
|
1101
|
+
|
|
1102
|
+
    def _validate_operations(self, operations):
        """Validate the different operations received for a partial update

        TODO: When the field is of type Mapping, the validation/check only works for depth 1. A full recursive
        solution is needed to support multi-depth cases.

        :param operations: list of operation tuples
        :raises: DatastoreException if operation not valid
        """
        if self.model_class:
            fields = self.model_class.flat_fields(show_compound=True)
            if "classification" in fields:
                fields.update(
                    {
                        "__access_lvl__": Integer(),
                        "__access_req__": List(Keyword()),
                        "__access_grp1__": List(Keyword()),
                        "__access_grp2__": List(Keyword()),
                    }
                )
        else:
            fields = None

        ret_ops = []
        for op, doc_key, value in operations:
            if op not in self.UPDATE_OPERATIONS:
                raise DataStoreException(f"Not a valid Update Operation: {op}")

            if fields is not None:
                prev_key = None
                if doc_key not in fields:
                    if "." in doc_key:
                        prev_key = doc_key[: doc_key.rindex(".")]
                        if prev_key in fields and not isinstance(fields[prev_key], Mapping):
                            raise DataStoreException(f"Invalid field for model: {prev_key}")
                    else:
                        raise DataStoreException(f"Invalid field for model: {doc_key}")

                if prev_key:
                    field = fields[prev_key].child_type
                else:
                    field = fields[doc_key]

                if op in [
                    self.UPDATE_APPEND,
                    self.UPDATE_APPEND_IF_MISSING,
                    self.UPDATE_REMOVE,
                ]:
                    try:
                        value = field.check(value)
                    except (ValueError, TypeError, AttributeError):
                        raise DataStoreException(f"Invalid value for field {doc_key}: {value}")

                elif op in [self.UPDATE_SET, self.UPDATE_DEC, self.UPDATE_INC]:
                    try:
                        value = field.check(value)
                    except (ValueError, TypeError):
                        raise DataStoreException(f"Invalid value for field {doc_key}: {value}")

                if isinstance(value, Model):
                    value = value.as_primitives()
                elif isinstance(value, datetime):
                    value = value.isoformat()
                elif isinstance(value, ClassificationObject):
                    value = str(value)

            ret_ops.append((op, doc_key, value))

        return ret_ops

    def update(self, key, operations, version=None):
        """This function performs an atomic update on some fields from the
        underlying documents referenced by the id using a list of operations.

        Operations supported by the update function are the following:
            INTEGER ONLY: Increase and decrease value
            LISTS ONLY: Append and remove items
            ALL TYPES: Set value

        :param key: ID of the document to modify
        :param operations: List of tuples of operations, e.g. [(SET, document_key, operation_value), ...]
        :return: True if update successful
        """
        operations = self._validate_operations(operations)
        script = self._create_scripts_from_operations(operations)
        seq_no = None
        primary_term = None
        if version:
            seq_no, primary_term = version.split("---")

        try:
            res = self.with_retries(
                self.datastore.client.update,
                index=self.name,
                id=key,
                script=script,
                if_seq_no=seq_no,
                if_primary_term=primary_term,
                raise_conflicts=seq_no and primary_term,
            )
            return (
                res["result"] == "updated",
                f"{res['_seq_no']}---{res['_primary_term']}",
            )
        except elasticsearch.NotFoundError as e:
            logger.warning("Update - elasticsearch.NotFoundError: %s %s", e.message, e.info)
        except elasticsearch.BadRequestError as e:
            logger.warning("Update - elasticsearch.BadRequestError: %s %s", e.message, e.info)
            return False
        except VersionConflictException as e:
            logger.warning("Update - elasticsearch.ConflictError: %s", e.message)
            raise
        except Exception as e:
            logger.warning("Update - Generic Exception: %s", str(e))
            return False

        return False

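    # Illustrative usage sketch for `update` (collection handle, field names and
    # values are hypothetical; they are not taken from the Howler data model):
    #
    #     ops = [
    #         (collection.UPDATE_SET, "status", "resolved"),
    #         (collection.UPDATE_INC, "retry_count", 1),
    #         (collection.UPDATE_APPEND_IF_MISSING, "labels", "reviewed"),
    #     ]
    #     success, new_version = collection.update("document-id", ops)
    #
    # On success the second element is the new "<seq_no>---<primary_term>" version
    # string, which can be passed back in as `version=` for optimistic concurrency;
    # on most failures the method simply returns False.
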
    def update_by_query(self, query, operations, filters=None, access_control=None, max_docs=None):
        """This function performs an atomic update on some fields from the
        underlying documents matching the query and the filters using a list of operations.

        Operations supported by the update function are the following:
            INTEGER ONLY: Increase and decrease value
            LISTS ONLY: Append and remove items
            ALL TYPES: Set value

        :param access_control:
        :param filters: Filter queries to reduce the data
        :param query: Query to find the matching documents
        :param operations: List of tuples of operations, e.g. [(SET, document_key, operation_value), ...]
        :return: True if update successful
        """
        operations = self._validate_operations(operations)
        if filters is None:
            filters = []

        if access_control:
            filters.append(access_control)

        script = self._create_scripts_from_operations(operations)

        try:
            res = self._update_async(
                self.name,
                script=script,
                query={
                    "bool": {
                        "must": {"query_string": {"query": query}},
                        "filter": [{"query_string": {"query": ff}} for ff in filters],
                    }
                },
                max_docs=max_docs,
            )
        except Exception:
            return False

        return res["updated"]

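    # Illustrative usage sketch for `update_by_query` (query and field names are
    # hypothetical):
    #
    #     updated = collection.update_by_query(
    #         query="status:open",
    #         operations=[(collection.UPDATE_SET, "status", "closed")],
    #         filters=["owner:admin"],
    #         max_docs=1000,
    #     )
    #
    # The return value is the number of documents updated, or False if the
    # update-by-query call raised an exception.
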
    def _format_output(self, result, fields=None, as_obj=True):
        # Getting search document data
        extra_fields = result.get("fields", {})
        source_data = result.pop("_source", None)

        if source_data is not None:
            for f in BANNED_FIELDS:
                source_data.pop(f, None)

        item_id = result["_id"]

        if self.model_class:
            if not fields:
                fields = list(self.stored_fields.keys())
                fields.append("id")
            elif isinstance(fields, str):
                fields = fields.split(",")

            extra_fields = _strip_lists(self.model_class, extra_fields)
            if as_obj:
                if "_index" in fields and "_index" in result:
                    extra_fields["_index"] = result["_index"]
                if "*" in fields:
                    fields = None
                return self.model_class(source_data, mask=fields, docid=item_id, extra_fields=extra_fields)
            else:
                source_data = recursive_update(source_data, extra_fields, allow_recursion=False)
                if "id" in fields:
                    source_data["id"] = item_id
                if "_index" in fields and "_index" in result:
                    source_data["_index"] = result["_index"]

        if isinstance(fields, str):
            fields = [fields]

        if fields is None or "*" in fields or "id" in fields:
            source_data["id"] = [item_id]

        if fields is None or "*" in fields:
            return source_data

        return prune(source_data, fields, self.stored_fields, mapping_class=Mapping)

    def _search(self, args=None, deep_paging_id=None, use_archive=False, track_total_hits=None):
        if args is None:
            args = []

        params = {}
        if deep_paging_id is not None:
            params = {"scroll": self.SCROLL_TIMEOUT}
        elif track_total_hits:
            params["track_total_hits"] = track_total_hits

        parsed_values = deepcopy(self.DEFAULT_SEARCH_VALUES)

        # TODO: we should validate values for max rows, group length, history length...
        for key, value in args:
            if key not in parsed_values:
                all_args = "; ".join("%s=%s" % (field_name, field_value) for field_name, field_value in args)
                raise HowlerValueError("Unknown query argument: %s %s of [%s]" % (key, value, all_args))

            parsed_values[key] = value

        # This is our minimal query, the following sections will fill it out
        # with whatever extra options the search has been given.
        query_body = {
            "query": {
                "bool": {
                    "must": {"query_string": {"query": parsed_values["query"]}},
                    "filter": [{"query_string": {"query": ff}} for ff in parsed_values["filters"]],
                }
            },
            "from_": parsed_values["start"],
            "size": parsed_values["rows"],
            "sort": parse_sort(parsed_values["sort"]),
            "_source": parsed_values["field_list"] or list(self.stored_fields.keys()),
        }

        if parsed_values["script_fields"]:
            fields = {}
            for f_name, f_script in parsed_values["script_fields"]:
                fields[f_name] = {"script": {"lang": "painless", "source": f_script}}
            query_body["script_fields"] = fields

        if parsed_values["df"]:
            query_body["query"]["bool"]["must"]["query_string"]["default_field"] = parsed_values["df"]

        # Time limit for the query
        if parsed_values["timeout"]:
            query_body["timeout"] = parsed_values["timeout"]

        # Add a histogram aggregation
        if parsed_values["histogram_active"]:
            query_body.setdefault("aggregations", {})
            if parsed_values["histogram_type"] == "date_histogram":
                interval_type = "fixed_interval"
            else:
                interval_type = "interval"
            query_body["aggregations"]["histogram"] = {
                parsed_values["histogram_type"]: {
                    "field": parsed_values["histogram_field"],
                    interval_type: parsed_values["histogram_gap"],
                    "min_doc_count": parsed_values["histogram_mincount"],
                    "extended_bounds": {
                        "min": parsed_values["histogram_start"],
                        "max": parsed_values["histogram_end"],
                    },
                }
            }

        # Add a facet aggregation
        if parsed_values["facet_active"]:
            query_body.setdefault("aggregations", {})
            for field in parsed_values["facet_fields"]:
                field_script = parsed_values["field_script"]
                if field_script:
                    facet_body = {
                        "script": {"source": field_script},
                        "min_doc_count": parsed_values["facet_mincount"],
                    }
                else:
                    facet_body = {
                        "field": field,
                        "min_doc_count": parsed_values["facet_mincount"],
                        "size": parsed_values["rows"],
                    }
                query_body["aggregations"][field] = {"terms": facet_body}

        # Add a stats aggregation
        if parsed_values["stats_active"]:
            query_body.setdefault("aggregations", {})
            for field in parsed_values["stats_fields"]:
                field_script = parsed_values["field_script"]
                if field_script:
                    stats_body = {"script": {"source": field_script}}
                else:
                    stats_body = {"field": field}

                query_body["aggregations"][f"{field}_stats"] = {"stats": stats_body}

        # Add a group aggregation
        if parsed_values["group_active"]:
            query_body["collapse"] = {
                "field": parsed_values["group_field"],
                "inner_hits": {
                    "name": "group",
                    "_source": parsed_values["field_list"] or list(self.stored_fields.keys()),
                    "size": parsed_values["group_limit"],
                    "sort": parse_sort(parsed_values["group_sort"]) or [{parsed_values["group_field"]: "asc"}],
                },
            }

        try:
            if deep_paging_id is not None and not deep_paging_id == "*":
                # Get the next page
                result = self.with_retries(
                    self.datastore.client.scroll,
                    scroll_id=deep_paging_id,
                    **params,
                )
            else:
                # Run the query
                result = self.with_retries(
                    self.datastore.client.search,
                    index=self.name,
                    **params,
                    **query_body,
                )

            return result
        except (
            elasticsearch.ConnectionError,
            elasticsearch.ConnectionTimeout,
        ) as error:
            raise SearchRetryException("collection: %s, query: %s, error: %s" % (self.name, query_body, str(error)))

        except (elasticsearch.TransportError, elasticsearch.RequestError) as e:
            try:
                err_msg = e.info["error"]["root_cause"][0]["reason"]  # type: ignore
            except (ValueError, KeyError, IndexError):
                err_msg = str(e)

            raise SearchException(err_msg)

        except Exception as error:
            raise SearchException("collection: %s, query: %s, error: %s" % (self.name, query_body, str(error)))

    def search(
        self,
        query,
        offset=0,
        rows=None,
        sort=None,
        fl=None,
        timeout=None,
        filters=None,
        access_control=None,
        deep_paging_id=None,
        as_obj=True,
        use_archive=False,
        track_total_hits=None,
        script_fields=[],
    ):
        """This function should perform a search through the datastore and return a
        search result object that consists of the following::

            {
                "offset": 0,      # Offset in the search index
                "rows": 25,       # Number of documents returned per page
                "total": 123456,  # Total number of documents matching the query
                "items": [        # List of dictionaries where each key is one of
                    {             # the fields specified in the field list parameter
                        fl[0]: value,
                        ...
                        fl[x]: value
                    }, ...]
            }

        :param script_fields: List of name/script tuples of fields to be evaluated at runtime
        :param track_total_hits: Return the total matching document count
        :param use_archive: Query also the archive
        :param deep_paging_id: ID of the next page during deep paging searches
        :param as_obj: Return objects instead of dictionaries
        :param query: lucene query to search for
        :param offset: offset at which you want the results to start (paging)
        :param rows: number of items that the search function should return
        :param sort: field to sort the data with
        :param fl: list of fields to return from the search
        :param timeout: maximum time of execution
        :param filters: additional queries to run on the original query to reduce the scope
        :param access_control: access control parameters to limit the scope of the query
        :return: a search result object
        """
        if offset is None:
            offset = self.DEFAULT_OFFSET

        if rows is None:
            rows = self.DEFAULT_ROW_SIZE

        if sort is None:
            sort = self.DEFAULT_SORT

        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]

        if access_control:
            filters.append(access_control)

        args = [
            ("query", query),
            ("start", offset),
            ("rows", rows),
            ("sort", sort),
            ("df", self.DEFAULT_SEARCH_FIELD),
        ]

        if fl:
            field_list = fl.split(",")
            args.append(("field_list", field_list))
        else:
            field_list = None

        if timeout:
            args.append(("timeout", "%sms" % timeout))

        if filters:
            args.append(("filters", filters))

        if script_fields:
            args.append(("script_fields", script_fields))

        result = self._search(
            args,
            deep_paging_id=deep_paging_id,
            use_archive=use_archive,
            track_total_hits=track_total_hits,
        )

        ret_data: dict[str, Any] = {
            "offset": int(offset),
            "rows": int(rows),
            "total": int(result["hits"]["total"]["value"]),
            "items": [self._format_output(doc, field_list, as_obj=as_obj) for doc in result["hits"]["hits"]],
        }

        new_deep_paging_id = result.get("_scroll_id", None)

        # Check if the scroll is finished and close it
        if deep_paging_id is not None and new_deep_paging_id is None:
            self.with_retries(
                self.datastore.client.clear_scroll,
                scroll_id=[deep_paging_id],
                ignore=(404,),
            )

        # Check if we can tell from inspection that we have finished the scroll
        if new_deep_paging_id is not None and len(ret_data["items"]) < ret_data["rows"]:
            self.with_retries(
                self.datastore.client.clear_scroll,
                scroll_id=[new_deep_paging_id],
                ignore=(404,),
            )
            new_deep_paging_id = None

        if new_deep_paging_id is not None:
            ret_data["next_deep_paging_id"] = new_deep_paging_id

        return ret_data

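    # Illustrative usage sketch for `search` (the query and field names shown are
    # hypothetical):
    #
    #     page = collection.search(
    #         "howler.status:open",
    #         rows=25,
    #         offset=0,
    #         fl="id,timestamp",
    #         as_obj=False,
    #     )
    #     for item in page["items"]:
    #         ...
    #
    # `page` follows the shape documented above (offset, rows, total, items), with a
    # `next_deep_paging_id` key added while a deep-paging scroll is still open.
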
    def stream_search(
        self,
        query,
        fl=None,
        filters=None,
        access_control=None,
        item_buffer_size=200,
        as_obj=True,
        use_archive=False,
    ):
        """This function should perform a search through the datastore and stream
        all related results as a dictionary of key value pairs where each key
        is one of the fields specified in the field list parameter.

        >>> # noinspection PyUnresolvedReferences
        >>> {
        >>>     fl[0]: value,
        >>>     ...
        >>>     fl[x]: value
        >>> }

        :param use_archive: Query also the archive
        :param as_obj: Return objects instead of dictionaries
        :param query: lucene query to search for
        :param fl: list of fields to return from the search
        :param filters: additional queries to run on the original query to reduce the scope
        :param access_control: access control parameters to run the query with
        :param item_buffer_size: number of items to buffer with each search call
        :return: a generator of dictionaries of field list results
        """
        if item_buffer_size > 2000 or item_buffer_size < 50:
            raise SearchException("Variable item_buffer_size must be between 50 and 2000.")

        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]

        if access_control:
            filters.append(access_control)

        if fl:
            fl = fl.split(",")

        query_expression = {
            "bool": {
                "must": {
                    "query_string": {
                        "query": query,
                        "default_field": self.DEFAULT_SEARCH_FIELD,
                    }
                },
                "filter": [{"query_string": {"query": ff}} for ff in filters],
            }
        }
        sort = parse_sort(self.datastore.DEFAULT_SORT)
        source = fl or list(self.stored_fields.keys())

        for value in self.scan_with_retry(
            query=query_expression,
            sort=sort,
            source=source,
            index=self.name,
            size=item_buffer_size,
        ):
            # Unpack the results, ensure the id is always set
            yield self._format_output(value, fl, as_obj=as_obj)

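    # Illustrative usage sketch for `stream_search` (query and field names are
    # hypothetical):
    #
    #     for doc in collection.stream_search("howler.status:open", fl="id", as_obj=False):
    #         process(doc)
    #
    # Results are streamed through `scan_with_retry`, so the full result set is never
    # held in memory at once; `item_buffer_size` only controls the per-call batch size.
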
    def raw_eql_search(
        self,
        eql_query: str,
        fl: Optional[str] = None,
        filters: Optional[Union[list[str], str]] = None,
        rows: Optional[int] = None,
        timeout: Optional[int] = None,
        as_obj=True,
    ):
        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]

        parsed_filters = {
            "bool": {
                "must": {"query_string": {"query": "*:*"}},
                "filter": [{"query_string": {"query": ff}} for ff in filters],
            }
        }

        if not fl:
            fl = "howler.id"

        if rows is None:
            rows = 5

        fields = [{"field": f} for f in fl.split(",")]

        try:
            result = self.with_retries(
                self.datastore.client.eql.search,
                index=self.name,
                timestamp_field="timestamp",
                query=eql_query,
                fields=fields,
                filter=parsed_filters,
                size=rows,
                wait_for_completion_timeout=(f"{timeout}ms" if timeout is not None else None),
            )

            ret_data: dict[str, Any] = {
                "rows": int(rows),
                "total": int(result["hits"]["total"]["value"]),
                "items": [
                    self._format_output(doc, fl.split(","), as_obj=as_obj) for doc in result["hits"].get("events", [])
                ],
                "sequences": [
                    [self._format_output(doc, fl.split(","), as_obj=as_obj) for doc in sequence.get("events", [])]
                    for sequence in result["hits"].get("sequences", [])
                ],
            }

            return ret_data

        except (elasticsearch.TransportError, elasticsearch.RequestError) as e:
            try:
                err_msg = e.info["error"]["root_cause"][0]["reason"]  # type: ignore
            except (ValueError, KeyError, IndexError):
                err_msg = str(e)

            raise SearchException(err_msg)
        except Exception as error:
            raise SearchException(f"collection: {self.name}, error: {str(error)}")

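    # Illustrative usage sketch for `raw_eql_search` (the EQL query and field names
    # are hypothetical):
    #
    #     result = collection.raw_eql_search(
    #         'any where howler.status == "open"',
    #         fl="howler.id,timestamp",
    #         rows=10,
    #         as_obj=False,
    #     )
    #     events = result["items"]         # plain EQL event matches
    #     sequences = result["sequences"]  # populated for EQL sequence queries
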
    def keys(self, access_control=None):
        """This function streams the keys of all the documents of this collection.

        :param access_control: access control parameter to limit the scope of the key scan
        :return: a generator of keys
        """
        for item in self.stream_search("id:*", fl="id", access_control=access_control):
            try:
                yield item._id
            except AttributeError:
                value = item["id"]
                if isinstance(value, list):
                    for v in value:
                        yield v
                else:
                    yield value

    def _validate_steps_count(self, start, end, gap):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            gaps_count = None
            ret_type: Optional[type] = None

            try:
                start = int(start)
                end = int(end)
                gap = int(gap)

                gaps_count = int((end - start) / gap)
                ret_type = int
            except ValueError:
                pass

            if not gaps_count:
                try:
                    t_gap = gap.strip("+").strip("-")

                    parsed_start = dm(self.datastore.to_pydatemath(start)).int_timestamp
                    parsed_end = dm(self.datastore.to_pydatemath(end)).int_timestamp
                    parsed_gap = dm(self.datastore.to_pydatemath(f"+{t_gap}")).int_timestamp - dm("now").int_timestamp

                    gaps_count = int((parsed_end - parsed_start) / parsed_gap)
                    ret_type = str
                except (DateMathException, AttributeError):
                    pass

            if gaps_count is None:
                raise SearchException(
                    "Could not parse histogram ranges. Either you've mixed integer and date values or you "
                    "have invalid date math values. (start='%s', end='%s', gap='%s')" % (start, end, gap)
                )

            if gaps_count > self.MAX_FACET_LIMIT:
                raise SearchException(
                    f"Histograms are limited to a maximum of {self.MAX_FACET_LIMIT} steps. "
                    f"Current settings would generate {gaps_count} steps"
                )
            return ret_type

    def count(
        self,
        query,
        access_control=None,
    ):
        """This function should perform a count operation through the datastore and return a
        count result object that consists of the following:

            {
                "count": 123456,  # Total number of documents matching the query
            }

        :param query: lucene query to search for
        :param access_control: access control parameters to limit the scope of the query
        :return: a count result object
        """
        result = self.with_retries(self.datastore.client.count, index=self.name, q=query)

        ret_data: dict[str, Any] = {
            "count": result["count"],
        }

        return ret_data

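    # Illustrative usage sketch for `count` (the query is hypothetical):
    #
    #     total_open = collection.count("howler.status:open")["count"]
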
    def histogram(
        self,
        field,
        start,
        end,
        gap,
        query="id:*",
        mincount=None,
        filters=None,
        access_control=None,
        use_archive=False,
    ):
        type_modifier = self._validate_steps_count(start, end, gap)
        start = type_modifier(start)
        end = type_modifier(end)
        gap = type_modifier(gap)

        if mincount is None:
            mincount = 1

        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]
        filters.append("{field}:[{min} TO {max}]".format(field=field, min=start, max=end))

        args = [
            ("query", query),
            ("histogram_active", True),
            ("histogram_field", field),
            (
                "histogram_type",
                "date_histogram" if isinstance(gap, str) else "histogram",
            ),
            (
                "histogram_gap",
                gap.strip("+").strip("-") if isinstance(gap, str) else gap,
            ),
            ("histogram_mincount", mincount),
            ("histogram_start", start),
            ("histogram_end", end),
        ]

        if access_control:
            filters.append(access_control)

        if filters:
            args.append(("filters", filters))

        result = self._search(args, use_archive=use_archive)

        # Convert the histogram into a dictionary
        return {
            type_modifier(row.get("key_as_string", row["key"])): row["doc_count"]
            for row in result["aggregations"]["histogram"]["buckets"]
        }

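    # Illustrative usage sketch for `histogram` (field name and query are
    # hypothetical). Integer fields take numeric bounds; date fields take date math
    # strings, which switch the aggregation to a date_histogram:
    #
    #     buckets = collection.histogram(
    #         "timestamp",
    #         start="now-1d",
    #         end="now",
    #         gap="+1h",
    #         query="howler.status:open",
    #     )
    #     # -> {<bucket key>: <doc count>, ...} keyed by hour (shape only)
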
    def facet(
        self,
        field,
        query=None,
        prefix=None,
        contains=None,
        ignore_case=False,
        sort=None,
        rows=10,
        mincount=None,
        filters=None,
        access_control=None,
        use_archive=False,
        field_script=None,
    ):
        if not query:
            query = "id:*"

        if not mincount:
            mincount = 1

        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]

        args = [
            ("query", query),
            ("facet_active", True),
            ("facet_fields", [field]),
            ("facet_mincount", mincount),
            ("rows", rows),
        ]

        # TODO: prefix, contains, ignore_case, sort

        if access_control:
            filters.append(access_control)

        if filters:
            args.append(("filters", filters))

        if field_script:
            args.append(("field_script", field_script))

        result = self._search(args, use_archive=use_archive)

        # Convert the facet into a dictionary
        return {
            row.get("key_as_string", row["key"]): row["doc_count"] for row in result["aggregations"][field]["buckets"]
        }

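    # Illustrative usage sketch for `facet` (field name and query are hypothetical):
    #
    #     counts = collection.facet("howler.analytic", query="howler.status:open", rows=20)
    #     # -> {<term>: <doc count>, ...} for the top matching terms (shape only)
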
    def stats(
        self,
        field,
        query="id:*",
        filters=None,
        access_control=None,
        use_archive=False,
        field_script=None,
    ):
        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]

        args = [
            ("query", query),
            ("stats_active", True),
            ("stats_fields", [field]),
            ("rows", 0),
        ]

        if access_control:
            filters.append(access_control)

        if filters:
            args.append(("filters", filters))

        if field_script:
            args.append(("field_script", field_script))

        result = self._search(args, use_archive=use_archive)
        return result["aggregations"][f"{field}_stats"]

    def grouped_search(
        self,
        group_field,
        query="id:*",
        offset=0,
        sort=None,
        group_sort=None,
        fl=None,
        limit=1,
        rows=None,
        filters=None,
        access_control=None,
        as_obj=True,
        use_archive=False,
        track_total_hits=False,
    ):
        if rows is None:
            rows = self.DEFAULT_ROW_SIZE

        if sort is None:
            sort = self.DEFAULT_SORT

        if group_sort is None:
            group_sort = self.DEFAULT_SORT

        if filters is None:
            filters = []
        elif isinstance(filters, str):
            filters = [filters]

        args = [
            ("query", query),
            ("group_active", True),
            ("group_field", group_field),
            ("group_limit", limit),
            ("group_sort", group_sort),
            ("start", offset),
            ("rows", rows),
            ("sort", sort),
        ]

        filters.append("%s:*" % group_field)

        if fl:
            field_list = fl.split(",")
            args.append(("field_list", field_list))
        else:
            field_list = None

        if access_control:
            filters.append(access_control)

        if filters:
            args.append(("filters", filters))

        result = self._search(args, use_archive=use_archive, track_total_hits=track_total_hits)

        return {
            "offset": offset,
            "rows": rows,
            "total": int(result["hits"]["total"]["value"]),
            "items": [
                {
                    "value": collapsed["fields"][group_field][0],
                    "total": int(collapsed["inner_hits"]["group"]["hits"]["total"]["value"]),
                    "items": [
                        self._format_output(row, field_list, as_obj=as_obj)
                        for row in collapsed["inner_hits"]["group"]["hits"]["hits"]
                    ],
                }
                for collapsed in result["hits"]["hits"]
            ],
        }

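    # Illustrative usage sketch for `grouped_search` (field names and query are
    # hypothetical):
    #
    #     groups = collection.grouped_search(
    #         "howler.assignment",
    #         query="howler.status:open",
    #         fl="id",
    #         limit=3,
    #         rows=10,
    #     )
    #     # groups["items"] -> [{"value": <group key>, "total": <hits in group>,
    #     #                      "items": [<up to `limit` documents>]}, ...]
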
    @staticmethod
    def _get_odm_type(ds_type):
        try:
            return BACK_MAPPING[ds_type].__name__.lower()
        except KeyError:
            return ds_type.lower()

    def fields(self, skip_mapping_children=False):
        """
        This function should return all the fields in the index with their types
        """

        def flatten_fields(props):
            out = {}
            for name, value in props.items():
                if "properties" in value:
                    for child, cprops in flatten_fields(value["properties"]).items():
                        out[name + "." + child] = cprops
                elif "type" in value:
                    out[name] = value
                else:
                    raise HowlerValueError("Unknown field data " + str(props))
            return out

        data = self.with_retries(self.datastore.client.indices.get, index=self.name)
        index_name = list(data.keys())[0]
        properties = flatten_fields(data[index_name]["mappings"].get("properties", {}))

        if self.model_class:
            model_fields = self.model_class.flat_fields()
        else:
            model_fields = {}

        collection_data = {}

        for p_name, p_val in properties.items():
            if p_name.startswith("_") or "//" in p_name:
                continue
            if not self.FIELD_SANITIZER.match(p_name):
                continue
            field_model = model_fields.get(p_name, None)

            if "." in p_name:
                parent_p_name = re.sub(r"^(.+)\..+?$", r"\1", p_name)
                if parent_p_name in model_fields and isinstance(model_fields.get(parent_p_name), Mapping):
                    if parent_p_name not in collection_data:
                        field_model = model_fields.get(parent_p_name, None)
                        f_type = self._get_odm_type(p_val.get("analyzer", None) or p_val["type"])

                        collection_data[parent_p_name] = {
                            "default": self.DEFAULT_SEARCH_FIELD in p_val.get("copy_to", []),
                            "indexed": p_val.get("index", p_val.get("enabled", True)),
                            "list": field_model.multivalued if field_model else False,
                            "stored": field_model.store if field_model else False,
                            "type": f_type,
                            "description": (field_model.description if field_model else ""),
                            "regex": (
                                field_model.child_type.validation_regex.pattern
                                if issubclass(type(field_model.child_type), ValidatedKeyword)
                                or issubclass(type(field_model.child_type), IP)
                                else None
                            ),
                            "values": (
                                list(field_model.child_type.values)
                                if issubclass(type(field_model.child_type), Enum)
                                else None
                            ),
                            "deprecated_description": (field_model.deprecated_description if field_model else ""),
                        }

                        if skip_mapping_children:
                            continue
                    else:
                        continue

            f_type = self._get_odm_type(p_val.get("analyzer", None) or p_val["type"])
            collection_data[p_name] = {
                "default": self.DEFAULT_SEARCH_FIELD in p_val.get("copy_to", []),
                "indexed": p_val.get("index", p_val.get("enabled", True)),
                "list": field_model.multivalued if field_model else False,
                "stored": field_model.store if field_model else False,
                "deprecated": field_model.deprecated if field_model else False,
                "type": f_type,
                "description": field_model.description if field_model else "",
                "regex": (
                    field_model.validation_regex.pattern
                    if issubclass(type(field_model), ValidatedKeyword) or issubclass(type(field_model), IP)
                    else None
                ),
                "values": list(field_model.values) if issubclass(type(field_model), Enum) else None,
                "deprecated_description": (field_model.deprecated_description if field_model else ""),
            }

        collection_data.pop("id", None)

        return collection_data

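    # Illustrative sketch of the structure `fields` returns (field name and values
    # are hypothetical):
    #
    #     {
    #         "howler.status": {
    #             "default": True,
    #             "indexed": True,
    #             "list": False,
    #             "stored": True,
    #             "deprecated": False,
    #             "type": "keyword",
    #             "description": "...",
    #             "regex": None,
    #             "values": ["open", "resolved"],
    #             "deprecated_description": "",
    #         },
    #         ...
    #     }
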
    def _ilm_policy_exists(self):
        try:
            self.datastore.client.ilm.get_lifecycle(name=f"{self.name}_policy")
        except elasticsearch.NotFoundError:
            return False
        else:
            return True

    def _delete_ilm_policy(self):
        try:
            self.datastore.client.ilm.delete_lifecycle(name=f"{self.name}_policy")
        except elasticsearch.ApiError:
            return False
        else:
            return True

    def _get_index_settings(self) -> dict:
        default_stub: dict = deepcopy(default_index)
        settings: dict = default_stub.pop("settings", {})

        if "index" not in settings:
            settings["index"] = {}
        settings["index"]["number_of_shards"] = self.shards
        settings["index"]["number_of_replicas"] = self.replicas

        if "mapping" not in settings["index"]:
            settings["index"]["mapping"] = {}

        if "total_fields" not in settings["index"]["mapping"]:
            settings["index"]["mapping"]["total_fields"] = {}

        limit = len(self.model_class.flat_fields()) + 500 if self.model_class else 1500
        if limit < 1500:
            limit = 1500
        elif limit > 1500:
            logger.warning("ODM field size is larger than 1500 - set to %s", limit)
        settings["index"]["mapping"]["total_fields"]["limit"] = limit

        return settings

    def _get_index_mappings(self) -> dict:
        mappings: dict = deepcopy(default_mapping)
        if self.model_class:
            mappings["properties"], mappings["dynamic_templates"] = build_mapping(self.model_class.fields().values())
            mappings["dynamic_templates"].insert(0, default_dynamic_strings)
        else:
            mappings["dynamic_templates"] = deepcopy(default_dynamic_templates)

        if not mappings["dynamic_templates"]:
            # Setting dynamic to strict prevents documents with fields not in the properties from being added
            mappings["dynamic"] = "strict"

        mappings["properties"]["id"] = {
            "store": True,
            "doc_values": True,
            "type": "keyword",
        }

        mappings["properties"]["__text__"] = {
            "store": False,
            "type": "text",
        }

        return mappings

    def __get_possible_fields(self, field):
        field_types = [field.__name__.lower()]
        if field.__bases__[0] != _Field:
            field_types.extend(self.__get_possible_fields(field.__bases__[0]))

        if field_type := TYPE_MAPPING.get(field.__name__, None):
            field_types.append(field_type)

        return field_types

    def _check_fields(self, model=None):
        if not self.validate:
            return

        if model is None:
            if self.model_class:
                return self._check_fields(self.model_class)
            return

        fields = self.fields()
        model = self.model_class.flat_fields(skip_mappings=True)

        missing = set(model.keys()) - set(fields.keys())
        if missing:
            # TODO: Bump mapping limit
            try:
                self._add_fields({key: model[key] for key in missing})
            except elasticsearch.BadRequestError as err:
                handled = False
                if err.body and isinstance(err.body, dict) and "error" in err.body and "reason" in err.body["error"]:
                    reason: str = err.body["error"]["reason"]
                    if reason.startswith("Limit of total fields"):
                        current_count = int(re.sub(r".+\[(\d+)].+", r"\1", reason))
                        logger.warning(
                            "Current field cap %s is too low, increasing to %s", current_count, current_count + 500
                        )
                        self.with_retries(
                            self.datastore.client.indices.put_settings,
                            settings={"index.mapping.total_fields.limit": current_count + 500},
                        )
                        self._add_fields({key: model[key] for key in missing})
                        handled = True
                if not handled:
                    raise

        matching = set(fields.keys()) & set(model.keys())
        for field_name in matching:
            if fields[field_name]["indexed"] != model[field_name].index and model[field_name].index:
                raise HowlerRuntimeError(f"Field {field_name} should be indexed but is not.")

            possible_field_types = self.__get_possible_fields(model[field_name].__class__)

            if fields[field_name]["type"] not in possible_field_types:
                raise HowlerRuntimeError(
                    f"Field {field_name} didn't have the expected store "
                    f"type. [{fields[field_name]['type']} != "
                    f"{model[field_name].__class__.__name__.lower()}]"
                )

    def _ensure_collection(self):
        """This function should test if the collection that you are trying to access does indeed exist
        and should create it if it does not.

        :return:
        """
        # Create HOT index
        if not self.with_retries(self.datastore.client.indices.exists, index=self.name):
            logger.debug(f"Index {self.name.upper()} does not exist. Creating it now...")
            try:
                self.with_retries(
                    self.datastore.client.indices.create,
                    index=self.index_name,
                    mappings=self._get_index_mappings(),
                    settings=self._get_index_settings(),
                )
            except elasticsearch.exceptions.RequestError as e:
                if "resource_already_exists_exception" not in str(e):
                    raise
                logger.warning(f"Tried to create an index template that already exists: {self.name.upper()}")

            self.with_retries(
                self.datastore.client.indices.put_alias,
                index=self.index_name,
                name=self.name,
            )
        elif not self.with_retries(
            self.datastore.client.indices.exists, index=self.index_name
        ) and not self.with_retries(self.datastore.client.indices.exists_alias, name=self.name):
            # Turn on write block
            self.with_retries(self.datastore.client.indices.put_settings, settings=write_block_settings)

            # Create a copy on the result index
            self._safe_index_copy(self.datastore.client.indices.clone, self.name, self.index_name)

            # Make the hot index the new clone
            self.with_retries(
                self.datastore.client.indices.update_aliases,
                actions=[
                    {"add": {"index": self.index_name, "alias": self.name}},
                    {"remove_index": {"index": self.name}},
                ],
            )

            self.with_retries(self.datastore.client.indices.put_settings, settings=write_unblock_settings)

        self._check_fields()

    def _add_fields(self, missing_fields: Dict):
        no_fix = []
        properties = {}
        for name, field in missing_fields.items():
            # Figure out the path of the field in the document, if the name is set in the field, it
            # is going to be duplicated in the path from missing_fields, so drop it
            prefix = name.split(".")
            if field.name:
                prefix = prefix[:-1]

            # Build the fields and templates for this new mapping
            sub_properties, sub_templates = build_mapping([field], prefix=prefix, allow_refuse_implicit=False)
            properties.update(sub_properties)
            if sub_templates:
                no_fix.append(name)

        # If we have collected any fields that we can't just blindly add, as they might conflict
        # with existing things, (we might have the refuse_all_implicit_mappings rule in place)
        # simply raise an exception
        if no_fix:
            raise HowlerValueError(
                f"Can't update database mapping for {self.name}, couldn't safely amend mapping for {no_fix}"
            )

        # If we got this far, the missing fields have been described in properties, upload them to the
        # server, and we should be able to move on.
        for index in self.index_list_full:
            self.with_retries(self.datastore.client.indices.put_mapping, index=index, properties=properties)

        if self.with_retries(self.datastore.client.indices.exists_template, name=self.name):
            current_template = self.with_retries(self.datastore.client.indices.get_template, name=self.name)[self.name]
            self.with_retries(
                self.datastore.client.indices.put_template,
                name=self.name,
                **recursive_update(current_template, {"mappings": {"properties": properties}}),
            )

    def wipe(self):
        """This function should completely delete the collection

        NEVER USE THIS!

        :return:
        """
        logger.debug("Wipe operation started for collection: %s" % self.name.upper())

        for index in self.index_list:
            if self.with_retries(self.datastore.client.indices.exists, index=index):
                self.with_retries(self.datastore.client.indices.delete, index=index)

        if self.with_retries(self.datastore.client.indices.exists_template, name=self.name):
            self.with_retries(self.datastore.client.indices.delete_template, name=self.name)

        self._ensure_collection()