howler-api 3.0.0.dev374__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of howler-api might be problematic. Click here for more details.

Files changed (198) hide show
  1. howler/__init__.py +0 -0
  2. howler/actions/__init__.py +168 -0
  3. howler/actions/add_label.py +111 -0
  4. howler/actions/add_to_bundle.py +159 -0
  5. howler/actions/change_field.py +76 -0
  6. howler/actions/demote.py +160 -0
  7. howler/actions/example_plugin.py +104 -0
  8. howler/actions/prioritization.py +93 -0
  9. howler/actions/promote.py +147 -0
  10. howler/actions/remove_from_bundle.py +133 -0
  11. howler/actions/remove_label.py +111 -0
  12. howler/actions/transition.py +200 -0
  13. howler/api/__init__.py +249 -0
  14. howler/api/base.py +88 -0
  15. howler/api/socket.py +114 -0
  16. howler/api/v1/__init__.py +97 -0
  17. howler/api/v1/action.py +372 -0
  18. howler/api/v1/analytic.py +748 -0
  19. howler/api/v1/auth.py +382 -0
  20. howler/api/v1/clue.py +99 -0
  21. howler/api/v1/configs.py +58 -0
  22. howler/api/v1/dossier.py +222 -0
  23. howler/api/v1/help.py +28 -0
  24. howler/api/v1/hit.py +1181 -0
  25. howler/api/v1/notebook.py +82 -0
  26. howler/api/v1/overview.py +191 -0
  27. howler/api/v1/search.py +788 -0
  28. howler/api/v1/template.py +206 -0
  29. howler/api/v1/tool.py +183 -0
  30. howler/api/v1/user.py +416 -0
  31. howler/api/v1/utils/__init__.py +0 -0
  32. howler/api/v1/utils/etag.py +84 -0
  33. howler/api/v1/view.py +288 -0
  34. howler/app.py +235 -0
  35. howler/common/README.md +125 -0
  36. howler/common/__init__.py +0 -0
  37. howler/common/classification.py +979 -0
  38. howler/common/classification.yml +107 -0
  39. howler/common/exceptions.py +167 -0
  40. howler/common/loader.py +154 -0
  41. howler/common/logging/__init__.py +241 -0
  42. howler/common/logging/audit.py +138 -0
  43. howler/common/logging/format.py +38 -0
  44. howler/common/net.py +79 -0
  45. howler/common/net_static.py +1494 -0
  46. howler/common/random_user.py +316 -0
  47. howler/common/swagger.py +117 -0
  48. howler/config.py +64 -0
  49. howler/cronjobs/__init__.py +29 -0
  50. howler/cronjobs/retention.py +61 -0
  51. howler/cronjobs/rules.py +274 -0
  52. howler/cronjobs/view_cleanup.py +88 -0
  53. howler/datastore/README.md +112 -0
  54. howler/datastore/__init__.py +0 -0
  55. howler/datastore/bulk.py +72 -0
  56. howler/datastore/collection.py +2342 -0
  57. howler/datastore/constants.py +119 -0
  58. howler/datastore/exceptions.py +41 -0
  59. howler/datastore/howler_store.py +105 -0
  60. howler/datastore/migrations/fix_process.py +41 -0
  61. howler/datastore/operations.py +130 -0
  62. howler/datastore/schemas.py +90 -0
  63. howler/datastore/store.py +231 -0
  64. howler/datastore/support/__init__.py +0 -0
  65. howler/datastore/support/build.py +215 -0
  66. howler/datastore/support/schemas.py +90 -0
  67. howler/datastore/types.py +22 -0
  68. howler/error.py +91 -0
  69. howler/external/__init__.py +0 -0
  70. howler/external/generate_mitre.py +96 -0
  71. howler/external/generate_sigma_rules.py +31 -0
  72. howler/external/generate_tlds.py +47 -0
  73. howler/external/reindex_data.py +66 -0
  74. howler/external/wipe_databases.py +58 -0
  75. howler/gunicorn_config.py +25 -0
  76. howler/healthz.py +47 -0
  77. howler/helper/__init__.py +0 -0
  78. howler/helper/azure.py +50 -0
  79. howler/helper/discover.py +59 -0
  80. howler/helper/hit.py +236 -0
  81. howler/helper/oauth.py +247 -0
  82. howler/helper/search.py +92 -0
  83. howler/helper/workflow.py +110 -0
  84. howler/helper/ws.py +378 -0
  85. howler/odm/README.md +102 -0
  86. howler/odm/__init__.py +1 -0
  87. howler/odm/base.py +1543 -0
  88. howler/odm/charter.txt +146 -0
  89. howler/odm/helper.py +416 -0
  90. howler/odm/howler_enum.py +25 -0
  91. howler/odm/models/__init__.py +0 -0
  92. howler/odm/models/action.py +33 -0
  93. howler/odm/models/analytic.py +90 -0
  94. howler/odm/models/assemblyline.py +48 -0
  95. howler/odm/models/aws.py +23 -0
  96. howler/odm/models/azure.py +16 -0
  97. howler/odm/models/cbs.py +44 -0
  98. howler/odm/models/config.py +558 -0
  99. howler/odm/models/dossier.py +33 -0
  100. howler/odm/models/ecs/__init__.py +0 -0
  101. howler/odm/models/ecs/agent.py +17 -0
  102. howler/odm/models/ecs/autonomous_system.py +16 -0
  103. howler/odm/models/ecs/client.py +149 -0
  104. howler/odm/models/ecs/cloud.py +141 -0
  105. howler/odm/models/ecs/code_signature.py +27 -0
  106. howler/odm/models/ecs/container.py +32 -0
  107. howler/odm/models/ecs/dns.py +62 -0
  108. howler/odm/models/ecs/egress.py +10 -0
  109. howler/odm/models/ecs/elf.py +74 -0
  110. howler/odm/models/ecs/email.py +122 -0
  111. howler/odm/models/ecs/error.py +14 -0
  112. howler/odm/models/ecs/event.py +140 -0
  113. howler/odm/models/ecs/faas.py +24 -0
  114. howler/odm/models/ecs/file.py +84 -0
  115. howler/odm/models/ecs/geo.py +30 -0
  116. howler/odm/models/ecs/group.py +18 -0
  117. howler/odm/models/ecs/hash.py +16 -0
  118. howler/odm/models/ecs/host.py +17 -0
  119. howler/odm/models/ecs/http.py +37 -0
  120. howler/odm/models/ecs/ingress.py +12 -0
  121. howler/odm/models/ecs/interface.py +21 -0
  122. howler/odm/models/ecs/network.py +30 -0
  123. howler/odm/models/ecs/observer.py +45 -0
  124. howler/odm/models/ecs/organization.py +12 -0
  125. howler/odm/models/ecs/os.py +21 -0
  126. howler/odm/models/ecs/pe.py +17 -0
  127. howler/odm/models/ecs/process.py +216 -0
  128. howler/odm/models/ecs/registry.py +26 -0
  129. howler/odm/models/ecs/related.py +45 -0
  130. howler/odm/models/ecs/rule.py +51 -0
  131. howler/odm/models/ecs/server.py +24 -0
  132. howler/odm/models/ecs/threat.py +247 -0
  133. howler/odm/models/ecs/tls.py +58 -0
  134. howler/odm/models/ecs/url.py +51 -0
  135. howler/odm/models/ecs/user.py +57 -0
  136. howler/odm/models/ecs/user_agent.py +20 -0
  137. howler/odm/models/ecs/vulnerability.py +41 -0
  138. howler/odm/models/gcp.py +16 -0
  139. howler/odm/models/hit.py +356 -0
  140. howler/odm/models/howler_data.py +328 -0
  141. howler/odm/models/lead.py +24 -0
  142. howler/odm/models/localized_label.py +13 -0
  143. howler/odm/models/overview.py +16 -0
  144. howler/odm/models/pivot.py +40 -0
  145. howler/odm/models/template.py +24 -0
  146. howler/odm/models/user.py +83 -0
  147. howler/odm/models/view.py +34 -0
  148. howler/odm/random_data.py +888 -0
  149. howler/odm/randomizer.py +609 -0
  150. howler/patched.py +5 -0
  151. howler/plugins/__init__.py +25 -0
  152. howler/plugins/config.py +123 -0
  153. howler/remote/__init__.py +0 -0
  154. howler/remote/datatypes/README.md +355 -0
  155. howler/remote/datatypes/__init__.py +98 -0
  156. howler/remote/datatypes/counters.py +63 -0
  157. howler/remote/datatypes/events.py +66 -0
  158. howler/remote/datatypes/hash.py +206 -0
  159. howler/remote/datatypes/lock.py +42 -0
  160. howler/remote/datatypes/queues/__init__.py +0 -0
  161. howler/remote/datatypes/queues/comms.py +59 -0
  162. howler/remote/datatypes/queues/multi.py +32 -0
  163. howler/remote/datatypes/queues/named.py +93 -0
  164. howler/remote/datatypes/queues/priority.py +215 -0
  165. howler/remote/datatypes/set.py +118 -0
  166. howler/remote/datatypes/user_quota_tracker.py +54 -0
  167. howler/security/__init__.py +253 -0
  168. howler/security/socket.py +108 -0
  169. howler/security/utils.py +185 -0
  170. howler/services/__init__.py +0 -0
  171. howler/services/action_service.py +111 -0
  172. howler/services/analytic_service.py +128 -0
  173. howler/services/auth_service.py +323 -0
  174. howler/services/config_service.py +128 -0
  175. howler/services/dossier_service.py +252 -0
  176. howler/services/event_service.py +93 -0
  177. howler/services/hit_service.py +893 -0
  178. howler/services/jwt_service.py +158 -0
  179. howler/services/lucene_service.py +286 -0
  180. howler/services/notebook_service.py +119 -0
  181. howler/services/overview_service.py +44 -0
  182. howler/services/template_service.py +45 -0
  183. howler/services/user_service.py +331 -0
  184. howler/utils/__init__.py +0 -0
  185. howler/utils/annotations.py +28 -0
  186. howler/utils/chunk.py +38 -0
  187. howler/utils/dict_utils.py +200 -0
  188. howler/utils/isotime.py +17 -0
  189. howler/utils/list_utils.py +11 -0
  190. howler/utils/lucene.py +77 -0
  191. howler/utils/path.py +27 -0
  192. howler/utils/socket_utils.py +61 -0
  193. howler/utils/str_utils.py +256 -0
  194. howler/utils/uid.py +47 -0
  195. howler_api-3.0.0.dev374.dist-info/METADATA +71 -0
  196. howler_api-3.0.0.dev374.dist-info/RECORD +198 -0
  197. howler_api-3.0.0.dev374.dist-info/WHEEL +4 -0
  198. howler_api-3.0.0.dev374.dist-info/entry_points.txt +8 -0
@@ -0,0 +1,158 @@
1
+ # implementation based on this stackoverflow post:
2
+ # https://stackoverflow.com/a/67943659
3
+
4
+
5
+ from typing import Any, Optional
6
+
7
+ import jwt
8
+ import requests
9
+ from jwt.api_jwk import PyJWK
10
+
11
+ from howler.common.exceptions import ForbiddenException, HowlerKeyError, HowlerValueError
12
+ from howler.common.logging import get_logger
13
+ from howler.config import cache, config
14
+
15
+ logger = get_logger(__file__)
16
+
17
+
18
def get_jwk(access_token: str) -> PyJWK:
    """Get the JSON Web Key associated with the given JWT.

    Args:
        access_token (str): The encoded JWT; its unverified header supplies the key id ("kid")

    Raises:
        HowlerKeyError: No key with the token's "kid" exists, even after refreshing the cached key set

    Returns:
        PyJWK: The key matching the token's "kid"
    """
    # "kid" is the JSON Web Key's identifier. It tells us which key was used to sign the token.
    kid = jwt.get_unverified_header(access_token).get("kid")
    jwks, _ = get_jwks()

    try:
        # Check to see if we have it cached
        key = PyJWK(jwks[kid])
    except KeyError:
        # We don't, so we need to refresh the key set
        cache.delete(key="get_jwks")
        try:
            jwks, _ = get_jwks()
            # Wrap the refreshed entry too: callers (e.g. decode) read `.key`, which a raw JWK dict
            # does not have.
            key = PyJWK(jwks[kid])
        except KeyError:
            raise HowlerKeyError("Specified Key Set does not exist.")

    return key
37
+
38
+
39
def get_provider(access_token: str) -> str:
    """Determine which OAuth provider issued the given access token.

    The lookup is done by key id: the token's unverified header names a "kid", and the merged key set
    records which provider published each "kid".

    Args:
        access_token (str): The access token to determine the provider of

    Raises:
        HowlerValueError: The provider of this access token does not match any supported providers

    Returns:
        str: The provider of the token
    """
    token_header = jwt.get_unverified_header(access_token)
    kid = token_header.get("kid")

    _, known_providers = get_jwks()

    try:
        # Fast path: the (possibly cached) key set already knows this kid
        return known_providers[kid]
    except KeyError:
        # Unknown kid: drop the cached key set and look again in a fresh one
        cache.delete(key="get_jwks")
        try:
            _, refreshed_providers = get_jwks()
            return refreshed_providers[kid]
        except KeyError:
            raise HowlerValueError("The provider of this access token does not match any supported providers")
68
+
69
+
70
@cache.cached(timeout=60 * 60 * 12, key_prefix="get_jwks")  # Cached for 12hrs
def get_jwks() -> tuple[dict[str, dict[str, Any]], dict[str, str]]:
    """Get the JSON Web Key Set for all supported providers

    Returns:
        tuple[dict[str, dict[str, Any]], dict[str, str]]: The merged JWKS (key id -> JWK) and a mapping of
            each key id to the name of the provider that published it
    """
    # JWKS = JSON Web Key Set. We merge the key set from all oauth providers
    jwks: dict[str, dict[str, Any]] = {}
    # Mapping of keys to their provider (i.e. azure, keycloak)
    providers: dict[str, str] = {}

    for provider_name, provider_data in config.auth.oauth.providers.items():
        # Fetch the JSON Web Key Set for each provider that supports them
        if not provider_data.jwks_uri:
            continue

        provider_jwks: list[dict[str, Any]] = requests.get(provider_data.jwks_uri, timeout=10).json()["keys"]
        for jwk in provider_jwks:
            kid = jwk.get("kid")
            if kid is None:
                # "kid" is an optional JWK member. A key without one can never be selected from a token
                # header, so skip it rather than failing the whole key set with a KeyError.
                logger.warning("Skipping JWK without a kid from provider %s", provider_name)
                continue

            jwks[kid] = jwk
            providers[kid] = provider_name

    return (jwks, providers)
94
+
95
+
96
def get_audience(oauth_provider: str) -> str:
    """Resolve the audience value to validate tokens against for an OAuth provider.

    Args:
        oauth_provider (str): The OAuth provider to retrieve the audience of

    Raises:
        HowlerValueError: The provider is azure and its configured scope does not contain
            "<audience>/.default"

    Returns:
        str: The audience of the provider
    """
    provider_data = config.auth.oauth.providers[oauth_provider]

    # Precedence: explicitly configured audience, then the client id, then the "howler" default
    audience: str = provider_data.audience or provider_data.client_id or "howler"

    if oauth_provider == "azure":
        required_scope = f"{audience}/.default"
        if required_scope not in provider_data.scope:
            raise HowlerValueError("Azure scope must contain the <client_id>/.default claim!")

    return audience
119
+
120
+
121
def decode(
    access_token: str,
    key: Optional[str] = None,
    algorithms: Optional[list[str]] = None,
    audience: Optional[str] = None,
    validate_audience: bool = False,
    **kwargs,
) -> dict[str, Any]:
    """Decode an access token into a JSON Web Token dict

    Args:
        access_token (str): The access token to decode
        key (Optional[str], optional): The key used to sign the token. When omitted, it is looked up from
            the JWKS using the token's "kid". Defaults to None.
        algorithms (Optional[list[str]], optional): The algorithms to accept when decoding. When omitted,
            the "alg" of the token's header is used ("HS256" if absent). Defaults to None.
        audience (Optional[str], optional): The audience to check against, if validating the audience.
            Defaults to None.
        validate_audience (bool, optional): Should we validate the audience? When True and no audience is
            given, the audience of the token's provider is used. Defaults to False.
        **kwargs: Forwarded unchanged to jwt.decode

    Raises:
        ForbiddenException: The token has expired
        HowlerValueError: The token is otherwise invalid

    Returns:
        dict[str, Any]: The decoded JWT, in dict format
    """
    if not key:
        key = get_jwk(access_token).key

    if not algorithms:
        # NOTE(review): the accepted algorithm is taken from the token's own, not yet verified, header.
        # Consider pinning the allowed algorithms per provider - confirm with callers before changing.
        unverified_header = jwt.get_unverified_header(access_token)
        algorithms = [unverified_header.get("alg", "HS256")]

    needs_audience_lookup = validate_audience and not audience
    if needs_audience_lookup:
        audience = get_audience(get_provider(access_token))

    try:
        logger.debug("Validating token against audience %s", audience)
        return jwt.decode(jwt=access_token, key=key, algorithms=algorithms, audience=audience, **kwargs)  # type: ignore
    except jwt.ExpiredSignatureError as err:
        # Must be handled before InvalidTokenError, which it derives from
        logger.info("JWT has expired.")
        raise ForbiddenException("Your JWT has expired.", cause=err)
    except jwt.InvalidTokenError as err:
        logger.exception("Error occurred when decoding JWT.")
        raise HowlerValueError("There was an error when decoding your JWT.", cause=err)
@@ -0,0 +1,286 @@
1
+ import fnmatch
2
+ import os
3
+ import re
4
+ import sys
5
+ from datetime import datetime, timedelta
6
+ from hashlib import sha256
7
+ from typing import Any, Literal, Union, cast
8
+
9
+ from elasticsearch._sync.client.indices import IndicesClient
10
+ from luqum.parser import parser
11
+ from luqum.tree import AndOperation, BoolOperation, Phrase, Plus, Prohibit, Range, SearchField, Word
12
+ from luqum.utils import UnknownOperationResolver
13
+ from luqum.visitor import TreeVisitor
14
+
15
+ from howler.api import get_logger
16
+ from howler.common.exceptions import InvalidDataException
17
+ from howler.common.loader import datastore
18
+ from howler.config import redis
19
+ from howler.remote.datatypes.hash import Hash
20
+ from howler.utils.dict_utils import flatten_deep
21
+ from howler.utils.lucene import coerce, normalize_phrase, try_parse_date, try_parse_ip, try_parse_number
22
+
23
+ logger = get_logger(__file__)
24
+
25
+ TRANSPORT_TIMEOUT = int(os.environ.get("HWL_DATASTORE_TRANSPORT_TIMEOUT", "10"))
26
+
27
+
28
class LuceneProcessor(TreeVisitor):
    "Tree visitor that evaluates a parsed lucene query against the object stored in context['hit']"

    def visit(self, tree: Any, context: dict[str, Any]) -> bool:
        "Evaluate the whole tree and return the first (top-level) result"
        outcomes = super().visit(tree, context)
        return outcomes[0]

    def visit_search_field(self, node: SearchField, context: dict[str, Any]):
        "Handle search fields"
        # The search field itself validates nothing - the words/phrases/ranges below it do.
        # All it contributes is the field name, handed down through the context.
        field_context = {**context, "field": node.name}
        yield from self.generic_visit(node, field_context)

    def visit_and_operation(self, node: AndOperation, context: dict[str, Any]):
        "Handle AND results in query"
        # Materialise first so every child is visited before combining
        child_results = list(self.generic_visit(node, context))
        yield all(child_results)

    def visit_or_operation(self, node: AndOperation, context: dict[str, Any]):
        "Handle OR results in query"
        child_results = list(self.generic_visit(node, context))
        yield any(child_results)

    def visit_bool_operation(self, node: BoolOperation, context: dict[str, Any]):
        """Handle the insanity that is boolean operations.

        For information about how boolean operations work, see the following extremely helpful article:

        https://lucidworks.com/resources/solr-boolean-operators/

        However, we are operating in a boolean environment instead of rankings, so the behaviour is slightly modified.
        """
        collected: list[bool] = []
        for child in node.children:
            child_context = self.child_context(node, child, context)
            for outcome in self.visit_iter(child, context=child_context):
                # A MUST (plus) child that fails, or a MUST NOT (prohibit) child whose result is truthy,
                # short-circuits the whole operation to False.
                # NOTE(review): visit_prohibit already negates its child's result before it reaches this
                # check - confirm the Prohibit branch is not negating twice.
                must_failed = isinstance(child, Plus) and not outcome
                must_not_triggered = isinstance(child, Prohibit) and outcome
                if must_failed or must_not_triggered:
                    yield False
                    return

                # Everything else is combined with a plain OR
                collected.append(outcome)

        yield any(collected)

    @staticmethod
    def __parse_range(low: str, value: Union[list[str], str], high: str) -> Any:
        "Generate the low, value and high components of a range check, ensuring correct types"
        # Dates: the bounds are read as epoch milliseconds, in the timezone of the parsed value
        parsed_date = coerce(value, try_parse_date)
        if parsed_date:
            lower_date = cast(Any, datetime.fromtimestamp(int(low) / 1000, tz=parsed_date.tzinfo))

            upper_date = datetime.fromtimestamp(int(high) / 1000, tz=parsed_date.tzinfo)
            upper_date += timedelta(milliseconds=1)

            return lower_date, parsed_date, upper_date

        # Numbers, as understood by try_parse_number
        parsed_number = coerce(value, try_parse_number)
        if parsed_number:
            lower_number = coerce(low, try_parse_number)
            upper_number = coerce(high, try_parse_number)

            if lower_number is not None and upper_number is not None:
                return lower_number, parsed_number, upper_number

        try:
            # Check if the value is a simple integer
            return int(low), coerce(value, int), int(high)
        except ValueError:
            pass

        # IP addresses
        parsed_ip = coerce(value, try_parse_ip)
        if parsed_ip:
            lower_ip = coerce(low, try_parse_ip)
            upper_ip = coerce(high, try_parse_ip)

            if lower_ip is not None and upper_ip is not None:
                return lower_ip, parsed_ip, upper_ip

        try:
            # Check if the value is a float
            return float(low), coerce(value, float), float(high)
        except ValueError:
            pass

        raise InvalidDataException(f"Unknown range type for values {low} - {value} - {high}")

    def visit_range(self, node: Range, context: dict[str, Any]):
        "Handle range queries"
        raw_low = node.low.value
        field_value = context["hit"].get(context["field"])
        raw_high = node.high.value
        low, value, high = self.__parse_range(raw_low, field_value, raw_high)

        # Treat a scalar as a one-element list; the range matches if any entry falls inside it
        entries = value if isinstance(value, list) else [value]

        for entry in entries:
            if not (low <= entry <= high):
                continue

            # Honour exclusive bounds
            if not node.include_high and entry == high:
                continue
            if not node.include_low and entry == low:
                continue

            yield True
            return

        yield False

    @staticmethod
    def __sanitize_value(value: str) -> str:
        "Sanitize the value we are validating against"
        substitutions = (
            # True/False are shorthanded by elastic - convert back to True/False
            (r"^F$", r"False"),
            (r"^T$", r"True"),
            # For phrases, remove the encapsulating quotations
            (r'"(.+)"', r"\1"),
        )

        cleaned = value
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)

        # Unescape escaped colons in value
        return cleaned.replace("\\:", ":")

    @staticmethod
    def __build_candidates(value: Union[list[str], str], type: Union[Literal["phrase"], Literal["word"]]) -> list[str]:
        "Normalize the field value (or each entry of a list value) into the strings a term is matched against"
        if not isinstance(value, list):
            return normalize_phrase(str(value), type)

        candidates: list[str] = []
        for entry in value:
            candidates.extend(normalize_phrase(str(entry), type))

        return candidates

    def __handle_word_or_phrase(self, node: Union[Phrase, Word], context: dict[str, Any]):
        "Shared evaluation of a word or phrase term against the hit"
        sanitized_value = self.__sanitize_value(node.value)
        hit = context["hit"]

        if "field" not in context:
            # No field given: the term must equal some value of the hit exactly
            yield any(entry == sanitized_value for entry in hit.values())
            return

        if context["field"] == "_exists_":
            # For _exists_ queries the term is itself a field name
            yield hit.get(node.value) is not None
            return

        candidates = self.__build_candidates(hit.get(context["field"]), context["term_type"])

        # fnmatch provides the wildcard matching of the term against the candidates
        yield bool(fnmatch.filter(candidates, sanitized_value))

    def visit_word(self, node: Phrase, context: dict[str, Any]):
        "Handle words"
        word_context = {**context, "term_type": "word"}
        yield from self.__handle_word_or_phrase(node, word_context)

    def visit_phrase(self, node: Phrase, context: dict[str, Any]):
        "Handle phrases"
        phrase_context = {**context, "term_type": "phrase"}
        yield from self.__handle_word_or_phrase(node, phrase_context)

    def visit_prohibit(self, node: Prohibit, context: dict[str, Any]):
        "Handle NOT operation"
        for entry in self.generic_visit(node, context):
            yield not entry
187
+
188
+
189
+ NORMALIZED_QUERY_CACHE: Hash[str] = Hash("normalized_queries", redis)
190
+
191
+ SEARCH_PHRASE_CACHE: dict[str, re.Match[str]] = {}
192
+
193
+
194
def replace_lucene_phrase(match: re.Match[str]) -> str:
    """Replace a phrase in lucene with its sha256 hash, to circumvent mangling by ES

    Dates are left untouched and IP addresses only have their colons swapped for "@colon". Any other
    phrase is replaced by its hash, and the match is remembered in SEARCH_PHRASE_CACHE so the phrase can
    be restored later. Groups 2 and 4 of the match (whatever surrounds the phrase) are kept as-is.
    """
    prefix = match.group(2) or ""
    phrase = match.group(3)
    unquoted = phrase.replace('"', "")

    if try_parse_date(unquoted):
        replacement = phrase
    elif try_parse_ip(unquoted):
        replacement = phrase.replace(":", "@colon")
    else:
        replacement = sha256(phrase.encode()).hexdigest()
        SEARCH_PHRASE_CACHE[replacement] = match

    suffix = match.group(4) or ""

    return f"{prefix}{replacement}{suffix}"
214
+
215
+
216
def try_reinsert_lucene_phrase(match: re.Match[str]) -> str:
    "Swap a potential sha256 hash back for the lucene phrase it replaced; unknown hashes are returned unchanged"
    candidate = match.group(1)

    try:
        original_match = SEARCH_PHRASE_CACHE[candidate]
    except KeyError:
        # Not one of ours - leave the text as it was
        return candidate

    return original_match.group(3)
224
+
225
+
226
def match(lucene: str, obj: dict[str, Any]):
    """Check if a given lucene query matches the given object

    The query is first normalized through elastic's validate_query (the normalized form is cached by the
    sha256 of the query), then parsed with luqum and evaluated against the flattened object.

    Returns False when the query is invalid or when evaluating it raises.
    """
    hash_key = sha256(lucene.encode()).hexdigest()

    # We cache the results back from ES, since we will frequently run the same validation queries over and over again.
    # Under pytest the cached value is ignored and the query is always normalized again.
    normalized_query = NORMALIZED_QUERY_CACHE.get(hash_key)
    needs_normalizing = normalized_query is None or "pytest" in sys.modules

    if needs_normalizing:
        # This regex checks for lucene phrases (i.e. the "Example Analytic" part of howler.analytic:"Example Analytic")
        # And then escapes them.
        # https://regex101.com/r/8u5F6a/1
        escaped_lucene = re.sub(r'((:\()?(".+?")(\)?))', replace_lucene_phrase, lucene)

        # This may seem unintuitive, but elastic parses lucene queries in somewhat nonstandard ways (or at least,
        # in ways luqum doesn't agree with). To circumvent this, we use validate_query, which returns a "normalized"
        # query that works much better with luqum. It's also much faster than actually searching for the hit in
        # question.
        indices_client = IndicesClient(datastore().hit.datastore.client)
        result = indices_client.validate_query(q=escaped_lucene, explain=True, index=datastore().hit.index_name)

        if not result["valid"]:
            logger.error("Invalid lucene query:\n%s", result["explanations"][0]["error"])
            return False

        # As an example, the query:
        # server.address:("supports" OR "their") AND howler.votes.benign:("edge" OR "also")
        # becomes:
        # +(server.address:supports server.address:their) +(howler.votes.benign:edge howler.votes.benign:also)
        # which means the two are equivalent in elastic, but the second one is a lot less ambiguous to parse.
        normalized_query = cast(str, result["explanations"][0]["explanation"])

        cleanup_steps = (
            # Elastic's explanation mangles exists queries. Since we will handle them the normal way, reset them
            (r"FieldExistsQuery *\[.*?field=(.+?)]", r"_exists_:\1"),
            (r"ConstantScore", ""),
            # Try and reinsert any phrases we have replaced with sha256 hashes
            (r"([0-9a-f]{64})", try_reinsert_lucene_phrase),
        )
        for pattern, replacement in cleanup_steps:
            normalized_query = re.sub(pattern, replacement, normalized_query)

        # Properly convert escaped colons back
        normalized_query = normalized_query.replace("@colon", ":")

        # Cache the normalized query
        NORMALIZED_QUERY_CACHE.set(hash_key, normalized_query)

    try:
        # luqum's default tree will return UnknownOperations in cases where explicit operators aren't used.
        # Due to the normalization step undertaken by elastic, we know that all unknown operations are actually
        # Boolean operations.
        #
        # NOTE: Boolean operations have a special meaning in lucene, and are not analogous to and/or operations.
        # For more information, see: https://lucidworks.com/resources/solr-boolean-operators/
        resolver = UnknownOperationResolver(resolve_to=BoolOperation)
        tree = resolver(parser.parse(normalized_query))

        # Actually run the validation
        processor = LuceneProcessor(track_parents=True)
        return processor.visit(tree, {"hit": flatten_deep(obj)})
    except Exception:
        logger.exception("Exception on processing lucene:")
        return False
281
+
282
+
283
if __name__ == "__main__":
    # Manual check: evaluate the query given as the first CLI argument against one hit from the datastore
    search_result = datastore().hit.search("howler.id:*", rows=1, as_obj=False)
    hit = search_result["items"][0]

    print(match(sys.argv[1], hit))  # noqa: T201
@@ -0,0 +1,119 @@
1
+ from typing import Any, Callable, Optional
2
+
3
+ import chevron
4
+ import requests
5
+ from flask import request
6
+
7
+ from howler.common.exceptions import AuthenticationException, HowlerRuntimeError, HowlerValueError
8
+ from howler.common.logging import get_logger
9
+ from howler.config import cache, config
10
+ from howler.odm.models.analytic import Analytic
11
+ from howler.plugins import get_plugins
12
+
13
+ logger = get_logger(__file__)
14
+
15
+
16
@cache.memoize(15 * 60)
def get_token(access_token: str) -> str:
    """Get a notebook token based on the current howler token

    The first plugin exposing a "notebook" token function is used to derive the notebook token. When no
    plugin provides one, the howler access token is returned unchanged.

    Args:
        access_token (str): The howler access token of the current user

    Returns:
        str: The token to present to the notebook service
    """
    get_notebook_token: Optional[Callable[[str], str]] = None

    for plugin in get_plugins():
        if get_notebook_token := plugin.modules.token_functions.get("notebook", None):
            break

        # The message has a %s placeholder, so the plugin must be passed as a logging argument;
        # without it the literal "%s" is logged. Fall back to the plugin object if it has no name.
        logger.info("Plugin %s does not modify the notebook access token.", getattr(plugin, "name", plugin))

    if get_notebook_token:
        return get_notebook_token(access_token)

    logger.info("No custom notebook token logic provided, continuing with howler credentials")
    return access_token
34
+
35
+
36
def get_nbgallery_nb(link: str):
    """Get a notebook from a given nbgallery link

    Args:
        link (str): Link to the notebook, ending in "<id>-<slug>" (e.g. /notebooks/1-example-nb)

    Raises:
        AuthenticationException: The request has no Authorization header, or the header carries no token

    Returns:
        tuple: (notebook json, notebook title) on success, (None, None) if nbgallery did not return OK
    """
    # /notebooks/1-example-nb
    # get the id (1)
    nb_id = link.rsplit("/", 1)[-1].rsplit("-")[0]
    auth_data: Optional[str] = request.headers.get("Authorization", None, type=str)

    if not auth_data:
        raise AuthenticationException("No Authorization header present")

    # Expect "<scheme> <token>". A header without a token part used to surface as an IndexError.
    auth_parts = auth_data.split(" ")
    if len(auth_parts) < 2 or not auth_parts[1]:
        raise AuthenticationException("Malformed Authorization header")

    access_token = get_token(auth_parts[1])

    # use obo token to retrieve notebook value
    notebook_req = requests.get(
        f"{config.core.notebook.url}/notebooks/{nb_id}/download.json",
        headers={
            "accept": "application/json",
            "Authorization": f"Bearer {access_token}",
        },
        timeout=5,
    )

    if not notebook_req.ok:
        return None, None

    notebook: dict[str, Any] = notebook_req.json()
    name = notebook["metadata"]["gallery"]["title"]

    return (notebook, name)
66
+
67
+
68
def get_user_envs():
    """Get a user's environments from nbgallery

    Raises:
        AuthenticationException: The request has no Authorization header, or the header carries no token
        HowlerRuntimeError: NBGallery did not return an OK response

    Returns:
        The decoded JSON body of nbgallery's environments.json
    """
    auth_data: Optional[str] = request.headers.get("Authorization", None, type=str)

    if not auth_data:
        raise AuthenticationException("No Authorization header present")

    # Expect "<scheme> <token>". A header without a token part used to surface as an IndexError.
    auth_parts = auth_data.split(" ")
    if len(auth_parts) < 2 or not auth_parts[1]:
        raise AuthenticationException("Malformed Authorization header")

    access_token = get_token(auth_parts[1])

    # get environment info from jupyterhub
    # how to get environment without nbgallery?
    # https://nbgallery.dev.analysis.cyber.gc.ca/environments.json
    response = requests.get(
        f"{config.core.notebook.url}/environments.json",
        headers={
            "accept": "application/json",
            "Authorization": f"Bearer {access_token}",
        },
        timeout=5,
    )

    if not response.ok:
        raise HowlerRuntimeError(f"NBGallery returned {response.status_code}")

    return response.json()
95
+
96
+
97
def get_nb_information(nb_link: str, analytic: Analytic, hit: dict[str, Any]):
    """Fetch a notebook and template its first code cell with hit/analytic data

    Args:
        nb_link (str): Link to the notebook; only links containing "nbgallery" are supported
        analytic (Analytic): The analytic exposed to the template as "analytic"
        hit (dict[str, Any]): The hit exposed to the template as "hit"

    Raises:
        HowlerValueError: The link is not an nbgallery link, or the notebook has no code cell
        HowlerRuntimeError: The notebook could not be retrieved, or templating failed unexpectedly

    Returns:
        tuple: (notebook json with the templated cell, notebook name)
    """
    # only nbgallery is supported as a notebook source for now
    if "nbgallery" not in nb_link:
        raise HowlerValueError("Invalid notebook source")

    json_content, name = get_nbgallery_nb(nb_link)

    if not json_content or not name:
        raise HowlerRuntimeError("An error occurred when retrieving the notebook")

    template_data = {"hit": hit, "analytic": analytic}

    try:
        # patch first node containing code with hit/analytic info
        first_code_cell = next(cell for cell in json_content["cells"] if cell["cell_type"] == "code")
        # goal: support any field from a hit/analytic object
        first_code_cell["source"] = chevron.render(first_code_cell["source"], template_data)
    except StopIteration as e:
        raise HowlerValueError("Notebook doesn't contain a cell with code.", e)
    except Exception as e:
        raise HowlerRuntimeError("Unexpected error while processing notebook.", e)

    return (json_content, name)
@@ -0,0 +1,44 @@
1
+ from typing import Any, Union
2
+
3
+ from howler.common.loader import datastore
4
+ from howler.common.logging import get_logger
5
+ from howler.datastore.exceptions import SearchException
6
+ from howler.odm.models.hit import Hit
7
+ from howler.odm.models.overview import Overview
8
+ from howler.utils.str_utils import sanitize_lucene_query
9
+
10
+ logger = get_logger(__file__)
11
+
12
+
13
def get_matching_overviews(
    hits: Union[list[Hit], list[dict[str, Any]]], as_odm: bool = False
) -> Union[list[dict[str, Any]], list[Overview]]:
    """Find the overviews whose analytic matches the analytic of any of the provided hits.

    Args:
        hits (list[Hit] | list[dict[str, Any]]): Hits to collect analytic names from (hit["howler"]["analytic"]).
        as_odm (bool, optional): If True, return Overview objects; otherwise, return dictionaries. Defaults to False.

    Returns:
        list[dict[str, Any]] | list[Overview]: The matching overviews. Empty when there are no hits, or when
            the search raises a SearchException.
    """
    if len(hits) == 0:
        return []

    # Distinct analytic names, each escaped and wrapped in quotes so it is searched as a phrase
    analytic_names: set[str] = {f'"{sanitize_lucene_query(hit["howler"]["analytic"])}"' for hit in hits}

    if not analytic_names:
        return []

    query = f"analytic:({' OR '.join(analytic_names)})"

    try:
        search_result = datastore().overview.search(query, as_obj=as_odm)
        return search_result["items"]
    except SearchException:
        logger.exception("Exception on analytic matching")
        return []
@@ -0,0 +1,45 @@
1
+ from typing import Any, Optional, Union
2
+
3
+ from howler.common.loader import datastore
4
+ from howler.common.logging import get_logger
5
+ from howler.datastore.exceptions import SearchException
6
+ from howler.odm.models.analytic import Analytic
7
+ from howler.odm.models.hit import Hit
8
+ from howler.utils.str_utils import sanitize_lucene_query
9
+
10
+ logger = get_logger(__file__)
11
+
12
+
13
def get_matching_templates(
    hits: Union[list[Hit], list[dict[str, Any]]], uname: Optional[str] = None, as_odm: bool = False
) -> Union[list[dict[str, Any]], list[Analytic]]:
    """Generate a list of templates matching a given list of analytic names, and optionally a user.

    A template matches when its analytic is one of the hits' analytics and it is either global or owned by
    the given user (by any user when no username is given).

    Args:
        hits (list[Hit] | list[dict[str, Any]]): List of hits, each containing analytic information.
        uname (Optional[str], optional): Username to filter templates by owner. Defaults to None.
        as_odm (bool, optional): If True, return results as ODM objects. If False, return as dicts. Defaults to False.

    Returns:
        list[dict[str, Any]] | list[Analytic]: List of matching templates from the template collection, either
            as dicts or ODM objects. Empty when there are no hits or when the search raises a SearchException.
            NOTE(review): the annotation names Analytic, but the search runs on the template collection -
            confirm the intended ODM type.
    """
    if len(hits) < 1:
        return []

    analytic_names: set[str] = set()
    for hit in hits:
        analytic_names.add(f'"{sanitize_lucene_query(hit["howler"]["analytic"])}"')

    if len(analytic_names) < 1:
        return []

    # Escape the username like the analytic names, so lucene syntax in it cannot alter the query
    owner_filter = sanitize_lucene_query(uname) if uname else "*"

    try:
        template_candidates = datastore().template.search(
            f"analytic:({' OR '.join(analytic_names)}) AND (type:global OR owner:{owner_filter})",
            as_obj=as_odm,
        )["items"]

        return template_candidates
    except SearchException:
        logger.exception("Exception on analytic matching")
        return []