polyswarm-engine 3.1.1 (polyswarm_engine-3.1.1-py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
+ # flake8: noqa
+ __VERSION__ = '3.1.1'
+
+ from .bidutils import (
+     bid_max,
+     bid_median,
+     bid_min,
+     bid_range,
+     dni_to_bid,
+     rescale_to_bid,
+     to_wei,
+ )
+ from .bounty import (
+     ArtifactTempfile,
+     get_artifact_bytes,
+     get_artifact_path,
+     get_artifact_stream,
+     get_artifact_type,
+     get_bounty_expiration,
+     is_file_artifact,
+     is_url_artifact,
+ )
+ from .constants import (
+     ARTIFACT_TYPES,
+     BENIGN,
+     FILE_ARTIFACT,
+     MALICIOUS,
+     SUSPICIOUS,
+     UNKNOWN,
+     URL_ARTIFACT,
+ )
+ from .engine import EngineManager
+ from .typing import (
+     Analysis,
+     AnalysisMetadata,
+     AnalysisResult,
+     ArtifactType,
+     Bounty,
+     Environment,
+     Scanner,
+ )
+ from .utils import (
+     pattern_matches,
+     poll,
+     poll_decorator,
+     resource_path,
+     spawn_subprocess,
+ )
+ from .wine import as_nt_path
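
The package `__init__` above only re-exports the public API. For orientation, here is a minimal, hypothetical consumer of those re-exports; the placeholder heuristic and the returned values are illustrative assumptions, and the `{'verdict': ..., 'bid': ...}` shape mirrors the validation performed by the Celery backend in the next file.

    # Hypothetical engine callable built only on names re-exported above.
    from polyswarm_engine import BENIGN, MALICIOUS, bid_median

    def looks_malicious(bounty):
        # Placeholder heuristic; a real engine would inspect the artifact.
        return False

    def analyze(bounty):
        # The bounty is assumed to carry the 'rules' mapping read by bid_median.
        verdict = MALICIOUS if looks_malicious(bounty) else BENIGN
        return {'verdict': verdict, 'bid': bid_median(bounty)}
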
@@ -0,0 +1,302 @@
+ import contextlib
+ import logging
+ import typing as t
+ import copy
+ from urllib import parse
+ from datetime import datetime, timezone
+
+ import celery
+
+ from polyswarm_engine import exceptions
+ from polyswarm_engine.bounty import get_bounty_expiration, get_bounty_tasked_at, CANNOT_FETCH
+ from polyswarm_engine.settings import PSENGINE_METADATA_ARCHTECTURE, PSENGINE_METADATA_OS, PSENGINE_DELIVERY_TASK
+ from polyswarm_engine.constants import (
+     BENIGN,
+     MALICIOUS,
+     SUSPICIOUS,
+     UNKNOWN,
+     AnalysisConclusions,
+ )
+
+ from polyswarm_engine.typing import Analysis
+ from polyswarm_engine.celeryconfig import CeleryConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_all_backend_names():
+     """Used by internal tooling"""
+     return ['CeleryBackend']
+
+
+ class CeleryBackend:
+     app = None
+
+     def __init__(
+         self,
+         name=None,
+         analyze=None,
+         head=None,
+         lifecycle=contextlib.nullcontext,
+         deliver_func=None,
+         deliver_task_name=None,
+     ):
+         # lazily set up the shared Celery app
+         if CeleryBackend.app is None:
+             CeleryBackend.app = celery.Celery('polyswarm_engine_celery_backend', config_source=CeleryConfig())
+
+         self.name = name
+         self._analyze = analyze
+         self._head = head
+         self._lifecycle = lifecycle
+         self.analysis_environment = None
+         self._analyze_task = self._create_analyze_task()
+         self._lifecycle_context = None
+
+         self._deliver_task_name = deliver_task_name or PSENGINE_DELIVERY_TASK
+         if self._deliver_task_name:
+             self._deliver = deliver_func or self._queue_deliver
+         else:
+             self._deliver = deliver_func or self._http_deliver
+         logger.debug("CeleryBackend deliver function: '%r'", self._deliver)
+
+     def __repr__(self):
+         return '{}(engine="{}")'.format(self.__class__.__name__, self.name)
+
+     @contextlib.contextmanager
+     def _run(self):
+         with self._lifecycle():
+             self.update_analysis_environment()
+             logger.info("%r started", self)
+             try:
+                 yield self
+             finally:
+                 logger.info("%r stopped", self)
+
+     def _enter(self, **_):
+         logger.info('Setting up the lifecycle context for the forked worker.')
+         self._lifecycle_context = self._run()
+         try:
+             return self._lifecycle_context.__enter__()
+         except Exception as e:
+             logger.warning('Could not start worker: %r', e, exc_info=True)
+             raise celery.exceptions.WorkerShutdown from e
+
+     def _exit(self, **_):
+         logger.info('Cleaning up the lifecycle context for the forked worker.')
+         result = self._lifecycle_context.__exit__(None, None, None)
+         self._lifecycle_context = None
+         return result
+
+     @contextlib.contextmanager
+     def run(self):
+         # signals need to receive kwargs
+         if CeleryBackend.app.conf.task_always_eager:
+             self._enter()
+             try:
+                 yield self
+             finally:
+                 self._exit()
+         else:
+             celery.signals.worker_process_init.connect(self._enter)
+             celery.signals.worker_process_shutdown.connect(self._exit)
+             yield self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         return None
+
+     def head(self):
+         head_ = self._head() if self._head else {}
+
+         if PSENGINE_METADATA_ARCHTECTURE or PSENGINE_METADATA_OS:
+             environment: dict = head_.setdefault('scanner', {}).setdefault('environment', {})
+             if PSENGINE_METADATA_OS:
+                 environment.setdefault('operating_system', PSENGINE_METADATA_OS)
+             if PSENGINE_METADATA_ARCHTECTURE:
+                 environment.setdefault('architecture', PSENGINE_METADATA_ARCHTECTURE)
+         return head_
+
+     def update_analysis_environment(self):
+         self.analysis_environment = dict(product=self.name)
+         self.analysis_environment.update(self.head())
+
+     def validate_result(self, analysis: 'Analysis'):
+         assert "verdict" in analysis, f"verdict is missing in analysis: {analysis}"
+         assert analysis.get("verdict") in AnalysisConclusions, "invalid verdict: {} must be one of {}".format(
+             analysis.get('verdict'), ','.join(AnalysisConclusions)
+         )
+
+         if analysis["verdict"] in [MALICIOUS, BENIGN]:
+             assert "bid" in analysis and isinstance(
+                 analysis["bid"], int
+             ), f"bid must be an int, got: {analysis.get('bid')}"
+             assert analysis["bid"] > 0, \
+                 f"benign and malicious verdicts require a bid > 0, got: {analysis.get('bid')}"
+         elif analysis["verdict"] in [SUSPICIOUS, UNKNOWN]:
+             if "bid" in analysis:
+                 logger.info("suspicious and unknown verdicts should not have a bid, it will be ignored")
+
+         def validate_optional_field(key, obj, expected_type):
+             assert isinstance(
+                 obj.get(key), (type(None), expected_type)
+             ), f"{key} must be a {expected_type}, got: {obj.get(key)}"
+
+         validate_optional_field("vendor", analysis, str)
+         validate_optional_field("author", analysis, str)
+         validate_optional_field("metadata", analysis, dict)
+         validate_optional_field("confidence", analysis, float)
+
+         if "metadata" in analysis:
+             metadata = analysis["metadata"]
+             validate_optional_field("malware_family", metadata, str)
+             validate_optional_field("product", metadata, str)
+             validate_optional_field("heuristic", metadata, bool)
+             validate_optional_field("scanner", metadata, dict)
+             validate_optional_field("comments", metadata, list)
+
+             if "scanner" in metadata:
+                 scanner = metadata["scanner"]
+                 validate_optional_field("vendor_version", scanner, str)
+                 validate_optional_field("signatures_version", scanner, str)
+                 validate_optional_field("version", scanner, str)
+                 validate_optional_field("environment", scanner, dict)
+
+                 if "environment" in scanner:
+                     environment = scanner["environment"]
+                     validate_optional_field("architecture", environment, str)
+                     validate_optional_field("operating_system", environment, str)
+
+     @staticmethod
+     def merge_inner(x, y):
+         rv = dict()
+         rv.update(y)
+         rv.update(x)
+
+         for k in (x.keys() & y.keys()):
+             if isinstance(x[k], t.Mapping) and isinstance(y[k], t.Mapping):
+                 rv[k] = CeleryBackend.merge_inner(x[k], y[k])
+
+         return rv
+
+     def generate_enriched_result(self, analysis):
+         # Handle the case of pre-nested metadata keys
+         if "metadata" in self.analysis_environment and isinstance(self.analysis_environment["metadata"], dict):
+             return self.merge_inner(analysis, self.analysis_environment)
+         else:
+             return self.merge_inner(analysis, {"metadata": self.analysis_environment})
+
+     def _get_callback_info(self, url) -> tuple[str, str, str]:
+         """
+         Extract info from the response_url of a bounty
+
+         Returns a tuple of (task_type, bounty_id, nonce),
+         where task_type should be the 'assertions' or 'votes' string.
+         """
+         url = parse.urlparse(url)
+         _, _, _, bounty_id, task_type, _ = url.path.split('/')
+         _, nonce = url.query.split('=')
+         return task_type, bounty_id, nonce
+
+     def _queue_deliver(self, bounty: dict, enriched_result: dict):
+         """Defer the delivery of bounty results to a Celery task
+
+         The task name comes from the `deliver_task_name` init argument
+         and is formatted with `task_type` and `response_url` as context,
+         allowing limited dynamic selection of the Celery task.
+         """
+         response_url = bounty.get('response_url')
+         if response_url:
+             task_type, bounty_id, nonce = self._get_callback_info(response_url)
+             enriched_result['bounty'] = bounty_id
+             enriched_result['_nonce'] = [nonce]
+
+             taskname = self._deliver_task_name.format(
+                 task_type=task_type,
+                 response_url=response_url,
+             )
+             CeleryBackend.app.send_task(
+                 taskname,
+                 args=(enriched_result,),
+                 queue=task_type,
+             )
+
+     def _http_deliver(self, bounty: dict, enriched_result: dict):
+         """Deliver bounty results via an HTTP call
+
+         Fails silently if `response_url` is not available in the `bounty`,
+         as it assumes this `bounty` to be a local test.
+         """
+         import requests
+
+         response_url = bounty.get('response_url')
+         if response_url:
+             try:
+                 with _http_debug_wrapper():
+                     response = requests.post(response_url, json=enriched_result)
+                     if logger.getEffectiveLevel() < logging.DEBUG:
+                         logger.debug('request body: %s', response.request.body)
+                         logger.debug('response body: %s', response.text)
+                     response.raise_for_status()
+             except requests.exceptions.HTTPError as e:
+                 logger.error('Request failed: %s %s', e.response.text, e)
+
+     def process_bounty(self, bounty):
+         tasked_at = get_bounty_tasked_at(bounty)
+         expiration = get_bounty_expiration(bounty)
+         processing_start = datetime.now(timezone.utc)
+         if processing_start > expiration:
+             raise exceptions.EngineTimeoutError(
+                 'Current time %s is past expiration time %s',
+                 processing_start.isoformat(),
+                 expiration.isoformat(),
+             )
+
+         try:
+             result = self._analyze(bounty)
+         except exceptions.BountyException as err:
+             result: dict = copy.deepcopy(CANNOT_FETCH)
+             result.setdefault('metadata', {})['error'] = repr(err)
+
+         enriched_result = self.generate_enriched_result(result)
+         self.validate_result(enriched_result)
+
+         self._deliver(bounty, enriched_result)
+         return enriched_result
+
+     def analyze(self, bounty, queue=None, **options):
+         if not queue and not CeleryBackend.app.conf.task_always_eager:
+             raise exceptions.EngineException('Celery backend needs a queue.')
+         return self._analyze_task.apply_async(
+             args=(bounty, ),
+             queue=queue,
+             **options,
+         )
+
+     def _create_analyze_task(self):
+         @CeleryBackend.app.task(name='polyswarm_engine.celery_backend.analyze_task')
+         def analyze_task(bounty):
+             return self.process_bounty(bounty)
+
+         return analyze_task
+
+
+ @contextlib.contextmanager
+ def _http_debug():
+     """
+     Produce logs of HTTP calls
+
+     Produces logs by manipulating `http.client.HTTPConnection`,
+     as suggested on https://github.com/urllib3/urllib3/issues/107#issuecomment-11690207
+     """
+     # You'll need to do this before urllib3 creates any http connection objects
+     import http.client
+
+     initial_debuglevel = http.client.HTTPConnection.debuglevel
+     http.client.HTTPConnection.debuglevel = 5
+     try:
+         yield
+     finally:
+         http.client.HTTPConnection.debuglevel = initial_debuglevel
+
+
+ _http_debug_wrapper = _http_debug if logger.getEffectiveLevel() < logging.DEBUG else contextlib.nullcontext
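
To make the expectations of `validate_result` and the backend constructor concrete, here is a hedged sketch that builds a result satisfying every check in that method and wires it into a `CeleryBackend` with a stubbed delivery function. The module path `polyswarm_engine.celery_backend` is inferred from the task name above, and the engine name, bid amount, and metadata values are assumptions for illustration.

    from polyswarm_engine.constants import MALICIOUS
    # Module path assumed from the 'polyswarm_engine.celery_backend.analyze_task' name above.
    from polyswarm_engine.celery_backend import CeleryBackend

    def my_analyze(bounty):
        # A result that passes validate_result: a verdict from AnalysisConclusions,
        # an int bid > 0, and only optional fields of the expected types.
        return {
            'verdict': MALICIOUS,
            'bid': 500_000_000_000_000_000,  # illustrative wei amount
            'confidence': 0.92,
            'metadata': {
                'malware_family': 'EICAR-Test-File',
                'heuristic': False,
                'scanner': {
                    'vendor_version': '1.2.3',
                    'environment': {'operating_system': 'linux', 'architecture': 'x86_64'},
                },
            },
        }

    def print_deliver(bounty, enriched_result):
        # Replaces the HTTP and Celery delivery paths for a local dry run.
        print(enriched_result)

    backend = CeleryBackend(name='example-engine', analyze=my_analyze, deliver_func=print_deliver)

With `task_always_eager` enabled in the Celery configuration, `backend.analyze(bounty)` runs `process_bounty` inline inside `backend.run()`; otherwise `analyze` requires a `queue` argument and dispatches to a worker process that has entered the lifecycle via the `worker_process_init` signal.
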
@@ -0,0 +1,69 @@
+ import logging
+ import typing as t
+
+ from polyswarm_engine.constants import NCT_TO_WEI_CONVERSION
+
+
+ if t.TYPE_CHECKING:
+     from .typing import Bid, Bounty
+
+ log = logging.getLogger(__name__)
+
+
+ def bid_median(bounty: 'Bounty') -> 'Bid':
+     """Get the median of the minimum and maximum allowed bid from :param:`bounty`"""
+     min_bid, max_bid = bid_range(bounty)
+     return (min_bid + max_bid) // 2
+
+
+ def bid_min(bounty: 'Bounty') -> 'Bid':
+     """Get the minimum allowed bid from :param:`bounty`"""
+     return bid_range(bounty)[0]
+
+
+ def bid_max(bounty: 'Bounty') -> 'Bid':
+     """Get the maximum allowed bid from :param:`bounty`"""
+     return bid_range(bounty)[1]
+
+
+ def bid_range(bounty: 'Bounty') -> t.Tuple['Bid', 'Bid']:
+     """Return a tuple of the minimum and maximum allowed bid from :param:`bounty`"""
+     rules = bounty['rules']
+     return rules['min_allowed_bid'], rules['max_allowed_bid']
+
+
+ def rescale_to_bid(bounty: 'Bounty', value: 't.SupportsInt', min=0, max=100) -> 'Bid':
+     """Scale `value` (a number between ``min`` and ``max``) to ``bounty``'s allowed bid range"""
+     min_bid, max_bid = bid_range(bounty)
+
+     if value >= min and value <= max:
+         return int(normalize(value, min, max, min_bid, max_bid))
+     else:
+         log.error("value (%f) is not between %f and %f", value, min, max)
+         return min_bid
+
+
+ def dni_to_bid(bounty: 'Bounty', value: str) -> 'Bid':
+     """Transform a string value on the None / Low / Med / High scale into a bid"""
+     value = value.lower()
+
+     if value == "none":
+         return bid_min(bounty)
+     elif value == "low":
+         return rescale_to_bid(bounty, 25, max=100)
+     elif value == "med":
+         return rescale_to_bid(bounty, 75, max=100)
+     elif value == "high":
+         return bid_max(bounty)
+     else:
+         raise ValueError(bounty, value)
+
+
+ def normalize(x, x_min, x_max, a, b):
+     """Scale `x`, a value between `x_min` and `x_max`, to a value between `a` and `b`"""
+     return a + (((x - x_min) * (b - a)) / (x_max - x_min))
+
+
+ def to_wei(nct):
+     """Convert a value in NCT to wei, usable as a bid value"""
+     return nct * NCT_TO_WEI_CONVERSION
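
For a sense of how the bid helpers compose, the sketch below exercises them against a hypothetical bounty whose `rules` carry the minimum and maximum allowed bid; the concrete wei amounts are made up for illustration.

    from polyswarm_engine import bid_median, bid_range, dni_to_bid, rescale_to_bid

    # Hypothetical bounty: only the 'rules' mapping read by bid_range is filled in.
    bounty = {
        'rules': {
            'min_allowed_bid': 62_500_000_000_000_000,      # illustrative wei values
            'max_allowed_bid': 1_000_000_000_000_000_000,
        },
    }

    assert bid_range(bounty) == (62_500_000_000_000_000, 1_000_000_000_000_000_000)
    assert bid_median(bounty) == 531_250_000_000_000_000          # midpoint of the range
    assert rescale_to_bid(bounty, 0) == 62_500_000_000_000_000     # bottom of the 0-100 scale
    assert rescale_to_bid(bounty, 100) == 1_000_000_000_000_000_000  # top of the scale
    assert dni_to_bid(bounty, 'High') == 1_000_000_000_000_000_000   # case-insensitive lookup
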