polyswarm-engine 3.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,387 @@
1
+ import contextlib
2
+ import functools
3
+ import hashlib
4
+ import logging
5
+ import os
6
+ import random
7
+ import pathlib
8
+ import tempfile
9
+ import typing as t
10
+ import urllib.request
11
+ import uuid
12
+ from datetime import datetime, timezone, timedelta
13
+
14
+ import requests
15
+
16
+ from .constants import (
17
+ ARTIFACT_TYPES,
18
+ FILE_ARTIFACT,
19
+ FILE_BOUNTY_UUID,
20
+ SKIPPED_COMMENT,
21
+ SKIPPED_ENCRYPTED_COMMENT,
22
+ SKIPPED_HIGHCOMPRESSION_COMMENT,
23
+ SKIPPED_UNSUPPORTED_COMMENT,
24
+ SKIPPED_CANNOT_FETCH_COMMENT,
25
+ SUSPICIOUS,
26
+ UNKNOWN,
27
+ URL_ARTIFACT,
28
+ URL_BOUNTY_UUID,
29
+ URL_MIMETYPE,
30
+ )
31
+ from .typing import (
32
+ Analysis,
33
+ ArtifactType,
34
+ Bid,
35
+ Bounty,
36
+ BountyMetadata,
37
+ GenericPathLike,
38
+ )
39
+ from .utils import build_data_uri, guess_mimetype
40
+ from .exceptions import BountyFetchException
41
+
42
+ log = logging.getLogger(__name__)
43
+
44
# Canned `Analysis` results for artifacts that were skipped instead of scanned.
# Each one bids 0 and explains the reason through a single metadata comment.

# report an analysis was skipped
SKIPPED: 't.Final[Analysis]' = {
    'verdict': UNKNOWN,
    'bid': 0,
    'metadata': {'comments': [SKIPPED_COMMENT]},
}

# report skipped analysis due to an encrypted artifact (e.g. password protected archive)
ENCRYPTED: 't.Final[Analysis]' = {
    'verdict': UNKNOWN,
    'bid': 0,
    'metadata': {'comments': [SKIPPED_ENCRYPTED_COMMENT]},
}

# report skipped analysis due to unsafe decompression requirements (e.g. zip bombs);
# unlike the other skip results, this one carries a SUSPICIOUS verdict
UNSAFE_DECOMPRESSION: 't.Final[Analysis]' = {
    'verdict': SUSPICIOUS,
    'bid': 0,
    'metadata': {'comments': [SKIPPED_HIGHCOMPRESSION_COMMENT]},
}

# report skipped analysis due to unrecognized or corrupt artifact
UNSUPPORTED: 't.Final[Analysis]' = {
    'verdict': UNKNOWN,
    'bid': 0,
    'metadata': {'comments': [SKIPPED_UNSUPPORTED_COMMENT]},
}

# report skipped analysis due to a non-fetchable artifact
CANNOT_FETCH: 't.Final[Analysis]' = {
    'verdict': UNKNOWN,
    'bid': 0,
    'metadata': {'comments': [SKIPPED_CANNOT_FETCH_COMMENT]},
}
78
+
79
+
80
def get_bounty_tasked_at(bounty: Bounty, *, default_timedelta=timedelta(seconds=0)) -> datetime:
    """Return the `datetime` stored under ``bounty``'s ``tasked_at`` key.

    When the key is missing or empty, the current UTC time plus
    ``default_timedelta`` is returned instead.
    """
    tasked_at = _get_bounty_datekey(bounty, 'tasked_at', default_timedelta=default_timedelta)
    return tasked_at
82
+
83
+
84
def get_bounty_expiration(bounty: Bounty, *, default_timedelta=timedelta(seconds=90)) -> datetime:
    """Return the `datetime` stored under ``bounty``'s ``expiration`` key.

    When the key is missing or empty, the current UTC time plus
    ``default_timedelta`` is returned instead.
    """
    expires_at = _get_bounty_datekey(bounty, 'expiration', default_timedelta=default_timedelta)
    return expires_at
86
+
87
+
88
def _get_bounty_datekey(bounty: Bounty, datekey: str, *, default_timedelta=timedelta(seconds=90)) -> datetime:
    """Return the `datetime` stored in ``bounty`` under ``datekey``.

    :param bounty: mapping that may carry ``datekey``
    :param datekey: name of the date field to read (e.g. ``'expiration'``)
    :param default_timedelta: offset added to "now" (UTC) when the field is
        missing or empty
    :returns: for ISO-8601 strings, an aware `datetime` converted to UTC;
        for `datetime` values, the value itself, unchanged
    :raises ValueError: if a string value is not a parseable ISO-8601 timestamp
    :raises TypeError: if the value is neither a `str` nor a `datetime`

    .. note::
       A string without any UTC offset is interpreted as local time, because
       that is how `datetime.astimezone` treats naive values.
    """
    value = bounty.get(datekey)

    if not value:
        log.debug('No %s in bounty=%s', datekey, bounty)
        return datetime.now(timezone.utc) + default_timedelta

    if isinstance(value, str):
        try:
            parsed = datetime.fromisoformat(value)
        except ValueError:
            # Before Python 3.11 `fromisoformat` rejects the common trailing 'Z'
            # (UTC designator); retry without it and attach UTC explicitly.
            if not value.endswith('Z'):
                raise
            parsed = datetime.fromisoformat(value[:-1])
            if parsed.tzinfo is not None:
                # An explicit offset followed by 'Z' is contradictory
                raise
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    if isinstance(value, datetime):
        return value

    raise TypeError(f'Illegal bounty {datekey}', value, bounty)
103
+
104
+
105
def lookup_artifact_type(
    artifact_type,
    *,
    typemap={
        fn(t): t  # type: ignore
        for t in ARTIFACT_TYPES for fn in (str.lower, str.upper, str.capitalize, lambda x: x)
    },
) -> 'ArtifactType':
    """Translate ``artifact_type`` into its canonical `ArtifactType` value.

    ``typemap`` is built once, at definition time, and maps the lower-case,
    upper-case, capitalized and verbatim spelling of every entry of
    ``ARTIFACT_TYPES`` back to that entry. Unknown values are logged and
    handed back unchanged.
    """
    try:
        canonical = typemap[artifact_type]
    except KeyError:
        log.exception("Illegal artifact_type='%s'", artifact_type)
        canonical = artifact_type

    return canonical
119
+
120
+
121
def get_artifact_type(bounty: Bounty) -> 'ArtifactType':
    """Look up the canonical ``ArtifactType`` for ``bounty["artifact_type"]``"""
    raw_type = bounty["artifact_type"]
    return lookup_artifact_type(raw_type)
124
+
125
+
126
def is_file_artifact(bounty: Bounty) -> bool:
    """Tell whether ``bounty``'s artifact type is ``FILE_ARTIFACT``"""
    kind = get_artifact_type(bounty)
    return kind == FILE_ARTIFACT
129
+
130
+
131
def is_url_artifact(bounty: Bounty) -> bool:
    """Tell whether ``bounty``'s artifact type is ``URL_ARTIFACT``"""
    kind = get_artifact_type(bounty)
    return kind == URL_ARTIFACT
134
+
135
+
136
def get_artifact_bytes(bounty: Bounty) -> bytes:
    """Return the full content of ``bounty``'s artifact as `bytes`.

    Local (``file://``) artifacts are read straight from disk; anything else is
    streamed block by block and concatenated.
    """
    lookup = _lookup_artifact_path(bounty)

    if not lookup.local:
        with contextlib.closing(get_artifact_stream(bounty)) as stream:
            chunks = list(_blocks_iter(stream))
        return b''.join(chunks)

    return lookup.path.read_bytes()
145
+
146
+
147
def get_artifact_stream(bounty: Bounty) -> t.BinaryIO:
    """Open ``bounty``'s artifact and return a readable binary stream.

    Local (``file://``) artifacts are opened from disk in ``'rb'`` mode; every
    other artifact is opened through `_open_artifact_uri`. The caller is
    responsible for closing the stream and should treat it as non-seekable.
    """
    lookup = _lookup_artifact_path(bounty)
    return open(lookup.path, 'rb') if lookup.local else _open_artifact_uri(bounty)
156
+
157
+
158
def get_artifact_path(bounty: Bounty) -> pathlib.Path:
    """Return a `Path` from which ``bounty``'s artifact can be read.

    Local (``file://``) artifacts are returned as-is. Any other artifact is
    first downloaded, block by block, into the bounty's temporary file.

    :raises requests.exceptions.HTTPError: logged and re-raised if the fetch fails
    """
    is_local, target = _lookup_artifact_path(bounty)

    if is_local:
        # Nothing to copy, the artifact already lives on this filesystem
        return target

    try:
        with contextlib.closing(_open_artifact_uri(bounty)) as src, open(target, 'wb') as dst:
            for chunk in _blocks_iter(src):
                dst.write(chunk)
    except requests.exceptions.HTTPError as err:
        log.warning('HTTPError fetching the artifact: %r', err)
        raise

    return target
174
+
175
+
176
@contextlib.contextmanager
def ArtifactTempfile(bounty: Bounty):
    """ContextManager yielding a `Path` to ``bounty``'s artifact

    The artifact is fetched with `get_artifact_path` on entry, and
    `bounty_cleanup` always runs on exit. A `requests` ``HTTPError`` raised
    during the fetch, or inside the ``with`` body, is logged and re-raised as
    `BountyFetchException`.

    .. example::

        >>> with ArtifactTempfile(eicar_bounty) as path:
        >>>     print(path)
        PosixPath('/tmp/275a021bbfb6489e54d471899f7db9d1663fc695ec2fe2a2c4538aabf651fd0f')
    """
    try:
        yield get_artifact_path(bounty)
    except requests.exceptions.HTTPError as err:
        log.warning('HTTPError fetching the bounty: %r', err)
        raise BountyFetchException from err
    finally:
        # Runs on success and failure alike
        bounty_cleanup(bounty)
194
+
195
+
196
def bounty_cleanup(bounty: Bounty):
    """Remove the temporary file created for ``bounty``, if there is one.

    Local (``file://``) artifacts are never deleted. A missing file is not an
    error: `FileNotFoundError` is silently ignored.
    """
    try:
        lookup = _lookup_artifact_path(bounty)
        if not lookup.local:
            os.unlink(lookup.path)
    except FileNotFoundError:
        # Best effort: either nothing was downloaded or it is already gone
        pass
204
+
205
+
206
def forge_local_bounty(
    *,
    artifact_type: ArtifactType = FILE_ARTIFACT,
    artifact_uri: t.Optional[str] = None,
    metadata: 'BountyMetadata' = None,
    data: t.Optional[t.Union[str, bytes]] = None,
    path: t.Optional['GenericPathLike'] = None,
    stream: t.Optional[t.BinaryIO] = None,
    sha256: t.Optional[t.Union[str, bytes]] = None,
    mimetype: t.Optional[str] = None,
    min_allowed_bid: 'Bid' = int(0.0625 * 1e18),
    max_allowed_bid: 'Bid' = int(0.9999 * 1e18),
    expiration: t.Union[str, datetime, timedelta] = timedelta(seconds=30),
) -> 'Bounty':
    """Convenience method to forge mock `Bounty` for local testing

    The artifact source is picked in this order of precedence:

    1. ``artifact_uri`` -- used verbatim; the artifact is fetched into a
       temporary file only long enough to guess its mimetype and digest.
    2. ``path`` -- resolved to an absolute path and turned into a ``file://`` URI.
    3. ``data`` (or the content read from ``stream``) -- embedded in a
       ``data:`` URI.

    :param artifact_type: artifact type, looked up case-insensitively
    :param artifact_uri: URI of an already existing artifact
    :param metadata: NOTE(review): accepted but not used by this function
    :param data: raw artifact content; `str` is encoded when building a ``data:`` URI
    :param path: filesystem location of the artifact
    :param stream: binary stream whose whole content replaces ``data``
    :param sha256: known digest, as hex `str` or raw `bytes`; computed if omitted
    :param mimetype: known mimetype; guessed if omitted, forced to
        ``URL_MIMETYPE`` for URL artifacts
    :param min_allowed_bid: stored in the bounty's ``rules``
    :param max_allowed_bid: stored in the bounty's ``rules``
    :param expiration: absolute expiration, or a `timedelta` from now
    :raises AssertionError: on inconsistent bids, a missing artifact source, or
        a digest that is not 64 characters long

    Examples
    --------

    Providing `path` will produce a bounty with a `file://` artifact uri

    >>> forge_local_bounty(path='/usr/bin/ls', artifact_type='file')

    If `data` is available, but no `path` is provided, a `data:` URI will be generated instead.

    >>> forge_local_bounty(data=b'test', artifact_type='file')
    """
    # Check the value of min_allowed_bid / max_allowed_bid
    assert min_allowed_bid > 0, f"min_allowed_bid ({min_allowed_bid}) must be larger than 0"
    assert max_allowed_bid > min_allowed_bid,\
        f"max_allowed_bid ({max_allowed_bid}) must be larger than min_allowed_bid ({min_allowed_bid})"

    artifact_type = lookup_artifact_type(artifact_type)

    # URL artifacts should always use the `URL_MIMETYPE`
    if artifact_type == URL_ARTIFACT:
        mimetype = URL_MIMETYPE

    # Handle file-like streams
    if stream is not None:
        data = b''.join(iter(stream.read, b''))

    if artifact_uri is not None:
        # Fetch under a throw-away id. The `with` block guarantees the temporary
        # copy is removed again, so everything that needs the file happens inside.
        fetch_bounty = {
            'artifact_uri': artifact_uri,
            'id': f'{random.random():0.10f}'[2:],
        }
        with ArtifactTempfile(fetch_bounty) as fetched_path:
            mimetype = mimetype or guess_mimetype(fetched_path)

            if sha256 is None and data is None:
                data = fetched_path.read_bytes()

    elif path is not None:
        # Convert str & os.PathLikes to `pathlib.Path`
        if not isinstance(path, pathlib.Path):
            path = pathlib.Path(path)

        # resolve this to its abspath
        path = path.resolve()
        artifact_uri = path.as_uri()

        mimetype = mimetype or guess_mimetype(path)

    elif data is not None:
        if isinstance(data, str):
            data = data.encode()

        mimetype = mimetype or guess_mimetype(data)

        # Build a `data:` URI with a base64-encoded `data`
        artifact_uri = build_data_uri(data, mimetype)

    assert artifact_uri is not None, "Cannot build URI without 'path' or 'data'"

    # Create our SHA256 digest
    if sha256 is None:
        if data is None:
            # Only reachable for the `path` branch; the other branches set `data`
            data = path.read_bytes()

        sha256 = hashlib.sha256(data).hexdigest()  # type: ignore
    elif isinstance(sha256, bytes):
        sha256 = sha256.hex()

    assert len(sha256) == 64, "Invalid SHA256 digest"

    if isinstance(expiration, timedelta):
        duration = int(expiration.total_seconds())
        expiration = (datetime.now(timezone.utc) + expiration).isoformat()
    else:
        # For a fake, the standard should be ok.
        duration = 30

    assert isinstance(expiration, (datetime, str))

    return Bounty(
        id=_forge_bounty_id(artifact_type, sha256),
        artifact_uri=artifact_uri,
        artifact_type=artifact_type,
        metadata={
            'sha256': sha256,
            'mimetype': mimetype,
        },
        rules={
            "min_allowed_bid": min_allowed_bid,
            "max_allowed_bid": max_allowed_bid,
        },
        duration=duration,
        expiration=expiration,
    )
320
+
321
+
322
def _open_artifact_uri(bounty: Bounty, *, timeout: t.Optional[float] = None) -> t.BinaryIO:
    """Return a stream of the contents of this bounty's `artifact_uri`

    ``data:`` and ``file:`` URIs are opened with `urllib.request.urlopen`;
    every other scheme is fetched with `requests` in streaming mode.

    :param bounty: mapping providing ``artifact_uri``
    :param timeout: forwarded to `requests.get`; ``None`` (the default) waits
        indefinitely, as before
    :returns: a readable binary stream which the caller must close
    :raises requests.exceptions.HTTPError: if the server answers with an error
        status; the response is closed before the error propagates
    """
    uri = bounty['artifact_uri']
    scheme = urllib.parse.urlsplit(uri).scheme

    if scheme in {'data', 'file'}:
        # NOTE(review): `file:` URIs read from the local filesystem; make sure
        # `artifact_uri` cannot be chosen by an untrusted party.
        return urllib.request.urlopen(uri)

    response = requests.get(uri, stream=True, timeout=timeout)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # With `stream=True` the connection stays checked out until the response
        # is consumed or closed, so release it before propagating the error.
        response.close()
        raise

    # Let urllib3 transparently undo any Content-Encoding (gzip, deflate, ...)
    response.raw.decode_content = True
    return response.raw
334
+
335
+
336
def _forge_bounty_id(artifact_type: ArtifactType, sha256: str) -> int:
    """Forge a mock, 32-bit bounty ID

    The ID is derived from `_forge_bounty_uuid`, so file and URL artifacts with
    the same digest still get distinct IDs.
    """
    # Mask off everything but the lower 32 bits of the UUID
    return _forge_bounty_uuid(artifact_type, sha256).int & 0xffffffff
345
+
346
+
347
def _forge_bounty_uuid(artifact_type: ArtifactType, digest: str) -> "uuid.UUID":
    """Forge a mock bounty UUID (version 5) from ``digest``

    A SHA256 alone cannot tell a bounty on a file *containing* a URL from a
    bounty on that URL, so each artifact type hashes into its own namespace.

    :raises ValueError: if ``artifact_type`` is neither a file nor a URL artifact
    """
    if artifact_type == FILE_ARTIFACT:
        return uuid.uuid5(FILE_BOUNTY_UUID, digest)

    if artifact_type == URL_ARTIFACT:
        return uuid.uuid5(URL_BOUNTY_UUID, digest)

    raise ValueError(f"Invalid artifact_type='{artifact_type}'")
362
+
363
+
364
# Directory receiving downloaded artifacts; filled in lazily by `_lookup_artifact_path`
ARTIFACT_TMPDIR = None


class ArtifactPathLookup(t.NamedTuple):
    """Result of `_lookup_artifact_path`"""

    # True when `path` is the artifact itself, not a temporary copy of it
    local: bool
    # Filesystem location of the artifact or of its temporary copy
    path: pathlib.Path
366
+
367
+
368
def _lookup_artifact_path(bounty: Bounty) -> 'ArtifactPathLookup':
    """Work out where ``bounty``'s artifact lives, or should be stored, on disk.

    ``file://`` URIs map to the (strictly resolved) file they point at and are
    flagged ``local``; no copy is made unless requested. Any other URI maps to
    ``bounty-<id>`` inside the system temporary directory.

    :raises FileNotFoundError: if a ``file://`` URI points at a missing file
    """
    global ARTIFACT_TMPDIR

    parts = urllib.parse.urlsplit(bounty['artifact_uri'])

    if parts.scheme == 'file':
        # The path component is URL-encoded and has to be decoded first
        decoded = urllib.parse.unquote(parts.path)
        return ArtifactPathLookup(local=True, path=pathlib.Path(decoded).resolve(strict=True))

    # Look the temporary directory up once and remember it module-wide
    if ARTIFACT_TMPDIR is None:
        ARTIFACT_TMPDIR = pathlib.Path(tempfile.gettempdir()).absolute()

    filename = 'bounty-{id}'.format_map(bounty)
    return ArtifactPathLookup(local=False, path=ARTIFACT_TMPDIR.joinpath(filename))
384
+
385
+
386
def _blocks_iter(fp: t.BinaryIO, *, block_size: int = 4096) -> t.Iterator[bytes]:
    """Iterate over ``fp`` in reads of at most ``block_size`` bytes.

    Iteration stops at the first empty read (``b''``).
    """
    read = fp.read
    return iter(lambda: read(block_size), b'')
@@ -0,0 +1,76 @@
1
+ import functools
2
+ import importlib
3
+
4
+ from celery.worker.consumer import mingle, gossip
5
+ from celery.worker.worker import WorkController
6
+
7
+ import polyswarm_engine.settings
8
+
9
# Monkey patch: always start workers with the `fair` optimization (-Ofair),
# the most stable during our tests.
_original_setup_defaults = WorkController.setup_defaults


@functools.wraps(_original_setup_defaults)
def _new_setup_defaults(self, *args, **kwargs):
    # Whatever the caller asked for, `optimization` is overridden
    kwargs.update(optimization='fair')
    return _original_setup_defaults(self, *args, **kwargs)


WorkController.setup_defaults = _new_setup_defaults

# Monkey patch: disable mingle and gossip by leaving both bootsteps without any
# compatible transport (presumably how celery decides whether to start them).
mingle.Mingle.compatible_transports = {}
gossip.Gossip.compatible_transports = {}
19
+
20
+ ##########################################
21
+ # Celery Configuration
22
+ ##########################################
23
+ # https://docs.celeryproject.org/en/stable/userguide/configuration.html
24
class CeleryConfig:
    """Celery settings object assembled from `polyswarm_engine.settings`.

    :param broker: broker URL; falls back to ``PSENGINE_BROKER_URL`` when ``None``
    :param vhost: broker virtual host, appended to the URL as ``/<vhost>`` when
        truthy; falls back to ``PSENGINE_BROKER_VHOST`` when ``None``
    :param kwargs: arbitrary attributes set on the instance last, overriding
        anything configured here
    """

    def __init__(
        self,
        broker: str = None,
        vhost: str = None,
        **kwargs,
    ):
        # Needs to reload to address PSENGINE_TASK_ALWAYS_EAGER late changes
        importlib.reload(polyswarm_engine.settings)

        from polyswarm_engine.settings import (
            PSENGINE_BROKER_URL,
            PSENGINE_BROKER_VHOST,
            PSENGINE_WORKER_CONCURRENCY,
            PSENGINE_WORKER_MAX_TASKS_PER_CHILD,
            PSENGINE_WORKER_PREFETCH_MULTIPLIER,
            PSENGINE_TASK_ALWAYS_EAGER,
        )

        # Explicit arguments win; the settings only provide the defaults.
        # (Both arguments used to be silently discarded.)
        if broker is None:
            broker = PSENGINE_BROKER_URL
        if vhost is None:
            vhost = PSENGINE_BROKER_VHOST

        self.broker_url = f'{broker}/{vhost}' if vhost else broker
        self.broker_heartbeat = None
        self.broker_connection_retry_on_startup = True
        self.result_backend = None
        self.task_ignore_result = True
        self.task_acks_late = True
        self.task_reject_on_worker_lost = True
        self.task_store_errors_even_if_ignored = False
        self.task_queue_max_priority = 10
        self.task_default_priority = 5
        self.worker_concurrency = PSENGINE_WORKER_CONCURRENCY
        self.worker_prefetch_multiplier = PSENGINE_WORKER_PREFETCH_MULTIPLIER
        self.worker_hijack_root_logger = False
        self.worker_max_tasks_per_child = PSENGINE_WORKER_MAX_TASKS_PER_CHILD
        self.worker_send_task_events = False
        self.worker_enable_remote_control = False
        self.worker_cancel_long_running_tasks_on_connection_loss = True
        self.task_always_eager = PSENGINE_TASK_ALWAYS_EAGER
        self.broker_transport_options = {
            'max_retries': 3,
            'interval_start': 0,
            'interval_step': 0.2,
            'interval_max': 0.5,
            'fanout_prefix': True,
            'fanout_patterns': True,
        }

        # Allows general settings override by user.
        # AT YOUR OWN RISK
        for k, v in kwargs.items():
            setattr(self, k, v)