polyswarm-engine 3.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ import os
2
+ import platform
3
+ import shutil
4
+
5
#: The queue system URL used by Celery
PSENGINE_BROKER_URL = os.getenv('PSENGINE_BROKER_URL', 'amqp://user:password@rabbitmq:5672')
#: Default vhost if Celery is backed by RabbitMQ
PSENGINE_BROKER_VHOST = os.getenv('PSENGINE_BROKER_VHOST', 'engines')

#: Verbosity level for logs. A purely decimal value is converted to ``int``,
#: anything else is kept as the string read from the environment.
LOG_LEVEL = os.getenv('LOG_LEVEL', 'WARNING')
# `str.isdecimal` only accepts characters `int()` can parse; `str.isnumeric`
# also accepts characters such as fractions, which would make `int()` raise
# at import time.
if LOG_LEVEL.isdecimal():
    LOG_LEVEL = int(LOG_LEVEL)
#: Format of logs. 'json' is available, in addition to 'text'
LOG_FORMAT = os.getenv('LOG_FORMAT', 'text')

# NOTE: the `X | None` annotations below are quoted so they are not evaluated
# at import time; unquoted, they raise `TypeError` on Python < 3.10.

#: Path of the `wine` executable (environment override first, then a PATH lookup)
WINELOADER: 'str | None' = os.getenv('WINELOADER') or shutil.which('wine')
#: Path of the `wineserver` executable (environment override first, then a PATH lookup)
WINESERVER: 'str | None' = os.getenv('WINESERVER') or shutil.which('wineserver')
#: Path of the `winepath` command executable (environment override first, then a PATH lookup)
WINEPATH_CMD: 'str | None' = os.getenv('WINEPATH_CMD') or shutil.which('winepath')

#: Reported machine architecture where the scanner runs.
#: (The misspelled name is kept because it is part of the public interface.)
PSENGINE_METADATA_ARCHTECTURE: str = os.getenv('PSENGINE_METADATA_ARCHTECTURE', platform.machine())
#: Reported operating system where the scanner runs
PSENGINE_METADATA_OS = os.getenv('PSENGINE_METADATA_OS', platform.system())

#: Used to compute HMAC for PolySwarm bounties sent via HTTP (``None`` when unset)
PSENGINE_WEBHOOK_SECRET = os.getenv('PSENGINE_WEBHOOK_SECRET')

# Celery Worker related configs.
# Names are prefixed so they do not conflict with user instances of Celery.
# Each value must be an integer literal; anything else raises `ValueError` on import.
PSENGINE_WORKER_CONCURRENCY: int = int(os.getenv('PSENGINE_WORKER_CONCURRENCY', '1'))
PSENGINE_WORKER_PREFETCH_MULTIPLIER: int = int(os.getenv('PSENGINE_WORKER_PREFETCH_MULTIPLIER', '1'))
PSENGINE_WORKER_MAX_TASKS_PER_CHILD: int = int(os.getenv('PSENGINE_WORKER_MAX_TASKS_PER_CHILD', '1000'))
# Parsed as an integer first, so use '0' / '1' (not 'true' / 'false')
PSENGINE_TASK_ALWAYS_EAGER: bool = bool(int(os.getenv('PSENGINE_TASK_ALWAYS_EAGER', '0')))

#: Name of the Celery task that processes the delivery of assertions and votes.
# If empty, fallback to doing the delivery directly via HTTP
PSENGINE_DELIVERY_TASK = os.getenv('PSENGINE_DELIVERY_TASK', '')
@@ -0,0 +1,125 @@
1
+ import os
2
+ import typing as t
3
+
4
+ if t.TYPE_CHECKING:
5
+ import datetime as dt
6
+
7
# Allowed values for an analysis verdict (see `Analysis.verdict`)
AnalysisResult = t.Literal["benign", "malicious", "suspicious", "unknown"]
# Anything usable as a filesystem path: a string or an `os.PathLike` object
GenericPathLike = t.Union[str, os.PathLike]
# Allowed values for `Bounty.artifact_type`
ArtifactType = t.Literal["FILE", "URL"]
# A bid is a plain integer (see `BountyRules` and `Analysis.bid`)
Bid = int
# A duration may be given as a number or as a string
# NOTE(review): units and the accepted string format are not visible here -- confirm
Duration = t.Union[int, float, str]
# Allowed values for `Environment.operating_system`
OperatingSystemName = t.Literal["linux", "darwin", "windows"]
13
+
14
+
15
class Bounty(t.TypedDict, total=False):
    """Typed dictionary describing a bounty.

    Declared with ``total=False``, so every key may be absent.
    """
    id: int
    # NOTE(review): presumably where the artifact can be retrieved from -- confirm
    artifact_uri: str
    # Either "FILE" or "URL"
    artifact_type: 'ArtifactType'
    # NOTE(review): presumably where the result is sent back to -- confirm
    response_url: t.Optional[str]
    metadata: 'BountyMetadata'
    rules: 'BountyRules'
    duration: t.Optional['Duration']
    # May be a `datetime` or its string form
    expiration: t.Union['dt.datetime', str]
    # May be a `datetime`, its string form, or None
    tasked_at: t.Optional[t.Union['dt.datetime', str]]
25
+
26
+
27
class BountyMetadata(t.TypedDict, total=False):
    """Typed dictionary with optional metadata attached to a `Bounty` (all keys may be absent)."""
    # NOTE(review): presumably the SHA-256 digest of the artifact -- confirm
    sha256: t.Optional[str]
    # NOTE(review): presumably the MIME type of the artifact -- confirm
    mimetype: t.Optional[str]
30
+
31
+
32
class BountyRules(t.TypedDict, total=False):
    """Typed dictionary with the bid limits attached to a `Bounty` (all keys may be absent)."""
    # Upper bound for a bid
    max_allowed_bid: 'Bid'
    # Lower bound for a bid
    min_allowed_bid: 'Bid'
35
+
36
+
37
+ # XXX: This could be a dangerously named type
38
class Environment(t.TypedDict, total=False):
    """Typed dictionary describing the environment a scanner runs in (all keys may be absent)."""
    # NOTE(review): presumably the machine architecture string -- confirm
    architecture: t.Optional[str]

    # The operating system used for the dynamic analysis of the malware instance. This applies to virtualized operating
    # systems as well as those running on bare metal
    operating_system: t.Optional['OperatingSystemName']
44
+
45
+
46
class Scanner(t.TypedDict, total=False):
    """Typed dictionary describing the scanner that produced an analysis (all keys may be absent)."""
    # Environment the scanner ran in
    environment: t.Optional['Environment']

    # The version of the analysis engine or product (including AV engines) that was used to perform the analysis.
    vendor_version: t.Optional[str]

    # The version of the analysis definitions used by the analysis tool (including AV tools).
    signatures_version: t.Optional[str]

    # The version of the PolySwarm engine wrapper
    version: t.Optional[str]
57
+
58
+
59
class AnalysisMetadata(t.TypedDict, total=False):
    """Typed dictionary with descriptive metadata about an analysis (all keys may be absent)."""
    # The name of the analysis engine or product that was used. Product names
    # SHOULD be all lowercase with words separated by a dash "-".
    product: str

    # The classification result or name assigned to the malware instance by the scanner tool.
    malware_family: t.Optional[str]

    # Captures comments regarding the analysis that was performed
    comments: t.List[str]

    # Details of the scanner that performed the analysis
    scanner: t.Optional['Scanner']

    # Indicator for assertions generated from heuristics
    heuristic: t.Optional[bool]
74
+
75
+
76
class Analysis(t.TypedDict, total=False):
    """Typed dictionary holding the result of analyzing an artifact (all keys may be absent)."""
    # Captures the conclusion of the analysis, such as whether the binary was found to be malicious.
    verdict: 'AnalysisResult'

    # Captures the relative measure of confidence in the accuracy of the analysis results.
    # The confidence value *MUST* be a float in the range of 0.0 ~ 1.0
    confidence: t.Optional[float]

    # Integer bid associated with this analysis
    bid: t.Optional['Bid']
    # Descriptive metadata about the analysis
    metadata: t.Optional['AnalysisMetadata']

    # Specifies the name of the vendor of this analysis engine
    vendor: t.Optional[str]

    # NOTE(review): meaning of `author` is not visible here -- confirm
    author: t.Optional[str]
91
+
92
+
93
class CompletedProcessDict(t.TypedDict, total=True):
    """Typed dictionary describing a finished subprocess.

    Declared with ``total=True``, so every key is required.
    """
    # Exit status of the process
    returncode: int
    # Command line that was executed, one string per argument
    args: t.Sequence[str]
    # Captured standard output, or None
    stdout: t.Optional[str]
    # Captured standard error, or None
    stderr: t.Optional[str]
98
+
99
+
100
class ApplyResult(t.Protocol):
    """The class of the result returned by BaseTaskBackend.apply_async()

    This is a structural protocol: any object providing these four methods
    satisfies it. Method bodies are intentionally empty.
    """

    # `timeout` defaults to None, so it is annotated as Optional explicitly
    # (an implicit Optional from a `None` default is rejected by strict type checkers).
    def get(self, timeout: t.Optional[float] = None) -> t.Any:
        ...

    def wait(self, timeout: t.Optional[float] = None) -> t.Any:
        ...

    def ready(self) -> bool:
        ...

    def successful(self) -> bool:
        ...
114
+
115
+
116
# Takes no arguments and returns an `AnalysisMetadata`
EngineHeadCallable = t.Callable[[], "AnalysisMetadata"]
# Takes a `Bounty` and returns a bool
EngineCheckCallable = t.Callable[["Bounty"], bool]
# Takes a `Bounty` and returns an `Analysis`
EngineAnalyzeCallable = t.Callable[["Bounty"], "Analysis"]
# Any callable; the signature is deliberately unconstrained
EngineCommandCallable = t.Callable

# Type of the value produced by a polled target
PollResultT = t.TypeVar('PollResultT')

# Polled target: accepts any arguments and returns a `PollResultT`
PollTargetCallable = t.Callable[..., PollResultT]
# Maps the current step (a float) to the next step
PollStepCallable = t.Callable[[float], float]
# Decides whether a polled value counts as success
PollCheckCallable = t.Callable[[PollResultT], bool]
@@ -0,0 +1,434 @@
1
+ import base64
2
+ import datetime as dt
3
+ from datetime import timezone as tz
4
+ import functools
5
+ import inspect
6
+ import logging
7
+ import os
8
+ import os.path
9
+ import re
10
+ import stat
11
+ import subprocess
12
+ import sys
13
+ import time
14
+ import typing as t
15
+
16
+ from .exceptions import EngineExpiredException, EngineMaxCallException
17
+ from .typing import GenericPathLike
18
+ from .wine import WINELOADER
19
+
20
+ if t.TYPE_CHECKING:
21
+ from .engine import EngineManager
22
+ from .typing import (
23
+ CompletedProcessDict,
24
+ PollCheckCallable,
25
+ PollResultT,
26
+ PollStepCallable,
27
+ PollTargetCallable,
28
+ )
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
def spawn_subprocess(args: t.Sequence[str], use_wine=False, text=True, **popen_kwargs) -> 'CompletedProcessDict':
    """Execute a command, block until it finishes and describe the outcome as a `CompletedProcessDict`.

    :param args: Command arguments
    :param use_wine:
        When true (and not running on Windows), the absolute path of ``WINELOADER`` is put in front of `args`.
        Packaged WINE installations generally register a PE `binfmt <https://en.wikipedia.org/wiki/Binfmt_misc>`,
        however `use_wine` should *still* be provided if this command runs under WINE.
    :param text:
        When true the streams are decoded as text (UTF-8, undecodable bytes ignored, unless the caller overrides
        ``encoding`` / ``errors``); when false ``text=False`` is passed through.
    :param popen_kwargs: Additional keyword args to pass to `subprocess.run`
    :raises FileNotFoundError: `use_wine` was requested but no ``WINELOADER`` is configured
    """
    needs_wine = use_wine and sys.platform != "win32"
    if needs_wine and not WINELOADER:
        raise FileNotFoundError("wine not found")
    command = [WINELOADER, *args] if needs_wine else args

    # Capture both streams by default, but never when the caller redirects one of them itself.
    caller_redirects = 'stdout' in popen_kwargs or 'stderr' in popen_kwargs
    if not caller_redirects:
        popen_kwargs.setdefault('capture_output', True)

    popen_kwargs.setdefault('timeout', 30)

    # NOTE only use text mode if your command returns text
    popen_kwargs['text'] = bool(text)
    if text:
        popen_kwargs.setdefault('encoding', 'utf-8')
        popen_kwargs.setdefault('errors', 'ignore')

    completed = subprocess.run(command, **popen_kwargs)

    # Empty output is reported as None
    return {
        'returncode': completed.returncode,
        'args': tuple(str(arg) for arg in completed.args),
        'stdout': completed.stdout or None,
        'stderr': completed.stderr or None,
    }
70
+
71
+
72
def pattern_matches(
    stream: str,
    patterns: t.Iterable[t.Union[str, re.Pattern]],
    in_order: bool = False,
    index: int = 0,
    flags: int = re.MULTILINE,
    foldspaces: bool = True,
) -> t.Iterator[t.Tuple[str, str]]:
    """Generic "search for pattern in stream, using index" behavior.

    All patterns are combined into a single alternation and searched across
    `stream`; every named group that captured a non-empty value is yielded.

    :param stream:
        The string to match against

    :param patterns:
        An iterable of regular expressions (strings or compiled `re.Pattern`) whose named regex groups
        (`(?P<GROUP_NAME>matches)`) are extracted. Only the source text of a compiled pattern is used: its own
        flags are replaced by `flags`.

    :param in_order:
        Setting causes patterns to only match those patterns appearing *after* the last matching pattern.

    :param index:
        The index to begin searching for patterns

    :param flags:
        Regex flags used to compile the combined pattern

    :param foldspaces:
        Controls if tabs & spaces inside `str` patterns (ignores `re.Pattern`) match any number of tabs /or/ spaces.

    :return: Iterator of ``(group_name, value)`` pairs, in the order they occur in `stream`
    """
    string = stream[index:].replace("\r\n", os.linesep)

    sources = []
    for candidate in patterns:
        if isinstance(candidate, re.Pattern):
            # Compiled patterns are taken verbatim, as documented for `foldspaces`
            sources.append(candidate.pattern)
        elif foldspaces:
            # Runs of blanks outside of a `[...]` class match any run of spaces/tabs
            sources.append(re.sub(r'(?<!\[)[ \t]+(?!\])', r'[ \t]+', candidate))
        else:
            sources.append(candidate)

    pattern = re.compile('|'.join(map('(?:{})'.format, sources)), flags=flags)

    # Position (within the combined pattern) of the last group that was yielded
    last_group_index = -1

    # Search, across lines if necessary
    for match in pattern.finditer(string):
        for group_name, value in match.groupdict(None).items():
            if group_name and value:
                group_index = pattern.groupindex[group_name]

                if in_order:
                    # Skip groups that belong to a pattern listed before the last one matched
                    if group_index < last_group_index:
                        continue
                    last_group_index = group_index

                yield group_name, value
120
+
121
+
122
def get_func_name(f: t.Callable) -> str:
    """Return a dotted name for `f`, built from the pair returned by `get_func_qual`.

    The result is ``"<module>.<name>"`` when a module could be determined and
    just ``"<name>"`` otherwise.

    :raises ValueError: no name could be determined for `f`
    """
    module, export_name = get_func_qual(f)

    if not export_name:
        raise ValueError("Could not find function name")

    return f"{module}.{export_name}" if module else export_name
131
+
132
+
133
def get_func_qual(func) -> t.Tuple[str, str]:
    """Return a ``(module, name)`` pair of strings identifying `func`.

    ``module`` is the name of the module defining the function (an empty
    string when it cannot be determined). For functions defined in
    ``__main__`` the mangled path of the source file is appended so that
    different scripts do not collide. ``name`` is the function's name, or
    ``'unknown'`` when it has none.
    """
    # Unwrap `functools.partial` objects (anything exposing a `.func` attribute)
    while hasattr(func, 'func'):
        func = func.func

    if hasattr(func, '__module__'):
        module = func.__module__
    else:
        try:
            # `inspect.getmodule` returns a module object (or None); reduce it
            # to its name so the declared `Tuple[str, str]` contract holds.
            found = inspect.getmodule(func)
            module = found.__name__ if found is not None else None
        except TypeError:
            if hasattr(func, '__class__'):
                module = func.__class__.__module__
            else:
                module = 'unknown'
    if module is None:
        # Happens in doctests, eg
        module = ''
    elif module == '__main__':
        try:
            filename = os.path.abspath(inspect.getsourcefile(func))
        except Exception:
            # No source file available (e.g. built-ins, interactive definitions).
            # `Exception` rather than a bare `except` so that KeyboardInterrupt
            # and SystemExit are not swallowed.
            filename = None
        if filename is not None:
            # mangling of full path to filename
            parts = filename.split(os.sep)

            if parts[-1].startswith('<ipython-input'):
                # We're in a IPython (or notebook) session. parts[-1] comes
                # from func.__code__.co_filename and is of the form
                # <ipython-input-N-XYZ>, where:
                # - N is the cell number where the function was defined
                # - XYZ is a hash representing the function's code (and name).
                #   It will be consistent across sessions and kernel restarts,
                #   and will change if the function's code/name changes
                # We remove N so that cache is properly hit if the cell where
                # the func is defined is re-executed.
                # The XYZ hash should avoid collisions between functions with
                # the same name, both within the same notebook but also across
                # notebooks
                splitted = parts[-1].split('-')
                parts[-1] = '-'.join(splitted[:2] + splitted[3:])
            elif len(parts) > 2 and parts[-2].startswith('ipykernel_'):
                # In a notebook session (ipykernel). Filename seems to be 'xyz'
                # of above. parts[-2] has the structure ipykernel_XXXXXX where
                # XXXXXX is a six-digit number identifying the current run (?).
                # If we split it off, the function again has the same
                # identifier across runs.
                parts[-2] = 'ipykernel'
            filename = '-'.join(parts)
            if filename.endswith('.py'):
                filename = filename[:-3]
            module = '{}-{}'.format(module, filename)

    # `func_name`, `func_globals` and `im_class` are Python 2 era attribute
    # names; they are only honoured when an object actually provides them.
    if hasattr(func, 'func_name'):
        name = func.func_name
    elif hasattr(func, '__name__'):
        name = func.__name__
    else:
        name = 'unknown'

    # XXX maybe add a warning here? this is a hack to detect functions not defined at the module-level
    if hasattr(func, 'func_globals') and name in func.func_globals:
        if func.func_globals[name] is not func:
            name = '%s-alias' % name

    if inspect.ismethod(func):
        # We need to add the name of the class
        if hasattr(func, 'im_class'):
            klass = func.im_class  # type: ignore
            module = '{}{}'.format(module, klass.__name__)

    return module, name
207
+
208
+
209
def build_data_uri(data: bytes, mimetype: t.Optional[str] = None) -> str:
    """Encode `data` as an RFC2397-compatible base64 "data" URI.

    The media type part is left empty when `mimetype` is missing or empty.
    """
    media_type = mimetype or ""
    payload = base64.b64encode(data).decode("ascii")
    return f"data:{media_type};base64,{payload}"
212
+
213
+
214
def guess_mimetype(data: t.Union[bytes, 'GenericPathLike']) -> t.Optional[str]:
    """Guess the MIME type of a file based on its contents, using the `file` command.

    :param data: Either the raw content as `bytes` (fed to `file` through stdin) or a path to the file to inspect
    :return: The MIME type reported by `file`, or `None` when the output is empty or the `file` executable is
        not installed
    """
    # Decide on the type, not on truthiness: empty bytes must still go through
    # stdin instead of being turned into the bogus file name "b''".
    from_stdin = isinstance(data, bytes)
    try:
        output = subprocess.check_output(
            [
                "file",
                "--brief",
                "--mime-type",
                "-" if from_stdin else str(data),
            ],
            input=data if from_stdin else None,
            timeout=5,
        )
    except FileNotFoundError as e:
        # The `file` executable itself is missing
        logger.error(e)
        return None

    return output.decode("ascii").strip() or None
231
+
232
+
233
def is_fifo(path) -> bool:
    """Check if a path is a FIFO

    Returns `False` for anything that is not a `str` / `os.PathLike`, and for
    paths that do not exist or cannot be inspected.
    """
    if not isinstance(path, (str, os.PathLike)):
        return False

    # A single `stat` call instead of `exists()` followed by `stat()`: the path
    # may disappear between the two calls, which would raise.
    try:
        mode = os.stat(path).st_mode
    except (OSError, ValueError):
        return False

    return stat.S_ISFIFO(mode)
239
+
240
+
241
def get_open_port() -> int:
    """Ask the operating system for a currently unused TCP port number.

    A socket is bound to port 0 on all interfaces, which makes the OS pick a
    port; the socket is closed again before returning.
    """
    import socket

    probe = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM)
    try:
        probe.bind(("", 0))
        address = probe.getsockname()
        return address[1]
    finally:
        probe.close()
248
+
249
+
250
def resource_path(*paths, where: t.Optional[str] = None, strict: bool = False) -> str:
    """Return the absolute, resolved path relative to the caller's file

    :param paths: Path components joined below the directory containing `where`
    :param where: File whose directory is used as the root. Defaults to the file of the calling function.
    :param strict: Accepted for backward compatibility only; it has no effect (see Notes).

    Example
    -------

    If `resource_path` was called within the file `/tmp/test-engine/engine.py`:

    >>> resource_path("vendor/file")
    /tmp/test-engine/vendor/file

    Using `where` to manually specify the relative root file:

    >>> resource_path("other", where=__file__)
    /tmp/test-engine/other

    Notes
    -----

    The `strict` parameter was used by some engines to produce a warning about missing files/folders which are
    required for correct operation of the engine. However, this has been removed because it breaks non-worker users
    of engines.
    """
    if where is None:
        import inspect

        # Bound up front so the `del` below cannot raise `UnboundLocalError`
        # (and hide the real error) if `inspect.stack` fails.
        caller = None
        try:
            caller = inspect.stack(context=1)[1]
            where = caller.filename
        finally:
            # NOTE: The documentation isn't detailed enough to determine if needed
            # see: https://docs.python.org/3/library/inspect.html#the-interpreter-stack
            del caller

    return os.path.realpath(os.path.join(os.path.dirname(where), *paths))
284
+
285
+
286
def poll(
    target: 'PollTargetCallable',
    args=(),
    kwargs=None,
    step: 't.Union[float, int]' = 1,
    timeout: 't.Optional[t.Union[int, float, dt.timedelta]]' = None,
    expiration: 't.Optional[dt.datetime]' = None,
    max_tries: 't.Optional[int]' = None,
    check_success: 'PollCheckCallable' = lambda x: x is not None,
    step_function: 'PollStepCallable' = lambda s: s,
    ignore_exceptions: 't.Tuple[t.Type[BaseException], ...]' = tuple(),
) -> 'PollResultT':
    """Poll by calling a target function until a certain condition is met.

    You must specify at least a target function to be called and one of
    `timeout`, `expiration` or `max_tries`.

    :param target: The function to call on every iteration

    :param args: Arguments to be passed to the target function

    :param kwargs: Keyword arguments to be passed to the target function

    :param step: Step defines the amount of time to wait (in seconds)

    :param timeout: The target function will be called until the time elapsed is
        greater than the maximum timeout (in seconds, or a `timedelta`).

    :param expiration: The target function will be called until the time is after
        the expiration if non-`None`. It is compared against an aware UTC "now",
        so it must be a timezone-aware `datetime`. When both `timeout` and
        `expiration` are given, the earlier deadline wins.

    :param max_tries: Maximum number of times the target function will be called
        before failing

    :param check_success: A callback function that accepts the return value of
        the target function. It should return true if you want the polling
        function to stop and return this value. It should return false if you
        want it to continue executing. The default accepts any value that is
        not `None`.

    :param step_function: A callback function that accepts each iteration's
        "step." By default, this is constant, but you can also pass a function
        that will increase or decrease the step.

    :param ignore_exceptions: You can specify a tuple of exceptions that should
        be caught and ignored on every iteration. If the target function raises
        one of these exceptions, it is logged and remembered as the last item.
        Any other exceptions raised will be raised as normal.

    :return: The first value from the target function that meets the condition
        of the `check_success` callback. By default, this is the first value
        that is not `None`.

    :raises AssertionError: none of `expiration`, `timeout`, `max_tries` was given
    :raises EngineMaxCallException: `max_tries` calls were made without success;
        carries the last value (or ignored exception)
    :raises EngineExpiredException: the deadline passed without success;
        carries the last value (or ignored exception)

    Note
    ----

    The actual execution time of the function *can* exceed the time specified in
    the timeout or expiration. For instance, if the target function takes 10
    seconds to execute and the timeout is 21 seconds, the polling function will
    take a total of 30 seconds (two iterations of the target --20s which is less
    than the timeout--21s, and a final iteration).
    """
    # Raised explicitly instead of using `assert`: assertions are stripped under
    # `python -O`, which would let an unbounded poll loop forever. The exception
    # type is kept for backward compatibility.
    if expiration is None and timeout is None and max_tries is None:
        raise AssertionError('You did not specify an expiration, maximum number of tries or a timeout.')

    if timeout is not None:
        if isinstance(timeout, (int, float)):
            timeout = dt.timedelta(seconds=timeout)

        timeout_dt = dt.datetime.now(tz.utc) + timeout

        if expiration is None:
            expiration = timeout_dt
        else:
            expiration = min(expiration, timeout_dt)
            logger.debug(f"Using minimum of expiration & timeout ({expiration:%c})")

    tries = 0
    kwargs = kwargs or dict()

    logger.debug("Begin polling on %s(expiration=%s, tries=%d, max_tries=%s)", target, expiration, tries, max_tries)

    # Last value returned by (or ignored exception raised from) the target
    last_item = None

    while True:
        # Only relevant before the first call (e.g. `max_tries=0`); later
        # iterations are stopped by the check after the call below.
        if max_tries is not None and tries >= max_tries:
            raise EngineMaxCallException(last_item)

        try:
            val = target(*args, **kwargs)
            last_item = val
        except ignore_exceptions as e:
            last_item = e
            logger.error("poll() ignored exception %r", e)
        else:
            # Condition passes, this is the only "successful" exit from the polling function
            if check_success(val):
                logger.debug("Success, returning %s(tries=%d)", target, tries)
                return val
            else:
                logger.debug("Failed, continuing %s(tries=%d)", target, tries)

        tries += 1
        logger.debug("%s(expiration=%s, tries=%d, max_tries=%s)", target, expiration, tries, max_tries)

        # Check the max tries at this point so it will not sleep before raising the exception
        if max_tries is not None and tries >= max_tries:
            raise EngineMaxCallException(last_item)

        # Check the time after to make sure the poll function is called at least once
        if expiration is not None and dt.datetime.now(tz.utc) >= expiration:
            raise EngineExpiredException(last_item)

        time.sleep(step)
        step = step_function(step)
401
+
402
+
403
def poll_decorator(
    extract_poll_kwargs: t.Optional[t.Tuple[str, ...]] = (
        "expiration",
        "max_tries",
        "step",
        "step_function",
        "timeout",
    ),
    **poll_kwargs,
):
    """Use poll() as a decorator.

    :param extract_poll_kwargs: Tuple of keys which are popped from the
        decorated wrapper function's keyword args (``kwargs``) and merged
        with the keyword args passed to `poll` (``poll_kwargs``). The merge
        applies to that single call only. Pass `None` or an empty tuple to
        forward every keyword argument to the target untouched.

    :param poll_kwargs: Default keyword args forwarded to `poll` on every call

    :return: decorator using poll()"""

    def decorator(target):

        @functools.wraps(target)
        def wrapper(*args, **kwargs):
            # Work on a per-call copy: writing overrides into the shared
            # `poll_kwargs` would leak one call's settings into later calls.
            call_poll_kwargs = dict(poll_kwargs)

            for poll_keyword in extract_poll_kwargs or ():
                if poll_keyword in kwargs:
                    call_poll_kwargs[poll_keyword] = kwargs.pop(poll_keyword)

            return poll(target=target, args=args, kwargs=kwargs, **call_poll_kwargs)

        return wrapper

    return decorator