azul-client 9.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ """Module for interacting with binary endpoints."""
2
+
3
+ import copy
4
+ import json
5
+ import logging
6
+ import os
7
+ import re
8
+ import struct
9
+ from dataclasses import dataclass
10
+ from http import HTTPMethod
11
+ from io import BytesIO
12
+ from pathlib import Path
13
+ from tempfile import SpooledTemporaryFile
14
+ from typing import IO
15
+
16
+ import cart
17
+ import httpx
18
+ import malpz
19
+ import pendulum
20
+ import tenacity
21
+ from azul_bedrock import models_network as azm
22
+ from azul_bedrock import models_restapi
23
+
24
+ from azul_client import exceptions
25
+ from azul_client.api.base_api import BaseApiHandler
26
+
27
+ logger = logging.getLogger(__name__)
28
+ DEFAULT_MAX_BYTES_TO_READ = 10 * 1024 * 1024 # 10MB worth of strings.
29
+
30
+
31
+ class _OpenFile:
32
+ """A handler for a potential filepath, raw bytes or a file handle."""
33
+
34
+ handle: IO[bytes] | None
35
+
36
+ def __init__(self, file_path_or_contents: IO[bytes] | bytes | str | Path):
37
+ self.opened_file = False
38
+ self.file_path_or_contents = file_path_or_contents
39
+ self.handle = None
40
+
41
+ def _get_file_handle(self, file_path_or_contents: Path | str | IO[bytes] | bytes):
42
+ if isinstance(file_path_or_contents, bytes):
43
+ self.handle = BytesIO(file_path_or_contents)
44
+ elif isinstance(file_path_or_contents, Path):
45
+ if not file_path_or_contents.exists():
46
+ raise FileExistsError(f"The file with the path {file_path_or_contents=} does not exist.")
47
+ self.opened_file = True
48
+ self.handle = file_path_or_contents.open(mode="rb")
49
+ elif isinstance(file_path_or_contents, str):
50
+ if not os.path.exists(file_path_or_contents):
51
+ raise FileExistsError(f"The file with the path {file_path_or_contents=} does not exist.")
52
+ self.handle = open(file_path_or_contents, mode="rb")
53
+ elif isinstance(file_path_or_contents, SpooledTemporaryFile):
54
+ self.handle = file_path_or_contents
55
+ else:
56
+ # Will already be IO[bytes]
57
+ self.handle = file_path_or_contents
58
+
59
+ def open(self) -> IO[bytes]:
60
+ """Open or provide the handle to a file."""
61
+ self._get_file_handle(self.file_path_or_contents)
62
+ return self.handle
63
+
64
+ def close(self):
65
+ """If a file was opened close it."""
66
+ if self.opened_file:
67
+ try:
68
+ if self.handle.closed:
69
+ self.handle.close()
70
+ except Exception:
71
+ print("Failed to close a file.")
72
+ raise
73
+
74
+ def __enter__(self):
75
+ """Open or provide the handle to a file."""
76
+ return self.open()
77
+
78
+ def __exit__(self, exc_type, exc_val, exc_tb):
79
+ """If a file was opened close it."""
80
+ self.close()
81
+
82
+
83
@dataclass
class AugmentedStream:
    """An augmented way of viewing the original binary.

    E.g: the binary is compiled C# code and an augmented stream is a decompiled version of the code.
    E.g2: the original file is a jpg and the augmented stream is a png which has had any potential malware removed.
    """

    # Label categorising what kind of augmented view this stream is.
    label: azm.DataLabel
    # File name to report for this stream when it is uploaded.
    file_name: str
    # The stream's data: a path, raw bytes, an open binary handle, or an
    # _OpenFile wrapper (substituted in by _OpenAugmentedStreams).
    contents_file_path: IO[bytes] | bytes | str | Path | _OpenFile
94
+
95
+
96
class _OpenAugmentedStreams:
    """Context manager that prepares AugmentedStream contents for upload.

    On entry, returns shallow copies of the input streams whose
    ``contents_file_path`` is replaced with an ``_OpenFile`` wrapper so each
    can be opened lazily; on exit, closes any handles that were opened
    (best-effort — a failure to close one stream does not stop the others).
    """

    # Shallow copies of the input streams with openable contents.
    opened_streams: list[AugmentedStream]

    def __init__(self, streams: list[AugmentedStream]):
        self._in_streams = streams
        self.opened_streams = []
        self.file_handles = []

    def __enter__(self) -> list[AugmentedStream]:
        for stream in self._in_streams:
            # Open the file or contents
            open_file = _OpenFile(stream.contents_file_path)
            self.file_handles.append(open_file)
            # Shallow-copy so the caller's stream object is not mutated.
            stream_copy = copy.copy(stream)
            stream_copy.contents_file_path = open_file
            self.opened_streams.append(stream_copy)
        return self.opened_streams

    def __exit__(self, exc_type, exc_val, exc_tb):
        for fh in self.file_handles:
            try:
                fh.close()
            except Exception:
                # Best-effort cleanup: log and keep closing the remainder.
                logger.warning("Failed to close one of the Augmented streams.")
123
+
124
+
125
class BinariesData(BaseApiHandler):
    """Interact with binary endpoints."""

    # Matches a full 64-character hex sha256 digest.
    SHA256_regex = r"^[a-fA-F0-9]{64}$"
    # Generous timeout (seconds) for uploads/downloads of binary content.
    upload_download_timeout = 120

    def check_data(self, sha256: str) -> bool:
        """Check data exists for hash."""
        return self._generic_head_request(self.cfg.azul_url + f"/api/v0/binaries/{sha256}/content")

    def download(self, sha256: str) -> bytes:
        """Download binary with the given sha256 in cart format."""
        return self._request(
            method=HTTPMethod.GET,
            url=self.cfg.azul_url + f"/api/v0/binaries/{sha256}/content",
            timeout=self.upload_download_timeout,
        ).content

    def download_bulk(self, hashes: list[str]) -> bytes:
        """Download multiple binaries with the given list of sha256 hashes."""
        return self._request(
            method=HTTPMethod.POST,
            url=self.cfg.azul_url + "/api/v0/binaries/content/bulk",
            json={"binaries": hashes},
            timeout=self.upload_download_timeout,
        ).content

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(3),
        wait=tenacity.wait_random(min=1, max=2),
        retry=tenacity.retry_if_exception_type(httpx.TimeoutException),
        before_sleep=tenacity.before_sleep_log(logger, logging.WARNING),
        reraise=True,
    )
    def _base_upload(
        self,
        body: dict,
        *,
        api: str,
        file_path_or_contents: Path | str | bytes | IO[bytes] | SpooledTemporaryFile | None = None,
        augmented_streams: list[AugmentedStream] | None = None,
        filename: str | None = None,
        password: str = "",
        extract: bool = False,
        refresh: bool = False,
    ) -> models_restapi.BinaryData:
        """Shared implementation behind the ``upload*`` entry points.

        Detects whether the payload is already neutered (.malpz or .cart) and,
        if not, packs it into a CaRT locally before sending it over the
        network.  Retries up to 3 times on HTTP timeouts.

        :param dict body: Form fields for the request; None values are dropped.
        :param str api: API path appended to the configured base URL.
        :param file_path_or_contents: The binary to upload (path, bytes or handle), if any.
        :param augmented_streams: Alternate views of the binary to upload alongside it.
        :param str filename: Name to report for the uploaded binary.
        :param str password: Password for encrypted archives when extracting.
        :param bool extract: Treat the upload as an archive to be extracted server-side.
        :param bool refresh: Ask the server to refresh existing metadata.
        :raises: the exception built by ``exceptions.bad_response`` on non-200/206 responses.
        """
        if not augmented_streams:
            augmented_streams = []

        # Drop None-valued fields; iterate a list() copy since we mutate body.
        for k, v in list(body.items()):
            if v is None:
                body.pop(k)

        with _OpenFile(file_path_or_contents) as file_handle:
            safe_file = None
            if file_handle is not None:
                # Try to identify this as either a .malpz or .cart
                is_malpz = False
                malpz_header_length = len(malpz.MALPZ_HEADER)
                malpz_header = file_handle.read(malpz_header_length)
                if len(malpz_header) == malpz_header_length:
                    # File too small otherwise
                    try:
                        malpz.validate_version(malpz_header)
                        is_malpz = True
                    except malpz.MetadataException:
                        pass
                file_handle.seek(0)

                cart_header_length = struct.calcsize(cart.MANDATORY_HEADER_FMT)
                cart_header = file_handle.read(cart_header_length)
                is_cart = cart.is_cart(cart_header)
                file_handle.seek(0)

                if not extract and not is_malpz and not is_cart:
                    # This file is not neutered; do this now before sending over the network
                    safe_file = SpooledTemporaryFile(max_size=1000 * 1000)
                    try:
                        logger.debug("Packing file as a .CaRT...")
                        cart.pack_stream(file_handle, safe_file)
                        logger.debug("CaRT size: %s", safe_file.tell())
                        safe_file.seek(0)
                    except BaseException:
                        # Avoid leaving temp files in case CaRTing fails. httpx should
                        # handle our file handle otherwise.
                        safe_file.close()
                        raise
                else:
                    # Use the original data source
                    safe_file = file_handle

            with _OpenAugmentedStreams(augmented_streams) as opened_streams:
                stream_data = [
                    ("stream_data", (s.file_name, s.contents_file_path.open(), "application/octet-stream"))
                    for s in opened_streams
                ]
                body["stream_labels"] = [s.label for s in opened_streams]

                main_file = []
                # If there is any contents.
                if safe_file:
                    main_file.append(("binary", (filename, safe_file, "application/octet-stream")))
                resp = self._request_upload(
                    url=self.cfg.azul_url + api,
                    params={"refresh": refresh, "extract": extract, "password": password},
                    files=main_file + stream_data,
                    data=body,
                    timeout=self.upload_download_timeout,
                )

        # 200 = fully accepted, 206 = partial success; anything else is an error.
        if resp.status_code not in (200, 206):
            raise exceptions.bad_response(resp)

        entity = resp.json()[0]
        # Older responses may omit "id"; fall back to the sha256.
        if not entity.get("id"):
            entity["id"] = entity.get("sha256")
        return models_restapi.BinaryData.model_validate(entity)

    def upload(
        self,
        file_path_or_contents: Path | str | bytes | IO[bytes] | SpooledTemporaryFile,
        source_id: str,
        *,
        references: dict[str, str] | None = None,
        submit_settings: dict[str, str] | None = None,
        augmented_streams: list[AugmentedStream] | None = None,
        filename: str | None = None,
        timestamp: str | None = None,
        security: str,
        extract: bool = False,
        password: str | None = None,
        refresh: bool = False,
        exclude_security_labels: list[str] | None = None,
        include_queries: bool = False,
    ) -> models_restapi.BinaryData:
        """Upload binary handle with corresponding form data.

        :raises ValueError: if no filename is given for a non-archive upload
            without streams, or source_id/security are invalid.
        """
        # If there are no augmented streams and the file isn't being extracted there must be a filename.
        if not augmented_streams and not extract and not filename:
            raise ValueError("If the upload isn't an archive and you aren't uploading streams a filename is required.")

        if not source_id:
            raise ValueError(f"{source_id=} is required to be a valid value.")

        if security is not None and not isinstance(security, str):
            raise ValueError("Security must be a string value.")

        if not timestamp:
            timestamp = pendulum.now(pendulum.UTC).to_iso8601_string()

        # The API expects JSON-encoded strings for these dict fields.
        references = json.dumps(references) if references else None
        submit_settings = json.dumps(submit_settings) if submit_settings else None

        return self._base_upload(
            body=dict(
                source_id=source_id,
                references=references,
                settings=submit_settings,
                timestamp=timestamp,
                security=security,
                exclude_security_labels=exclude_security_labels,
                include_queries=include_queries,
                filename=filename,
            ),
            api="/api/v0/binaries/source",
            file_path_or_contents=file_path_or_contents,
            augmented_streams=augmented_streams,
            filename=filename,
            extract=extract,
            password=password,
            refresh=refresh,
        )

    def upload_dataless(
        self,
        binary_id: str,  # sha256 of the binary to have its metadata updated
        source_id: str,
        *,
        references: dict[str, str] | None = None,
        augmented_streams: list[AugmentedStream] | None = None,
        filename: str | None = None,
        timestamp: str | None = None,
        security: str,
        refresh: bool = False,
        exclude_security_labels: list[str] | None = None,
        include_queries: bool = False,
    ) -> models_restapi.BinaryData:
        """Upload new metadata and potentially alt-streams for a binary.

        :raises ValueError: if binary_id is not a valid sha256 or security is invalid.
        """
        # Fixed: was `if not binary_id and re.search(...)`, which could never
        # trigger for a non-empty id; mirror the upload_child() validation.
        if not binary_id or not re.search(self.SHA256_regex, binary_id):
            raise ValueError(f"{binary_id=} must be set to a valid sha256 value.")

        if security is not None and not isinstance(security, str):
            raise ValueError("Security must be a string value.")

        if not timestamp:
            timestamp = pendulum.now(pendulum.UTC).to_iso8601_string()

        references = json.dumps(references) if references else None

        return self._base_upload(
            body=dict(
                sha256=binary_id,
                source_id=source_id,
                references=references,
                timestamp=timestamp,
                security=security,
                exclude_security_labels=exclude_security_labels,
                include_queries=include_queries,
                filename=filename,
            ),
            api="/api/v0/binaries/source/dataless",
            augmented_streams=augmented_streams,
            filename=filename,
            refresh=refresh,
        )

    def upload_child(
        self,
        file_path_or_contents: Path | str | bytes | IO[bytes],
        parent_sha256: str,
        relationship: dict[str, str],
        *,
        submit_settings: dict[str, str] | None = None,
        parent_type: str = "binary",
        filename: str | None = None,
        timestamp: str | None = None,
        security: str,
        extract: bool = False,
        password: str | None = None,
        refresh: bool = False,
        exclude_security_labels: list[str] | None = None,
        include_queries: bool = False,
    ) -> models_restapi.BinaryData:
        """Upload a child binary and attach it to the parent binary with the provided sha256 ID.

        :raises ValueError: if parent_sha256/relationship/filename/security are invalid.
        """
        if not parent_sha256 or not re.search(self.SHA256_regex, parent_sha256):
            raise ValueError(f"{parent_sha256=} must be set to a valid sha256 value.")

        if not relationship:
            raise ValueError(f"{relationship=} must be a dictionary with at least one key value pair.")
        relationship = json.dumps(relationship) if relationship else None
        submit_settings = json.dumps(submit_settings) if submit_settings else None

        if not extract and not filename:
            raise ValueError("If the upload isn't an archive a filename is required.")

        if security is not None and not isinstance(security, str):
            raise ValueError("Security must be a string value.")

        if not timestamp:
            timestamp = pendulum.now(pendulum.UTC).to_iso8601_string()

        return self._base_upload(
            body=dict(
                timestamp=timestamp,
                security=security,
                exclude_security_labels=exclude_security_labels,
                include_queries=include_queries,
                relationship=relationship,
                settings=submit_settings,
                parent_type=parent_type,
                parent_sha256=parent_sha256,
                filename=filename,
            ),
            api="/api/v0/binaries/child",
            file_path_or_contents=file_path_or_contents,
            filename=filename,
            extract=extract,
            password=password,
            refresh=refresh,
        )

    def expedite_processing(self, sha256: str, *, bypass_cache: bool = False) -> None:
        """Expedite or reprocess the file with the provided sha256.

        If bypass_cache is on ensure plugins actually re-process the binary and don't rely on the cache.
        """
        self._request(
            method=HTTPMethod.POST,
            url=self.cfg.azul_url + f"/api/v0/binaries/{sha256}/expedite",
            params={"bypass_cache": bypass_cache},
        )

    def download_augmented_stream(self, sha256: str, stream_sha256: str) -> bytes:
        """Download the raw augmented stream for a given submission binary.

        First sha256 is the sha256 of the binary and the second sha256 is that of the augmented stream.
        """
        return self._request(
            method=HTTPMethod.GET,
            url=self.cfg.azul_url + f"/api/v0/binaries/{sha256}/content/{stream_sha256}",
        ).content

    def download_hex(
        self, sha256: str, *, offset: int = 0, max_bytes_to_read: int | None = None, shortform: bool = False
    ) -> models_restapi.BinaryHexView:
        """Download either all or a section of the raw hex of a file.

        Typically used in chunks to download sections of a file until either you have the whole file or have
        enough information from the file.

        :param str sha256: sha256 of the file you want to download hex for.
        :param int offset: starting offset for where to download the hex from.
        :param int max_bytes_to_read: bytes to read before stopping and returning what you have.
        :param bool shortform: If true, will return 16 hex bytes as a string instead of 16 strings in a list.
        """
        params = {"offset": offset, "max_bytes_to_read": max_bytes_to_read, "shortform": shortform}
        params = self.filter_none_values(params)

        return self._request_with_pydantic_model_response(
            method=HTTPMethod.GET,
            url=self.cfg.azul_url + f"/api/v0/binaries/{sha256}/hexview",
            response_model=models_restapi.BinaryHexView,
            params=params,
        )

    def get_strings(
        self,
        sha256: str,
        *,
        min_length: int = 4,
        max_length: int = 200,
        offset: int = 0,
        max_bytes_to_read: int = DEFAULT_MAX_BYTES_TO_READ,
        take_n_strings: int = 1000,
        filter: str | None = None,
        regex: str | None = None,
        file_format_legacy: str | None = None,
    ) -> models_restapi.BinaryStrings:
        """Get strings for a binary file with multiple potential additional parameters.

        :param str sha256: File to get strings for.
        :param int min_length: Minimum length of string (when decoded).
        :param int max_length: Maximum length of string (when decoded).
        :param int offset: Search for strings from offset.
        :param int max_bytes_to_read: How many bytes to search for, default of 10MB.
        :param int take_n_strings: How many strings to return.
        :param str filter: Case-insensitive search string to filter strings with.
        :param str regex: Regex pattern to search strings with.
        :param str file_format_legacy: Optional file type for AI string filter.
        """
        params = {
            "min_length": min_length,
            "max_length": max_length,
            "offset": offset,
            "max_bytes_to_read": max_bytes_to_read,
            "take_n_strings": take_n_strings,
            "filter": filter,
            "regex": regex,
            "file_format_legacy": file_format_legacy,
        }
        params = self.filter_none_values(params)

        return self._request_with_pydantic_model_response(
            method=HTTPMethod.GET,
            url=self.cfg.azul_url + f"/api/v0/binaries/{sha256}/strings",
            response_model=models_restapi.BinaryStrings,
            params=params,
        )

    def search_hex(
        self,
        sha256: str,
        filter: str,
        *,
        offset: int = 0,
        max_bytes_to_read: int | None = None,
        take_n_hits: int = 1000,
    ) -> models_restapi.BinaryStrings:
        """Search a hex file with the given filter.

        :param str sha256: Search the data file that has this sha256.
        :param str filter: Search a hex file with the given hex string filter.
        :param int offset: Search for hits from offset.
        :param int max_bytes_to_read: How many bytes to search for, if this is not set, return to EOF.
        :param int take_n_hits: Maximum number of hits to return.
        """
        params = {
            "offset": offset,
            "max_bytes_to_read": max_bytes_to_read,
            "take_n_hits": take_n_hits,
            "filter": filter,
        }
        params = self.filter_none_values(params)

        return self._request_with_pydantic_model_response(
            method=HTTPMethod.GET,
            url=self.cfg.azul_url + f"/api/v0/binaries/{sha256}/search/hex",
            response_model=models_restapi.BinaryStrings,
            params=params,
        )