credsweeper 1.11.1__py3-none-any.whl → 1.11.3__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of credsweeper might be problematic.
- credsweeper/__init__.py +1 -1
- credsweeper/__main__.py +6 -4
- credsweeper/app.py +7 -3
- credsweeper/common/keyword_pattern.py +26 -12
- credsweeper/common/morpheme_checklist.txt +4 -2
- credsweeper/credentials/line_data.py +14 -10
- credsweeper/deep_scanner/abstract_scanner.py +10 -1
- credsweeper/deep_scanner/deep_scanner.py +19 -8
- credsweeper/deep_scanner/docx_scanner.py +1 -1
- credsweeper/deep_scanner/encoder_scanner.py +2 -2
- credsweeper/deep_scanner/html_scanner.py +3 -3
- credsweeper/deep_scanner/jks_scanner.py +2 -4
- credsweeper/deep_scanner/lang_scanner.py +2 -2
- credsweeper/deep_scanner/lzma_scanner.py +40 -0
- credsweeper/deep_scanner/pkcs12_scanner.py +3 -5
- credsweeper/deep_scanner/xml_scanner.py +2 -2
- credsweeper/file_handler/data_content_provider.py +21 -12
- credsweeper/filters/__init__.py +0 -1
- credsweeper/filters/group/group.py +3 -4
- credsweeper/filters/group/url_credentials_group.py +2 -3
- credsweeper/filters/value_allowlist_check.py +6 -7
- credsweeper/filters/value_array_dictionary_check.py +3 -1
- credsweeper/filters/value_azure_token_check.py +1 -2
- credsweeper/filters/value_base64_part_check.py +30 -21
- credsweeper/filters/value_discord_bot_check.py +1 -2
- credsweeper/filters/value_entropy_base32_check.py +11 -31
- credsweeper/filters/value_entropy_base36_check.py +11 -34
- credsweeper/filters/value_entropy_base64_check.py +19 -48
- credsweeper/filters/{value_first_word_check.py → value_entropy_base_check.py} +13 -14
- credsweeper/filters/value_file_path_check.py +1 -1
- credsweeper/filters/value_hex_number_check.py +3 -3
- credsweeper/filters/value_json_web_token_check.py +4 -5
- credsweeper/filters/value_string_type_check.py +11 -3
- credsweeper/filters/value_token_base32_check.py +0 -4
- credsweeper/filters/value_token_base36_check.py +0 -4
- credsweeper/filters/value_token_base64_check.py +0 -4
- credsweeper/filters/value_token_check.py +1 -1
- credsweeper/ml_model/features/file_extension.py +1 -1
- credsweeper/ml_model/features/morpheme_dense.py +0 -4
- credsweeper/ml_model/features/rule_name.py +1 -1
- credsweeper/ml_model/features/word_in_path.py +0 -9
- credsweeper/ml_model/features/word_in_postamble.py +0 -11
- credsweeper/ml_model/features/word_in_preamble.py +0 -11
- credsweeper/ml_model/features/word_in_transition.py +0 -11
- credsweeper/ml_model/features/word_in_value.py +0 -11
- credsweeper/ml_model/features/word_in_variable.py +0 -11
- credsweeper/ml_model/ml_validator.py +4 -3
- credsweeper/rules/config.yaml +238 -208
- credsweeper/scanner/scan_type/scan_type.py +2 -3
- credsweeper/scanner/scanner.py +7 -1
- credsweeper/secret/config.json +16 -5
- credsweeper/utils/pem_key_detector.py +4 -5
- credsweeper/utils/util.py +67 -144
- {credsweeper-1.11.1.dist-info → credsweeper-1.11.3.dist-info}/METADATA +1 -1
- {credsweeper-1.11.1.dist-info → credsweeper-1.11.3.dist-info}/RECORD +58 -58
- credsweeper/utils/entropy_validator.py +0 -72
- {credsweeper-1.11.1.dist-info → credsweeper-1.11.3.dist-info}/WHEEL +0 -0
- {credsweeper-1.11.1.dist-info → credsweeper-1.11.3.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.11.1.dist-info → credsweeper-1.11.3.dist-info}/licenses/LICENSE +0 -0
credsweeper/scanner/scan_type/scan_type.py
CHANGED
@@ -38,13 +38,12 @@ class ScanType(ABC):
         raise NotImplementedError()
 
     @classmethod
-    def filtering(cls,
+    def filtering(cls, target: AnalysisTarget, line_data: LineData, filters: List[Filter]) -> bool:
         """Check if line data should be removed based on filters.
 
         If `use_filters` option is false, always return False
 
         Args:
-            config: dict of credsweeper configuration
             target: AnalysisTarget from which `line_data` was obtained
             line_data: Line data to check with `filters`
             filters: Filters to use
@@ -112,7 +111,7 @@ class ScanType(ABC):
             bypass_start = line_data.value_end
             bypass_end = offset_end
 
-            if config.use_filters and cls.filtering(
+            if config.use_filters and cls.filtering(target, line_data, filters):
                 if line_data.variable and 0 <= line_data.variable_start < line_data.variable_end:
                     # may be next matched item will be not filtered - let search it after variable
                     bypass_start = line_data.variable_end
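The `filtering` classmethod drops the `config` argument and now receives the analysis target directly, while the `use_filters` check stays at the call site. A minimal sketch of the new call shape follows; the `Filter.run(line_data, target)` signature and the True-means-reject convention are assumptions for illustration, not taken from this diff.

# Hypothetical sketch of the 1.11.3 filtering flow; names below are illustrative only.
from typing import List

class Filter:  # stand-in for the credsweeper filter protocol (assumed interface)
    def run(self, line_data, target) -> bool:
        """Return True when the candidate should be filtered out."""
        raise NotImplementedError()

def filtering(target, line_data, filters: List[Filter]) -> bool:
    """Mirrors ScanType.filtering: True means some filter rejects the line data."""
    return any(f.run(line_data, target) for f in filters)

# The call site keeps the config check, as in scan_type.py:
#     if config.use_filters and cls.filtering(target, line_data, filters): ...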
credsweeper/scanner/scanner.py
CHANGED
@@ -146,7 +146,13 @@ class Scanner:
         # "cache" - YAPF and pycharm formatters ...
         matched_keyword = \
             target_line_stripped_len >= self.min_keyword_len and (  #
-                '=' in target_line_stripped
+                '=' in target_line_stripped
+                or ':' in target_line_stripped
+                or "set" in target_line_stripped
+                or "#define" in target_line_stripped
+                or "%define" in target_line_stripped
+                or "%global" in target_line_stripped
+            )  #
         matched_pem_key = \
             target_line_stripped_len >= self.min_pem_key_len \
             and PEM_BEGIN_PATTERN in target_line_stripped and "PRIVATE" in target_line_stripped
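The quick pre-check that decides whether a stripped line is worth matching against the keyword patterns now accepts `:`-separated pairs, `set`-style assignments, and `#define`/`%define`/`%global` macros in addition to `=`. A small self-contained sketch of the same predicate; the length threshold is an assumed value for the example, not the library constant.

# Illustrative only - not the Scanner API; MIN_KEYWORD_LEN is assumed for this sketch.
MIN_KEYWORD_LEN = 10

def may_contain_keyword(target_line_stripped: str) -> bool:
    markers = ('=', ':', "set", "#define", "%define", "%global")
    return (len(target_line_stripped) >= MIN_KEYWORD_LEN
            and any(marker in target_line_stripped for marker in markers))

print(may_contain_keyword("%global api_token 1f2e3d4c"))  # True - macro-style assignment
print(may_contain_keyword("plain words only here"))       # False - no assignment marker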
credsweeper/secret/config.json
CHANGED
@@ -2,10 +2,13 @@
     "exclude": {
         "pattern": [],
         "containers": [
+            ".aar",
             ".apk",
             ".bz2",
             ".gz",
+            ".lzma",
             ".tar",
+            ".xz",
             ".zip"
         ],
         "documents": [
@@ -20,17 +23,20 @@
         ],
         "extension": [
             ".7z",
+            ".a",
             ".aac",
-            ".aar",
             ".avi",
+            ".bin",
             ".bmp",
             ".class",
             ".css",
             ".dmg",
             ".ear",
             ".eot",
+            ".elf",
             ".exe",
             ".gif",
+            ".gmo",
             ".ico",
             ".img",
             ".info",
@@ -45,6 +51,7 @@
             ".mp4",
             ".npy",
             ".npz",
+            ".obj",
             ".ogg",
             ".pak",
             ".png",
@@ -52,10 +59,13 @@
             ".pyc",
             ".pyd",
             ".pyo",
+            ".rar",
             ".rc",
             ".rc2",
             ".rar",
             ".realm",
+            ".res",
+            ".rpm",
             ".s7z",
             ".scss",
             ".so",
@@ -70,6 +80,7 @@
             ".wav",
             ".webm",
             ".webp",
+            ".wma",
             ".woff",
             ".yuv"
         ],
@@ -160,13 +171,13 @@
         "line_num",
         "path",
         "info",
-        "value",
-        "value_start",
-        "value_end",
         "variable",
         "variable_start",
         "variable_end",
-        "
+        "value",
+        "value_start",
+        "value_end",
+        "entropy"
     ],
     "candidate_output": [
         "rule",
credsweeper/utils/pem_key_detector.py
CHANGED
@@ -4,12 +4,11 @@ import re
 import string
 from typing import List
 
-from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN,
+from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, ENTROPY_LIMIT_BASE64
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
 from credsweeper.utils import Util
-from credsweeper.utils.entropy_validator import EntropyValidator
 
 logger = logging.getLogger(__name__)
 
@@ -68,10 +67,10 @@ class PemKeyDetector:
             elif PEM_END_PATTERN in subline:
                 if "PGP" in target.line_strip:
                     # Check if entropy is high enough for base64 set with padding sign
-
-                    if
+                    entropy = Util.get_shannon_entropy(key_data)
+                    if ENTROPY_LIMIT_BASE64 <= entropy:
                         return line_data
-                    logger.debug("Filtered with entropy %f '%s'",
+                    logger.debug("Filtered with entropy %f '%s'", entropy, key_data)
                 if "OPENSSH" in target.line_strip:
                     # Check whether the key is encrypted
                     with contextlib.suppress(Exception):
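With `EntropyValidator` no longer imported here, the PGP branch now gates the collected key body directly on `Util.get_shannon_entropy` compared against the `ENTROPY_LIMIT_BASE64` constant. A rough sketch of that gate; the threshold value below is an assumption for the example, not the real constant.

# Sketch only; ENTROPY_LIMIT_BASE64 is assumed to be 4.5 here for illustration.
import math
from collections import Counter

ENTROPY_LIMIT_BASE64 = 4.5

def shannon_entropy(data: str) -> float:
    counts = Counter(data)
    return -sum(c / len(data) * math.log2(c / len(data)) for c in counts.values())

key_data = "QUtJQUlPU0ZPRE5ON0VYQU1QTEU=" * 4  # toy stand-in for a gathered PEM body
if ENTROPY_LIMIT_BASE64 <= shannon_entropy(key_data):
    print("entropy high enough - keep the candidate")
else:
    print("filtered with entropy", shannon_entropy(key_data))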
credsweeper/utils/util.py
CHANGED
@@ -12,13 +12,14 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Optional, Union
 
+import numpy as np
 import whatthepatch
 import yaml
 from lxml import etree
 from typing_extensions import TypedDict
 
 from credsweeper.common.constants import DiffRowType, AVAILABLE_ENCODINGS, \
-    DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE
+    DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, ASCII
 
 logger = logging.getLogger(__name__)
 
@@ -65,21 +66,17 @@ class Util:
         return result
 
     @staticmethod
-    def get_shannon_entropy(data: str,
+    def get_shannon_entropy(data: Union[str, bytes]) -> float:
         """Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html."""
         if not data:
-            return 0
-
-
-
-
-            p_x = data.count(x) / data_len
-            if p_x > 0:
-                entropy += -p_x * math.log(p_x, 2)
-
+            return 0.
+        size = len(data)
+        _uniq, counts = np.unique(list(data), return_counts=True)
+        probabilities = counts / size
+        entropy = float(-np.sum(probabilities * np.log2(probabilities)))
         return entropy
 
-
+    # Precalculated data for speedup
     MIN_DATA_ENTROPY: Dict[int, float] = {
         16: 1.66973671780348,
         20: 2.07723544540831,
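The per-character `str.count` loop is replaced by a vectorized numpy computation, and the function now accepts bytes as well as str. A standalone check that the vectorized form agrees with the classic -Σ p·log2 p sum (a sketch, assuming numpy is available in the environment):

import math
import numpy as np

def entropy_numpy(data) -> float:
    # same shape as Util.get_shannon_entropy in 1.11.3
    if not data:
        return 0.
    _uniq, counts = np.unique(list(data), return_counts=True)
    probabilities = counts / len(data)
    return float(-np.sum(probabilities * np.log2(probabilities)))

def entropy_naive(data: str) -> float:
    return -sum(data.count(c) / len(data) * math.log2(data.count(c) / len(data)) for c in set(data))

sample = "AKIAIOSFODNN7EXAMPLE"
assert abs(entropy_numpy(sample) - entropy_naive(sample)) < 1e-9
print(entropy_numpy(sample), entropy_numpy(sample.encode()))  # identical for str and bytes input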
@@ -153,41 +150,39 @@ class Util:
         return entropy < min_entropy
 
     @staticmethod
-    def is_known(data: bytes) -> bool:
-        """
-
-
-
-
-
-                or Util.is_bzip2(data) \
-                or Util.is_com(data) \
-                or Util.is_pdf(data) \
-                or Util.is_elf(data):
-            return True
+    def is_known(data: Union[bytes, bytearray]) -> bool:
+        """Returns True if any known binary format is found to prevent extra scan a file without an extension."""
+        if isinstance(data, (bytes, bytearray)):
+            if 127 <= len(data) and data.startswith(b"\x7f\x45\x4c\x46"):
+                # https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
+                # minimal ELF is 127 bytes https://github.com/tchajed/minimal-elf
+                return True
         return False
 
     @staticmethod
-    def is_binary(data: bytes) -> bool:
+    def is_binary(data: Union[bytes, bytearray]) -> bool:
         """
-        Returns True when two zeroes sequence is found
-        UTF-32 is not supported
+        Returns True when two zeroes sequence is found in begin of data.
+        The sequence never exists in text format (UTF-8, UTF-16). UTF-32 is not supported.
         """
         if 0 <= data.find(b"\0\0", 0, MAX_LINE_LENGTH):
             return True
-
-
-
-
-
-
+        else:
+            return False
+
+    NOT_LATIN1_PRINTABLE_SET = (set(range(0,
+                                          256)).difference(set(x for x in string.printable.encode(ASCII))).difference(
+                                              set(x for x in range(0xA0, 0x100))))
+
+    @staticmethod
+    def is_latin1(data: Union[bytes, bytearray]) -> bool:
+        """Returns True when data looks like LATIN-1 for first MAX_LINE_LENGTH bytes."""
+        result = False
         if data:
+            non_latin1_cnt = sum(1 for x in data[:MAX_LINE_LENGTH] if x in Util.NOT_LATIN1_PRINTABLE_SET)
             # experiment for 255217 binary files shown avg = 0.268264 ± 0.168767, so let choose minimal
-            chunk_len =
-            result = 0.1
-        else:
-            # empty data case
-            result = False
+            chunk_len = min(MAX_LINE_LENGTH, len(data))
+            result = 0.1 > non_latin1_cnt / chunk_len
         return result
 
     @staticmethod
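The new `is_latin1` heuristic counts bytes that are neither ASCII-printable nor in the 0xA0-0xFF range and accepts the chunk only when fewer than 10% of its first MAX_LINE_LENGTH bytes fall into that set. A self-contained sketch of the same idea; the MAX_LINE_LENGTH value here is an assumption for the example.

import string

MAX_LINE_LENGTH = 8000  # assumed value for illustration
NOT_LATIN1_PRINTABLE_SET = set(range(256)) - set(string.printable.encode("ascii")) - set(range(0xA0, 0x100))

def looks_like_latin1(data: bytes) -> bool:
    if not data:
        return False
    chunk = data[:MAX_LINE_LENGTH]
    non_latin1_cnt = sum(1 for byte in chunk if byte in NOT_LATIN1_PRINTABLE_SET)
    return non_latin1_cnt / len(chunk) < 0.1

print(looks_like_latin1("naïve café résumé".encode("latin-1")))  # True: printable ASCII plus 0xA0-0xFF
print(looks_like_latin1(bytes(range(256)) * 4))                  # False: too many control bytes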
@@ -231,7 +226,7 @@ class Util:
         encodings = AVAILABLE_ENCODINGS
         for encoding in encodings:
             try:
-                if binary_suggest and LATIN_1 == encoding and (Util.
+                if binary_suggest and LATIN_1 == encoding and (Util.is_binary(content) or not Util.is_latin1(content)):
                     # LATIN_1 may convert data (bytes in range 0x80:0xFF are transformed)
                     # so skip this encoding when checking binaries
                     logger.warning("Binary file detected")
@@ -374,7 +369,7 @@ class Util:
                 line = change["line"]
                 if isinstance(line, str):
                     rows_data.extend(Util.preprocess_diff_rows(change.get("new"), change.get("old"), line))
-                elif isinstance(line, bytes):
+                elif isinstance(line, (bytes, bytearray)):
                     logger.warning("The feature is available with the deep scan option")
                 else:
                     logger.error(f"Unknown type of line {type(line)}")
@@ -382,9 +377,9 @@ class Util:
         return rows_data
 
     @staticmethod
-    def is_zip(data: bytes) -> bool:
+    def is_zip(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
-        if isinstance(data, bytes) and 3 < len(data):
+        if isinstance(data, (bytes, bytearray)) and 3 < len(data):
             # PK
             if data.startswith(b"PK"):
                 if 0x03 == data[2] and 0x04 == data[3]:
@@ -398,18 +393,18 @@ class Util:
         return False
 
     @staticmethod
-    def is_com(data: bytes) -> bool:
+    def is_com(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
-        if isinstance(data, bytes) and 8 < len(data):
+        if isinstance(data, (bytes, bytearray)) and 8 < len(data):
             if data.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"):
                 # Compound File Binary Format: doc, xls, ppt, msi, msg
                 return True
         return False
 
     @staticmethod
-    def is_tar(data: bytes) -> bool:
+    def is_tar(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
-        if isinstance(data, bytes) and 512 <= len(data):
+        if isinstance(data, (bytes, bytearray)) and 512 <= len(data):
             if 0x75 == data[257] and 0x73 == data[258] and 0x74 == data[259] \
                     and 0x61 == data[260] and 0x72 == data[261] and (
                         0x00 == data[262] and 0x30 == data[263] and 0x30 == data[264]
@@ -425,9 +420,9 @@ class Util:
         return False
 
     @staticmethod
-    def is_bzip2(data: bytes) -> bool:
+    def is_bzip2(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/Bzip2"""
-        if isinstance(data, bytes) and 10 <= len(data):
+        if isinstance(data, (bytes, bytearray)) and 10 <= len(data):
             if data.startswith(b"\x42\x5A\x68") \
                     and 0x31 <= data[3] <= 0x39 \
                     and 0x31 == data[4] and 0x41 == data[5] and 0x59 == data[6] \
@@ -436,34 +431,41 @@ class Util:
         return False
 
     @staticmethod
-    def is_gzip(data: bytes) -> bool:
+    def is_gzip(data: Union[bytes, bytearray]) -> bool:
         """According https://www.rfc-editor.org/rfc/rfc1952"""
-        if isinstance(data, bytes) and 3 <= len(data):
+        if isinstance(data, (bytes, bytearray)) and 3 <= len(data):
             if data.startswith(b"\x1F\x8B\x08"):
                 return True
         return False
 
     @staticmethod
-    def is_pdf(data: bytes) -> bool:
+    def is_pdf(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/List_of_file_signatures - pdf"""
-        if isinstance(data, bytes) and 5 <= len(data):
+        if isinstance(data, (bytes, bytearray)) and 5 <= len(data):
             if data.startswith(b"\x25\x50\x44\x46\x2D"):
                 return True
         return False
 
     @staticmethod
-    def is_jks(data: bytes) -> bool:
+    def is_jks(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/List_of_file_signatures - jks"""
-        if isinstance(data, bytes) and 4 <= len(data):
+        if isinstance(data, (bytes, bytearray)) and 4 <= len(data):
             if data.startswith(b"\xFE\xED\xFE\xED"):
                 return True
         return False
 
     @staticmethod
-    def
+    def is_lzma(data: Union[bytes, bytearray]) -> bool:
+        """According https://en.wikipedia.org/wiki/List_of_file_signatures - lzma also xz"""
+        if isinstance(data, (bytes, bytearray)) and 6 <= len(data):
+            if data.startswith(b"\xFD\x37\x7A\x58\x5A\x00") or data.startswith(b"\x5D\x00\x00"):
+                return True
+        return False
+
+    @staticmethod
+    def is_asn1(data: Union[bytes, bytearray]) -> bool:
         """Only sequence type 0x30 and size correctness is checked"""
-
-        if isinstance(data, bytes) and 4 <= data_length:
+        if isinstance(data, (bytes, bytearray)) and 4 <= len(data):
             # sequence
             if 0x30 == data[0]:
                 # https://www.oss.com/asn1/resources/asn1-made-simple/asn1-quick-reference/basic-encoding-rules.html#Lengths
@@ -471,7 +473,7 @@ class Util:
                 byte_len = (0x7F & length)
                 if 0x80 == length and data.endswith(b"\x00\x00"):
                     return True
-                elif 0x80 < length and 1 < byte_len <
+                elif 0x80 < length and 1 < byte_len < len(data):  # additional check
                     len_bytes = data[2:2 + byte_len]
                     try:
                         long_size = struct.unpack(">h", len_bytes)
@@ -482,26 +484,17 @@ class Util:
                         length = data[2]
                     else:
                         byte_len = 0
-            return
-        return False
-
-    @staticmethod
-    def is_elf(data: Union[bytes, bytearray]) -> bool:
-        """According to https://en.wikipedia.org/wiki/Executable_and_Linkable_Format use only 5 bytes"""
-        if isinstance(data, (bytes, bytearray)) and 127 <= len(data):
-            # minimal is 127 bytes https://github.com/tchajed/minimal-elf
-            if data.startswith(b"\x7f\x45\x4c\x46") and (0x01 == data[5] or 0x02 == data[5]):
-                return True
+            return len(data) == length + 2 + byte_len
         return False
 
     @staticmethod
     def is_html(data: Union[bytes, bytearray]) -> bool:
         """Used to detect html format. Suppose, invocation of is_xml() was True before."""
         if isinstance(data, (bytes, bytearray)):
-            for opening_tag, closing_tag in [(b"<html
-                                             (b"<
-                                             (b"<
-                                             (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
+            for opening_tag, closing_tag in [(b"<html", b"</html>"), (b"<body", b"</body>"), (b"<table", b"</table>"),
+                                             (b"<p>", b"</p>"), (b"<span>", b"</span>"), (b"<div>", b"</div>"),
+                                             (b"<li>", b"</li>"), (b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"),
+                                             (b"<th>", b"</th>"), (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
                 opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
                 if 0 <= opening_pos < data.find(closing_tag, opening_pos):
                     # opening and closing tags were found - suppose it is an HTML
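All the magic-byte helpers now also accept `bytearray`, the inlined ELF check in `is_known` replaces the removed `is_elf`, and the new `is_lzma` recognizes both the xz container and the LZMA-alone header, which backs the new `.lzma`/`.xz` container entries in config.json and the new lzma_scanner. A quick sanity check of those two signatures against the standard-library compressor (a sketch, not the library code):

import lzma

def is_lzma(data) -> bool:
    # same signatures as Util.is_lzma in 1.11.3: xz (FD 37 7A 58 5A 00) or LZMA-alone (5D 00 00)
    if isinstance(data, (bytes, bytearray)) and 6 <= len(data):
        return data.startswith(b"\xFD\x37\x7A\x58\x5A\x00") or data.startswith(b"\x5D\x00\x00")
    return False

xz_blob = lzma.compress(b"token=1f2e3d4c", format=lzma.FORMAT_XZ)
alone_blob = lzma.compress(b"token=1f2e3d4c", format=lzma.FORMAT_ALONE)
print(is_lzma(xz_blob), is_lzma(alone_blob), is_lzma(b"plain text"))  # True True False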
@@ -658,81 +651,11 @@ class Util:
         except Exception as exc:
             logging.error(f"Failed to write: {file_path} {exc}")
 
-    @staticmethod
-    def __extract_value(node: Any, value: Any) -> List[Any]:
-        result = []
-        for i in getattr(node, "targets"):
-            if hasattr(i, "id"):
-                result.append({getattr(i, "id"): value})
-            else:
-                logger.error(f"{str(i)} has no 'id'")
-        return result
-
-    @staticmethod
-    def __extract_assign(node: Any) -> List[Any]:
-        result = []
-        if hasattr(node, "value") and hasattr(node, "targets"):
-            value = getattr(node, "value")
-            if hasattr(value, "value"):
-                # python 3.8 - 3.10
-                result.extend(Util.__extract_value(node, getattr(value, "value")))
-            else:
-                logger.error(f"value.{value} has no 'value' {dir(value)}")
-        else:
-            logger.error(f"{str(node)} has no 'value' {dir(node)}")
-        return result
-
-    @staticmethod
-    def ast_to_dict(node: Any) -> List[Any]:
-        """Recursive parsing AST tree of python source to list with strings"""
-        result: List[Any] = []
-        if hasattr(node, "value") and isinstance(node.value, str):
-            result.append(node.value)
-
-        if isinstance(node, ast.Module) \
-                or isinstance(node, ast.FunctionDef):
-            if hasattr(node, "body"):
-                for i in node.body:
-                    x = Util.ast_to_dict(i)
-                    if x:
-                        result.extend(x)
-        elif isinstance(node, ast.Import):
-            logger.debug("Import:%s", str(node))
-        elif isinstance(node, ast.Assign):
-            result.extend(Util.__extract_assign(node))
-        elif isinstance(node, ast.Expr) \
-                or isinstance(node, ast.AnnAssign) \
-                or isinstance(node, ast.AugAssign) \
-                or isinstance(node, ast.Call) \
-                or isinstance(node, ast.JoinedStr) \
-                or isinstance(node, ast.Return) \
-                or isinstance(node, ast.ImportFrom) \
-                or isinstance(node, ast.Assert) \
-                or isinstance(node, ast.Pass) \
-                or isinstance(node, ast.Raise) \
-                or isinstance(node, ast.Str) \
-                or isinstance(node, ast.Name) \
-                or isinstance(node, ast.FormattedValue) \
-                or isinstance(node, ast.Global):
-            if hasattr(node, "value"):
-                result.extend(Util.ast_to_dict(getattr(node, "value")))
-            if hasattr(node, "args"):
-                for i in getattr(node, "args"):
-                    result.extend(Util.ast_to_dict(i))
-            if hasattr(node, "values"):
-                for i in getattr(node, "values"):
-                    result.extend(Util.ast_to_dict(i))
-            else:
-                logger.debug(f"skip:{str(node)}")
-        else:
-            logger.debug(f"unknown:{str(node)}")
-        return result
-
     @staticmethod
     def parse_python(source: str) -> List[Any]:
-        """Parse python source to
+        """Parse python source and back to remove strings merge and line wrap"""
         src = ast.parse(source)
-        result =
+        result = ast.unparse(src).splitlines()
         return result
 
     @staticmethod
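`parse_python` no longer walks the AST by hand; it round-trips the source through `ast.parse`/`ast.unparse`, which folds implicit string concatenation and backslash continuations back into single logical lines before scanning. A short illustration (requires Python 3.9+, where `ast.unparse` is available):

import ast

source = (
    "token = ('AKIA'\n"
    "         'IOSFODNN7EXAMPLE')\n"
    "url = 'https://user:' \\\n"
    "      'pass@example.com'\n"
)
for line in ast.unparse(ast.parse(source)).splitlines():
    print(line)
# adjacent literals and wrapped lines come back as single assignments, e.g.
#   token = 'AKIAIOSFODNN7EXAMPLE'
#   url = 'https://user:pass@example.com'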
|