credsweeper 1.11.2__py3-none-any.whl → 1.11.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- credsweeper/__init__.py +1 -1
- credsweeper/__main__.py +7 -5
- credsweeper/app.py +28 -47
- credsweeper/common/constants.py +2 -5
- credsweeper/common/keyword_pattern.py +15 -9
- credsweeper/common/morpheme_checklist.txt +4 -2
- credsweeper/credentials/candidate_key.py +1 -1
- credsweeper/credentials/credential_manager.py +4 -3
- credsweeper/credentials/line_data.py +16 -15
- credsweeper/deep_scanner/abstract_scanner.py +10 -1
- credsweeper/deep_scanner/deb_scanner.py +48 -0
- credsweeper/deep_scanner/deep_scanner.py +65 -43
- credsweeper/deep_scanner/docx_scanner.py +1 -1
- credsweeper/deep_scanner/encoder_scanner.py +2 -2
- credsweeper/deep_scanner/gzip_scanner.py +1 -1
- credsweeper/deep_scanner/html_scanner.py +3 -3
- credsweeper/deep_scanner/jks_scanner.py +2 -4
- credsweeper/deep_scanner/lang_scanner.py +2 -2
- credsweeper/deep_scanner/lzma_scanner.py +40 -0
- credsweeper/deep_scanner/pkcs12_scanner.py +3 -5
- credsweeper/deep_scanner/xml_scanner.py +2 -2
- credsweeper/file_handler/byte_content_provider.py +2 -2
- credsweeper/file_handler/content_provider.py +1 -1
- credsweeper/file_handler/data_content_provider.py +23 -14
- credsweeper/file_handler/diff_content_provider.py +2 -2
- credsweeper/file_handler/file_path_extractor.py +1 -1
- credsweeper/file_handler/files_provider.py +2 -4
- credsweeper/file_handler/patches_provider.py +1 -1
- credsweeper/file_handler/string_content_provider.py +2 -2
- credsweeper/file_handler/struct_content_provider.py +1 -1
- credsweeper/file_handler/text_content_provider.py +2 -2
- credsweeper/filters/value_array_dictionary_check.py +3 -1
- credsweeper/filters/value_azure_token_check.py +1 -2
- credsweeper/filters/value_base64_encoded_pem_check.py +1 -1
- credsweeper/filters/value_base64_part_check.py +30 -21
- credsweeper/filters/value_discord_bot_check.py +1 -2
- credsweeper/filters/value_entropy_base32_check.py +11 -31
- credsweeper/filters/value_entropy_base36_check.py +11 -34
- credsweeper/filters/value_entropy_base64_check.py +15 -48
- credsweeper/filters/value_entropy_base_check.py +37 -0
- credsweeper/filters/value_file_path_check.py +1 -1
- credsweeper/filters/value_hex_number_check.py +3 -3
- credsweeper/filters/value_json_web_token_check.py +4 -5
- credsweeper/filters/value_pattern_check.py +64 -16
- credsweeper/filters/value_string_type_check.py +11 -3
- credsweeper/filters/value_token_base32_check.py +0 -4
- credsweeper/filters/value_token_base36_check.py +0 -4
- credsweeper/filters/value_token_base64_check.py +0 -4
- credsweeper/filters/value_token_check.py +1 -1
- credsweeper/ml_model/features/file_extension.py +2 -2
- credsweeper/ml_model/features/morpheme_dense.py +0 -4
- credsweeper/ml_model/features/rule_name.py +1 -1
- credsweeper/ml_model/features/word_in_path.py +0 -9
- credsweeper/ml_model/features/word_in_postamble.py +0 -11
- credsweeper/ml_model/features/word_in_preamble.py +0 -11
- credsweeper/ml_model/features/word_in_transition.py +0 -11
- credsweeper/ml_model/features/word_in_value.py +0 -11
- credsweeper/ml_model/features/word_in_variable.py +0 -11
- credsweeper/ml_model/ml_validator.py +45 -22
- credsweeper/rules/config.yaml +238 -208
- credsweeper/rules/rule.py +3 -3
- credsweeper/scanner/scan_type/scan_type.py +2 -3
- credsweeper/scanner/scanner.py +7 -1
- credsweeper/secret/config.json +16 -5
- credsweeper/utils/hop_stat.py +3 -3
- credsweeper/utils/pem_key_detector.py +8 -7
- credsweeper/utils/util.py +76 -146
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/METADATA +1 -1
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/RECORD +72 -70
- credsweeper/utils/entropy_validator.py +0 -72
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/WHEEL +0 -0
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/licenses/LICENSE +0 -0
credsweeper/rules/rule.py
CHANGED
|
@@ -179,7 +179,6 @@ class Rule:
|
|
|
179
179
|
for value in _values:
|
|
180
180
|
_pattern = KeywordPattern.get_keyword_pattern(value)
|
|
181
181
|
_patterns.append(_pattern)
|
|
182
|
-
return _patterns
|
|
183
182
|
elif RuleType.MULTI == self.rule_type and 2 == len(_values) \
|
|
184
183
|
or self.rule_type in (RuleType.PATTERN, RuleType.PEM_KEY) and 0 < len(_values):
|
|
185
184
|
for value in _values:
|
|
@@ -188,8 +187,9 @@ class Rule:
|
|
|
188
187
|
logger.warning(f"Rule {self.rule_name} has extra patterns. Only single pattern supported.")
|
|
189
188
|
elif RuleType.MULTI == self.rule_type and 2 < len(_values):
|
|
190
189
|
logger.warning(f"Rule {self.rule_name} has extra patterns. Only two patterns supported.")
|
|
191
|
-
|
|
192
|
-
|
|
190
|
+
else:
|
|
191
|
+
raise ValueError(f"Malformed rule config file. Rule '{self.rule_name}' type '{self.rule_type}' is invalid.")
|
|
192
|
+
return _patterns
|
|
193
193
|
|
|
194
194
|
@cached_property
|
|
195
195
|
def patterns(self) -> List[re.Pattern]:
|
credsweeper/scanner/scan_type/scan_type.py
CHANGED
|
@@ -38,13 +38,12 @@ class ScanType(ABC):
|
|
|
38
38
|
raise NotImplementedError()
|
|
39
39
|
|
|
40
40
|
@classmethod
|
|
41
|
-
def filtering(cls,
|
|
41
|
+
def filtering(cls, target: AnalysisTarget, line_data: LineData, filters: List[Filter]) -> bool:
|
|
42
42
|
"""Check if line data should be removed based on filters.
|
|
43
43
|
|
|
44
44
|
If `use_filters` option is false, always return False
|
|
45
45
|
|
|
46
46
|
Args:
|
|
47
|
-
config: dict of credsweeper configuration
|
|
48
47
|
target: AnalysisTarget from which `line_data` was obtained
|
|
49
48
|
line_data: Line data to check with `filters`
|
|
50
49
|
filters: Filters to use
|
|
@@ -112,7 +111,7 @@ class ScanType(ABC):
|
|
|
112
111
|
bypass_start = line_data.value_end
|
|
113
112
|
bypass_end = offset_end
|
|
114
113
|
|
|
115
|
-
if config.use_filters and cls.filtering(
|
|
114
|
+
if config.use_filters and cls.filtering(target, line_data, filters):
|
|
116
115
|
if line_data.variable and 0 <= line_data.variable_start < line_data.variable_end:
|
|
117
116
|
# may be next matched item will be not filtered - let search it after variable
|
|
118
117
|
bypass_start = line_data.variable_end
|
credsweeper/scanner/scanner.py
CHANGED
|
@@ -146,7 +146,13 @@ class Scanner:
|
|
|
146
146
|
# "cache" - YAPF and pycharm formatters ...
|
|
147
147
|
matched_keyword = \
|
|
148
148
|
target_line_stripped_len >= self.min_keyword_len and ( #
|
|
149
|
-
'=' in target_line_stripped
|
|
149
|
+
'=' in target_line_stripped
|
|
150
|
+
or ':' in target_line_stripped
|
|
151
|
+
or "set" in target_line_stripped
|
|
152
|
+
or "#define" in target_line_stripped
|
|
153
|
+
or "%define" in target_line_stripped
|
|
154
|
+
or "%global" in target_line_stripped
|
|
155
|
+
) #
|
|
150
156
|
matched_pem_key = \
|
|
151
157
|
target_line_stripped_len >= self.min_pem_key_len \
|
|
152
158
|
and PEM_BEGIN_PATTERN in target_line_stripped and "PRIVATE" in target_line_stripped
|
credsweeper/secret/config.json
CHANGED
|
@@ -2,10 +2,13 @@
|
|
|
2
2
|
"exclude": {
|
|
3
3
|
"pattern": [],
|
|
4
4
|
"containers": [
|
|
5
|
+
".aar",
|
|
5
6
|
".apk",
|
|
6
7
|
".bz2",
|
|
7
8
|
".gz",
|
|
9
|
+
".lzma",
|
|
8
10
|
".tar",
|
|
11
|
+
".xz",
|
|
9
12
|
".zip"
|
|
10
13
|
],
|
|
11
14
|
"documents": [
|
|
@@ -20,17 +23,20 @@
|
|
|
20
23
|
],
|
|
21
24
|
"extension": [
|
|
22
25
|
".7z",
|
|
26
|
+
".a",
|
|
23
27
|
".aac",
|
|
24
|
-
".aar",
|
|
25
28
|
".avi",
|
|
29
|
+
".bin",
|
|
26
30
|
".bmp",
|
|
27
31
|
".class",
|
|
28
32
|
".css",
|
|
29
33
|
".dmg",
|
|
30
34
|
".ear",
|
|
31
35
|
".eot",
|
|
36
|
+
".elf",
|
|
32
37
|
".exe",
|
|
33
38
|
".gif",
|
|
39
|
+
".gmo",
|
|
34
40
|
".ico",
|
|
35
41
|
".img",
|
|
36
42
|
".info",
|
|
@@ -45,6 +51,7 @@
|
|
|
45
51
|
".mp4",
|
|
46
52
|
".npy",
|
|
47
53
|
".npz",
|
|
54
|
+
".obj",
|
|
48
55
|
".ogg",
|
|
49
56
|
".pak",
|
|
50
57
|
".png",
|
|
@@ -52,10 +59,13 @@
|
|
|
52
59
|
".pyc",
|
|
53
60
|
".pyd",
|
|
54
61
|
".pyo",
|
|
62
|
+
".rar",
|
|
55
63
|
".rc",
|
|
56
64
|
".rc2",
|
|
57
65
|
".rar",
|
|
58
66
|
".realm",
|
|
67
|
+
".res",
|
|
68
|
+
".rpm",
|
|
59
69
|
".s7z",
|
|
60
70
|
".scss",
|
|
61
71
|
".so",
|
|
@@ -70,6 +80,7 @@
|
|
|
70
80
|
".wav",
|
|
71
81
|
".webm",
|
|
72
82
|
".webp",
|
|
83
|
+
".wma",
|
|
73
84
|
".woff",
|
|
74
85
|
".yuv"
|
|
75
86
|
],
|
|
@@ -160,13 +171,13 @@
|
|
|
160
171
|
"line_num",
|
|
161
172
|
"path",
|
|
162
173
|
"info",
|
|
163
|
-
"value",
|
|
164
|
-
"value_start",
|
|
165
|
-
"value_end",
|
|
166
174
|
"variable",
|
|
167
175
|
"variable_start",
|
|
168
176
|
"variable_end",
|
|
169
|
-
"
|
|
177
|
+
"value",
|
|
178
|
+
"value_start",
|
|
179
|
+
"value_end",
|
|
180
|
+
"entropy"
|
|
170
181
|
],
|
|
171
182
|
"candidate_output": [
|
|
172
183
|
"rule",
|
credsweeper/utils/hop_stat.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import statistics
|
|
2
|
-
from typing import Tuple
|
|
2
|
+
from typing import Tuple, Dict
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class HopStat:
|
|
@@ -62,7 +62,7 @@ class HopStat:
|
|
|
62
62
|
})
|
|
63
63
|
|
|
64
64
|
def __init__(self):
|
|
65
|
-
self.__hop_dict =
|
|
65
|
+
self.__hop_dict: Dict[Tuple[str, str], int] = {}
|
|
66
66
|
base = ''.join(x for x in HopStat.KEYBOARD)
|
|
67
67
|
for a in (x for x in base if '\0' != x):
|
|
68
68
|
for b in (x for x in base if '\0' != x):
|
|
@@ -81,7 +81,7 @@ class HopStat:
|
|
|
81
81
|
def __get_xyz(c: str) -> Tuple[int, int, int]:
|
|
82
82
|
"""Returns axial coordinates of a char on keyboad qwerty"""
|
|
83
83
|
x = y = z = 0
|
|
84
|
-
for i in
|
|
84
|
+
for i, _ in enumerate(HopStat.KEYBOARD):
|
|
85
85
|
x = HopStat.KEYBOARD[i].find(c)
|
|
86
86
|
if 0 <= x:
|
|
87
87
|
z = i
|
credsweeper/utils/pem_key_detector.py
CHANGED
|
@@ -4,15 +4,16 @@ import re
|
|
|
4
4
|
import string
|
|
5
5
|
from typing import List
|
|
6
6
|
|
|
7
|
-
from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN
|
|
7
|
+
from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN
|
|
8
8
|
from credsweeper.config import Config
|
|
9
9
|
from credsweeper.credentials import LineData
|
|
10
10
|
from credsweeper.file_handler.analysis_target import AnalysisTarget
|
|
11
11
|
from credsweeper.utils import Util
|
|
12
|
-
from credsweeper.utils.entropy_validator import EntropyValidator
|
|
13
12
|
|
|
14
13
|
logger = logging.getLogger(__name__)
|
|
15
14
|
|
|
15
|
+
ENTROPY_LIMIT_BASE64 = 4.5
|
|
16
|
+
|
|
16
17
|
|
|
17
18
|
class PemKeyDetector:
|
|
18
19
|
"""Class to detect PEM PRIVATE keys only"""
|
|
@@ -65,13 +66,13 @@ class PemKeyDetector:
|
|
|
65
66
|
if PEM_BEGIN_PATTERN in subline:
|
|
66
67
|
begin_pattern_not_passed = False
|
|
67
68
|
continue
|
|
68
|
-
|
|
69
|
+
if PEM_END_PATTERN in subline:
|
|
69
70
|
if "PGP" in target.line_strip:
|
|
70
71
|
# Check if entropy is high enough for base64 set with padding sign
|
|
71
|
-
|
|
72
|
-
if
|
|
72
|
+
entropy = Util.get_shannon_entropy(key_data)
|
|
73
|
+
if ENTROPY_LIMIT_BASE64 <= entropy:
|
|
73
74
|
return line_data
|
|
74
|
-
logger.debug("Filtered with entropy %f '%s'",
|
|
75
|
+
logger.debug("Filtered with entropy %f '%s'", entropy, key_data)
|
|
75
76
|
if "OPENSSH" in target.line_strip:
|
|
76
77
|
# Check whether the key is encrypted
|
|
77
78
|
with contextlib.suppress(Exception):
|
|
@@ -125,7 +126,7 @@ class PemKeyDetector:
|
|
|
125
126
|
line = line.strip(string.whitespace)
|
|
126
127
|
if line.startswith("//"):
|
|
127
128
|
# simplify first condition for speed-up of doxygen style processing
|
|
128
|
-
if line.startswith("// "
|
|
129
|
+
if line.startswith(("// ", "/// ")):
|
|
129
130
|
# Assume that the commented line is to be separated from base64 code, it may be a part of PEM, otherwise
|
|
130
131
|
line = line[3:]
|
|
131
132
|
if line.startswith("/*"):
|
credsweeper/utils/util.py
CHANGED
|
@@ -12,13 +12,14 @@ from dataclasses import dataclass
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import Any, Dict, List, Tuple, Optional, Union
|
|
14
14
|
|
|
15
|
+
import numpy as np
|
|
15
16
|
import whatthepatch
|
|
16
17
|
import yaml
|
|
17
18
|
from lxml import etree
|
|
18
19
|
from typing_extensions import TypedDict
|
|
19
20
|
|
|
20
21
|
from credsweeper.common.constants import DiffRowType, AVAILABLE_ENCODINGS, \
|
|
21
|
-
DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE
|
|
22
|
+
DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, ASCII
|
|
22
23
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
@@ -65,21 +66,17 @@ class Util:
|
|
|
65
66
|
return result
|
|
66
67
|
|
|
67
68
|
@staticmethod
|
|
68
|
-
def get_shannon_entropy(data: str,
|
|
69
|
+
def get_shannon_entropy(data: Union[str, bytes]) -> float:
|
|
69
70
|
"""Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html."""
|
|
70
71
|
if not data:
|
|
71
|
-
return 0
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
p_x = data.count(x) / data_len
|
|
77
|
-
if p_x > 0:
|
|
78
|
-
entropy += -p_x * math.log(p_x, 2)
|
|
79
|
-
|
|
72
|
+
return 0.
|
|
73
|
+
size = len(data)
|
|
74
|
+
_uniq, counts = np.unique(list(data), return_counts=True)
|
|
75
|
+
probabilities = counts / size
|
|
76
|
+
entropy = float(-np.sum(probabilities * np.log2(probabilities)))
|
|
80
77
|
return entropy
|
|
81
78
|
|
|
82
|
-
|
|
79
|
+
# Precalculated data for speedup
|
|
83
80
|
MIN_DATA_ENTROPY: Dict[int, float] = {
|
|
84
81
|
16: 1.66973671780348,
|
|
85
82
|
20: 2.07723544540831,
|
|
@@ -153,41 +150,39 @@ class Util:
|
|
|
153
150
|
return entropy < min_entropy
|
|
154
151
|
|
|
155
152
|
@staticmethod
|
|
156
|
-
def is_known(data: bytes) -> bool:
|
|
157
|
-
"""
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
or Util.is_bzip2(data) \
|
|
164
|
-
or Util.is_com(data) \
|
|
165
|
-
or Util.is_pdf(data) \
|
|
166
|
-
or Util.is_elf(data):
|
|
167
|
-
return True
|
|
153
|
+
def is_known(data: Union[bytes, bytearray]) -> bool:
|
|
154
|
+
"""Returns True if any known binary format is found to prevent extra scan a file without an extension."""
|
|
155
|
+
if isinstance(data, (bytes, bytearray)):
|
|
156
|
+
if 127 <= len(data) and data.startswith(b"\x7f\x45\x4c\x46"):
|
|
157
|
+
# https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
|
|
158
|
+
# minimal ELF is 127 bytes https://github.com/tchajed/minimal-elf
|
|
159
|
+
return True
|
|
168
160
|
return False
|
|
169
161
|
|
|
170
162
|
@staticmethod
|
|
171
|
-
def is_binary(data: bytes) -> bool:
|
|
163
|
+
def is_binary(data: Union[bytes, bytearray]) -> bool:
|
|
172
164
|
"""
|
|
173
|
-
Returns True when two zeroes sequence is found
|
|
174
|
-
UTF-32 is not supported
|
|
165
|
+
Returns True when two zeroes sequence is found in begin of data.
|
|
166
|
+
The sequence never exists in text format (UTF-8, UTF-16). UTF-32 is not supported.
|
|
175
167
|
"""
|
|
176
168
|
if 0 <= data.find(b"\0\0", 0, MAX_LINE_LENGTH):
|
|
177
169
|
return True
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
170
|
+
else:
|
|
171
|
+
return False
|
|
172
|
+
|
|
173
|
+
NOT_LATIN1_PRINTABLE_SET = set(range(0, 256)) \
|
|
174
|
+
.difference(set(x for x in string.printable.encode(ASCII))) \
|
|
175
|
+
.difference(set(x for x in range(0xA0, 0x100)))
|
|
176
|
+
|
|
177
|
+
@staticmethod
|
|
178
|
+
def is_latin1(data: Union[bytes, bytearray]) -> bool:
|
|
179
|
+
"""Returns True when data looks like LATIN-1 for first MAX_LINE_LENGTH bytes."""
|
|
180
|
+
result = False
|
|
184
181
|
if data:
|
|
182
|
+
non_latin1_cnt = sum(1 for x in data[:MAX_LINE_LENGTH] if x in Util.NOT_LATIN1_PRINTABLE_SET)
|
|
185
183
|
# experiment for 255217 binary files shown avg = 0.268264 ± 0.168767, so let choose minimal
|
|
186
|
-
chunk_len =
|
|
187
|
-
result = 0.1
|
|
188
|
-
else:
|
|
189
|
-
# empty data case
|
|
190
|
-
result = False
|
|
184
|
+
chunk_len = min(MAX_LINE_LENGTH, len(data))
|
|
185
|
+
result = 0.1 > non_latin1_cnt / chunk_len
|
|
191
186
|
return result
|
|
192
187
|
|
|
193
188
|
@staticmethod
|
|
@@ -231,10 +226,10 @@ class Util:
|
|
|
231
226
|
encodings = AVAILABLE_ENCODINGS
|
|
232
227
|
for encoding in encodings:
|
|
233
228
|
try:
|
|
234
|
-
if binary_suggest and LATIN_1 == encoding and (Util.
|
|
229
|
+
if binary_suggest and LATIN_1 == encoding and (Util.is_binary(content) or not Util.is_latin1(content)):
|
|
235
230
|
# LATIN_1 may convert data (bytes in range 0x80:0xFF are transformed)
|
|
236
231
|
# so skip this encoding when checking binaries
|
|
237
|
-
logger.warning("Binary file detected")
|
|
232
|
+
logger.warning("Binary file detected %s", repr(content[:8]))
|
|
238
233
|
break
|
|
239
234
|
text = content.decode(encoding, errors="strict")
|
|
240
235
|
if content != text.encode(encoding, errors="strict"):
|
|
@@ -374,7 +369,7 @@ class Util:
|
|
|
374
369
|
line = change["line"]
|
|
375
370
|
if isinstance(line, str):
|
|
376
371
|
rows_data.extend(Util.preprocess_diff_rows(change.get("new"), change.get("old"), line))
|
|
377
|
-
elif isinstance(line, bytes):
|
|
372
|
+
elif isinstance(line, (bytes, bytearray)):
|
|
378
373
|
logger.warning("The feature is available with the deep scan option")
|
|
379
374
|
else:
|
|
380
375
|
logger.error(f"Unknown type of line {type(line)}")
|
|
@@ -382,9 +377,9 @@ class Util:
|
|
|
382
377
|
return rows_data
|
|
383
378
|
|
|
384
379
|
@staticmethod
|
|
385
|
-
def is_zip(data: bytes) -> bool:
|
|
380
|
+
def is_zip(data: Union[bytes, bytearray]) -> bool:
|
|
386
381
|
"""According https://en.wikipedia.org/wiki/List_of_file_signatures"""
|
|
387
|
-
if isinstance(data, bytes) and 3 < len(data):
|
|
382
|
+
if isinstance(data, (bytes, bytearray)) and 3 < len(data):
|
|
388
383
|
# PK
|
|
389
384
|
if data.startswith(b"PK"):
|
|
390
385
|
if 0x03 == data[2] and 0x04 == data[3]:
|
|
@@ -398,18 +393,18 @@ class Util:
|
|
|
398
393
|
return False
|
|
399
394
|
|
|
400
395
|
@staticmethod
|
|
401
|
-
def is_com(data: bytes) -> bool:
|
|
396
|
+
def is_com(data: Union[bytes, bytearray]) -> bool:
|
|
402
397
|
"""According https://en.wikipedia.org/wiki/List_of_file_signatures"""
|
|
403
|
-
if isinstance(data, bytes) and 8 < len(data):
|
|
398
|
+
if isinstance(data, (bytes, bytearray)) and 8 < len(data):
|
|
404
399
|
if data.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"):
|
|
405
400
|
# Compound File Binary Format: doc, xls, ppt, msi, msg
|
|
406
401
|
return True
|
|
407
402
|
return False
|
|
408
403
|
|
|
409
404
|
@staticmethod
|
|
410
|
-
def is_tar(data: bytes) -> bool:
|
|
405
|
+
def is_tar(data: Union[bytes, bytearray]) -> bool:
|
|
411
406
|
"""According https://en.wikipedia.org/wiki/List_of_file_signatures"""
|
|
412
|
-
if isinstance(data, bytes) and 512 <= len(data):
|
|
407
|
+
if isinstance(data, (bytes, bytearray)) and 512 <= len(data):
|
|
413
408
|
if 0x75 == data[257] and 0x73 == data[258] and 0x74 == data[259] \
|
|
414
409
|
and 0x61 == data[260] and 0x72 == data[261] and (
|
|
415
410
|
0x00 == data[262] and 0x30 == data[263] and 0x30 == data[264]
|
|
@@ -425,9 +420,16 @@ class Util:
|
|
|
425
420
|
return False
|
|
426
421
|
|
|
427
422
|
@staticmethod
|
|
428
|
-
def
|
|
423
|
+
def is_deb(data: Union[bytes, bytearray]) -> bool:
|
|
424
|
+
"""According https://en.wikipedia.org/wiki/Deb_(file_format)"""
|
|
425
|
+
if isinstance(data, (bytes, bytearray)) and 512 <= len(data) and data.startswith(b"!<arch>\n"):
|
|
426
|
+
return True
|
|
427
|
+
return False
|
|
428
|
+
|
|
429
|
+
@staticmethod
|
|
430
|
+
def is_bzip2(data: Union[bytes, bytearray]) -> bool:
|
|
429
431
|
"""According https://en.wikipedia.org/wiki/Bzip2"""
|
|
430
|
-
if isinstance(data, bytes) and 10 <= len(data):
|
|
432
|
+
if isinstance(data, (bytes, bytearray)) and 10 <= len(data):
|
|
431
433
|
if data.startswith(b"\x42\x5A\x68") \
|
|
432
434
|
and 0x31 <= data[3] <= 0x39 \
|
|
433
435
|
and 0x31 == data[4] and 0x41 == data[5] and 0x59 == data[6] \
|
|
@@ -436,42 +438,49 @@ class Util:
|
|
|
436
438
|
return False
|
|
437
439
|
|
|
438
440
|
@staticmethod
|
|
439
|
-
def is_gzip(data: bytes) -> bool:
|
|
441
|
+
def is_gzip(data: Union[bytes, bytearray]) -> bool:
|
|
440
442
|
"""According https://www.rfc-editor.org/rfc/rfc1952"""
|
|
441
|
-
if isinstance(data, bytes) and 3 <= len(data):
|
|
443
|
+
if isinstance(data, (bytes, bytearray)) and 3 <= len(data):
|
|
442
444
|
if data.startswith(b"\x1F\x8B\x08"):
|
|
443
445
|
return True
|
|
444
446
|
return False
|
|
445
447
|
|
|
446
448
|
@staticmethod
|
|
447
|
-
def is_pdf(data: bytes) -> bool:
|
|
449
|
+
def is_pdf(data: Union[bytes, bytearray]) -> bool:
|
|
448
450
|
"""According https://en.wikipedia.org/wiki/List_of_file_signatures - pdf"""
|
|
449
|
-
if isinstance(data, bytes) and 5 <= len(data):
|
|
451
|
+
if isinstance(data, (bytes, bytearray)) and 5 <= len(data):
|
|
450
452
|
if data.startswith(b"\x25\x50\x44\x46\x2D"):
|
|
451
453
|
return True
|
|
452
454
|
return False
|
|
453
455
|
|
|
454
456
|
@staticmethod
|
|
455
|
-
def is_jks(data: bytes) -> bool:
|
|
457
|
+
def is_jks(data: Union[bytes, bytearray]) -> bool:
|
|
456
458
|
"""According https://en.wikipedia.org/wiki/List_of_file_signatures - jks"""
|
|
457
|
-
if isinstance(data, bytes) and 4 <= len(data):
|
|
459
|
+
if isinstance(data, (bytes, bytearray)) and 4 <= len(data):
|
|
458
460
|
if data.startswith(b"\xFE\xED\xFE\xED"):
|
|
459
461
|
return True
|
|
460
462
|
return False
|
|
461
463
|
|
|
462
464
|
@staticmethod
|
|
463
|
-
def
|
|
465
|
+
def is_lzma(data: Union[bytes, bytearray]) -> bool:
|
|
466
|
+
"""According https://en.wikipedia.org/wiki/List_of_file_signatures - lzma also xz"""
|
|
467
|
+
if isinstance(data, (bytes, bytearray)) and 6 <= len(data):
|
|
468
|
+
if data.startswith((b"\xFD\x37\x7A\x58\x5A\x00", b"\x5D\x00\x00")):
|
|
469
|
+
return True
|
|
470
|
+
return False
|
|
471
|
+
|
|
472
|
+
@staticmethod
|
|
473
|
+
def is_asn1(data: Union[bytes, bytearray]) -> bool:
|
|
464
474
|
"""Only sequence type 0x30 and size correctness is checked"""
|
|
465
|
-
|
|
466
|
-
if isinstance(data, bytes) and 4 <= data_length:
|
|
475
|
+
if isinstance(data, (bytes, bytearray)) and 4 <= len(data):
|
|
467
476
|
# sequence
|
|
468
477
|
if 0x30 == data[0]:
|
|
469
478
|
# https://www.oss.com/asn1/resources/asn1-made-simple/asn1-quick-reference/basic-encoding-rules.html#Lengths
|
|
470
479
|
length = data[1]
|
|
471
|
-
byte_len =
|
|
480
|
+
byte_len = 0x7F & length
|
|
472
481
|
if 0x80 == length and data.endswith(b"\x00\x00"):
|
|
473
482
|
return True
|
|
474
|
-
elif 0x80 < length and 1 < byte_len <
|
|
483
|
+
elif 0x80 < length and 1 < byte_len < len(data): # additional check
|
|
475
484
|
len_bytes = data[2:2 + byte_len]
|
|
476
485
|
try:
|
|
477
486
|
long_size = struct.unpack(">h", len_bytes)
|
|
@@ -482,26 +491,17 @@ class Util:
|
|
|
482
491
|
length = data[2]
|
|
483
492
|
else:
|
|
484
493
|
byte_len = 0
|
|
485
|
-
return
|
|
486
|
-
return False
|
|
487
|
-
|
|
488
|
-
@staticmethod
|
|
489
|
-
def is_elf(data: Union[bytes, bytearray]) -> bool:
|
|
490
|
-
"""According to https://en.wikipedia.org/wiki/Executable_and_Linkable_Format use only 5 bytes"""
|
|
491
|
-
if isinstance(data, (bytes, bytearray)) and 127 <= len(data):
|
|
492
|
-
# minimal is 127 bytes https://github.com/tchajed/minimal-elf
|
|
493
|
-
if data.startswith(b"\x7f\x45\x4c\x46") and (0x01 == data[5] or 0x02 == data[5]):
|
|
494
|
-
return True
|
|
494
|
+
return len(data) == length + 2 + byte_len
|
|
495
495
|
return False
|
|
496
496
|
|
|
497
497
|
@staticmethod
|
|
498
498
|
def is_html(data: Union[bytes, bytearray]) -> bool:
|
|
499
499
|
"""Used to detect html format. Suppose, invocation of is_xml() was True before."""
|
|
500
500
|
if isinstance(data, (bytes, bytearray)):
|
|
501
|
-
for opening_tag, closing_tag in [(b"<html
|
|
502
|
-
(b"<
|
|
503
|
-
(b"<
|
|
504
|
-
(b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
|
|
501
|
+
for opening_tag, closing_tag in [(b"<html", b"</html>"), (b"<body", b"</body>"), (b"<table", b"</table>"),
|
|
502
|
+
(b"<p>", b"</p>"), (b"<span>", b"</span>"), (b"<div>", b"</div>"),
|
|
503
|
+
(b"<li>", b"</li>"), (b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"),
|
|
504
|
+
(b"<th>", b"</th>"), (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
|
|
505
505
|
opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
|
|
506
506
|
if 0 <= opening_pos < data.find(closing_tag, opening_pos):
|
|
507
507
|
# opening and closing tags were found - suppose it is an HTML
|
|
@@ -658,81 +658,11 @@ class Util:
|
|
|
658
658
|
except Exception as exc:
|
|
659
659
|
logging.error(f"Failed to write: {file_path} {exc}")
|
|
660
660
|
|
|
661
|
-
@staticmethod
|
|
662
|
-
def __extract_value(node: Any, value: Any) -> List[Any]:
|
|
663
|
-
result = []
|
|
664
|
-
for i in getattr(node, "targets"):
|
|
665
|
-
if hasattr(i, "id"):
|
|
666
|
-
result.append({getattr(i, "id"): value})
|
|
667
|
-
else:
|
|
668
|
-
logger.error(f"{str(i)} has no 'id'")
|
|
669
|
-
return result
|
|
670
|
-
|
|
671
|
-
@staticmethod
|
|
672
|
-
def __extract_assign(node: Any) -> List[Any]:
|
|
673
|
-
result = []
|
|
674
|
-
if hasattr(node, "value") and hasattr(node, "targets"):
|
|
675
|
-
value = getattr(node, "value")
|
|
676
|
-
if hasattr(value, "value"):
|
|
677
|
-
# python 3.8 - 3.10
|
|
678
|
-
result.extend(Util.__extract_value(node, getattr(value, "value")))
|
|
679
|
-
else:
|
|
680
|
-
logger.error(f"value.{value} has no 'value' {dir(value)}")
|
|
681
|
-
else:
|
|
682
|
-
logger.error(f"{str(node)} has no 'value' {dir(node)}")
|
|
683
|
-
return result
|
|
684
|
-
|
|
685
|
-
@staticmethod
|
|
686
|
-
def ast_to_dict(node: Any) -> List[Any]:
|
|
687
|
-
"""Recursive parsing AST tree of python source to list with strings"""
|
|
688
|
-
result: List[Any] = []
|
|
689
|
-
if hasattr(node, "value") and isinstance(node.value, str):
|
|
690
|
-
result.append(node.value)
|
|
691
|
-
|
|
692
|
-
if isinstance(node, ast.Module) \
|
|
693
|
-
or isinstance(node, ast.FunctionDef):
|
|
694
|
-
if hasattr(node, "body"):
|
|
695
|
-
for i in node.body:
|
|
696
|
-
x = Util.ast_to_dict(i)
|
|
697
|
-
if x:
|
|
698
|
-
result.extend(x)
|
|
699
|
-
elif isinstance(node, ast.Import):
|
|
700
|
-
logger.debug("Import:%s", str(node))
|
|
701
|
-
elif isinstance(node, ast.Assign):
|
|
702
|
-
result.extend(Util.__extract_assign(node))
|
|
703
|
-
elif isinstance(node, ast.Expr) \
|
|
704
|
-
or isinstance(node, ast.AnnAssign) \
|
|
705
|
-
or isinstance(node, ast.AugAssign) \
|
|
706
|
-
or isinstance(node, ast.Call) \
|
|
707
|
-
or isinstance(node, ast.JoinedStr) \
|
|
708
|
-
or isinstance(node, ast.Return) \
|
|
709
|
-
or isinstance(node, ast.ImportFrom) \
|
|
710
|
-
or isinstance(node, ast.Assert) \
|
|
711
|
-
or isinstance(node, ast.Pass) \
|
|
712
|
-
or isinstance(node, ast.Raise) \
|
|
713
|
-
or isinstance(node, ast.Str) \
|
|
714
|
-
or isinstance(node, ast.Name) \
|
|
715
|
-
or isinstance(node, ast.FormattedValue) \
|
|
716
|
-
or isinstance(node, ast.Global):
|
|
717
|
-
if hasattr(node, "value"):
|
|
718
|
-
result.extend(Util.ast_to_dict(getattr(node, "value")))
|
|
719
|
-
if hasattr(node, "args"):
|
|
720
|
-
for i in getattr(node, "args"):
|
|
721
|
-
result.extend(Util.ast_to_dict(i))
|
|
722
|
-
if hasattr(node, "values"):
|
|
723
|
-
for i in getattr(node, "values"):
|
|
724
|
-
result.extend(Util.ast_to_dict(i))
|
|
725
|
-
else:
|
|
726
|
-
logger.debug(f"skip:{str(node)}")
|
|
727
|
-
else:
|
|
728
|
-
logger.debug(f"unknown:{str(node)}")
|
|
729
|
-
return result
|
|
730
|
-
|
|
731
661
|
@staticmethod
|
|
732
662
|
def parse_python(source: str) -> List[Any]:
|
|
733
|
-
"""Parse python source to
|
|
663
|
+
"""Parse python source and back to remove strings merge and line wrap"""
|
|
734
664
|
src = ast.parse(source)
|
|
735
|
-
result =
|
|
665
|
+
result = ast.unparse(src).splitlines()
|
|
736
666
|
return result
|
|
737
667
|
|
|
738
668
|
@staticmethod
|