yaralyzer 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .yaralyzer.example +65 -0
- CHANGELOG.md +128 -0
- LICENSE +674 -0
- yaralyzer/__init__.py +76 -0
- yaralyzer/bytes_match.py +276 -0
- yaralyzer/config.py +126 -0
- yaralyzer/decoding/bytes_decoder.py +207 -0
- yaralyzer/decoding/decoding_attempt.py +222 -0
- yaralyzer/encoding_detection/character_encodings.py +197 -0
- yaralyzer/encoding_detection/encoding_assessment.py +83 -0
- yaralyzer/encoding_detection/encoding_detector.py +145 -0
- yaralyzer/helpers/bytes_helper.py +268 -0
- yaralyzer/helpers/dict_helper.py +8 -0
- yaralyzer/helpers/file_helper.py +49 -0
- yaralyzer/helpers/list_helper.py +16 -0
- yaralyzer/helpers/rich_text_helper.py +150 -0
- yaralyzer/helpers/string_helper.py +34 -0
- yaralyzer/output/decoding_attempts_table.py +82 -0
- yaralyzer/output/decoding_table_row.py +60 -0
- yaralyzer/output/file_export.py +111 -0
- yaralyzer/output/file_hashes_table.py +82 -0
- yaralyzer/output/regex_match_metrics.py +97 -0
- yaralyzer/output/rich_console.py +114 -0
- yaralyzer/util/argument_parser.py +297 -0
- yaralyzer/util/logging.py +135 -0
- yaralyzer/yara/error.py +90 -0
- yaralyzer/yara/yara_match.py +160 -0
- yaralyzer/yara/yara_rule_builder.py +164 -0
- yaralyzer/yaralyzer.py +304 -0
- yaralyzer-1.0.11.dist-info/LICENSE +674 -0
- yaralyzer-1.0.11.dist-info/METADATA +151 -0
- yaralyzer-1.0.11.dist-info/RECORD +34 -0
- yaralyzer-1.0.11.dist-info/WHEEL +4 -0
- yaralyzer-1.0.11.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Class to manage attempting to decode a chunk of bytes into strings with a given encoding.
|
|
3
|
+
"""
|
|
4
|
+
from sys import byteorder
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from rich.markup import escape
|
|
8
|
+
from rich.text import Text
|
|
9
|
+
|
|
10
|
+
from yaralyzer.bytes_match import BytesMatch # Formerly caused circular import issues
|
|
11
|
+
from yaralyzer.encoding_detection.character_encodings import (ENCODINGS_TO_ATTEMPT, SINGLE_BYTE_ENCODINGS,
|
|
12
|
+
UTF_8, encoding_width, is_wide_utf)
|
|
13
|
+
from yaralyzer.helpers.bytes_helper import clean_byte_string, truncate_for_encoding
|
|
14
|
+
from yaralyzer.helpers.rich_text_helper import prefix_with_style, unprintable_byte_to_text
|
|
15
|
+
from yaralyzer.output.rich_console import ALERT_STYLE, BYTES_BRIGHTER, BYTES_BRIGHTEST, BYTES_NO_DIM, GREY_ADDRESS
|
|
16
|
+
from yaralyzer.util.logging import log
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DecodingAttempt:
    """
    Manages the process of attempting to decode a chunk of bytes into a string using a specified encoding.

    This class tries to decode the bytes using the provided encoding, handling both standard and custom decoding
    strategies (including multi-byte encodings and forced decoding attempts). It tracks the outcome, highlights
    the decoded output, and provides information about the decoding process.

    Attributes:
        bytes (bytes): The bytes (including context) to decode.
        bytes_match (BytesMatch): The `BytesMatch` object containing match and context info.
        encoding (str): The encoding to attempt.
        encoding_label (str): Label for the encoding (may include offset info).
        start_offset (int): Byte offset used for decoding (for multi-byte encodings).
        start_offset_label (Optional[str]): String label for the offset, if used.
        was_force_decoded (bool): True if a forced decode was attempted.
        failed_to_decode (bool): True if decoding failed.
        decoded_string (Text): The decoded string as a Rich `Text` object (with highlighting).
    """

    def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
        """
        Initialize a `DecodingAttempt` for a specific `encoding` on a given `BytesMatch`.

        Args:
            bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
            encoding (str): The encoding to attempt for decoding the bytes.
        """
        self.bytes = bytes_match.surrounding_bytes
        self.bytes_match = bytes_match
        self.encoding = encoding
        # Inferred / derived values
        self.encoding_label = encoding
        self.start_offset = 0              # Offset in bytes to start decoding from
        self.start_offset_label = None     # String to indicate what offset we were able to decode
        self.was_force_decoded = False
        self.failed_to_decode = False
        # Decoding happens eagerly at construction time; the flags above are set as side effects.
        self.decoded_string = self._decode_bytes()

    def _decode_bytes(self) -> Text:
        """
        Tries builtin decode, hands off to other methods for harsher treatment (byte shifting for
        UTF-16/32 and custom decode for the rest) if that fails. Has side effect of setting the
        `self.was_force_decoded` / `self.failed_to_decode` flags.

        Returns:
            Text: Highlighted rich `Text` of the decoded bytes (or an error message `Text` on failure).
        """
        try:
            # Happy path: the bytes decode cleanly with the requested encoding.
            decoded_string = self._to_rich_text(escape(self.bytes.decode(self.encoding)))
            log.info(f"{self.encoding} auto-decoded {self.bytes_match}")
            return decoded_string
        except UnicodeDecodeError:
            log.info(f"{self.encoding} failed on 1st pass decoding {self.bytes_match} capture; custom decoding...")
        except LookupError as e:
            # Encoding name is unknown to Python's codec registry; nothing more we can do.
            log.warning(f"Unknown encoding: {self.encoding}. {e}")
            return self._failed_to_decode_msg_txt(e)

        self.was_force_decoded = True

        if is_wide_utf(self.encoding):
            return self._decode_utf_multibyte()
        else:
            return self._custom_utf_decode()

    def _custom_utf_decode(self) -> Text:
        """
        Returns a `Text` obj representing an attempt to force a UTF-8 encoding onto an array of bytes.
        Walks the bytes one at a time, decoding what it can and substituting placeholder labels
        (e.g. 'CHAR129') for bytes that can't be decoded or shouldn't be printed.
        """
        log.info(f"Custom decoding {self.bytes_match} with {self.encoding}...")
        unprintable_char_map = ENCODINGS_TO_ATTEMPT.get(self.encoding)
        output = Text('', style='bytes.decoded')

        # We use this to skip over bytes consumed by multi-byte UTF-n chars
        skip_next = 0

        for i, b in enumerate(self.bytes):
            if skip_next > 0:
                skip_next -= 1
                continue

            _byte = b.to_bytes(1, byteorder)

            # Color the before and after bytes grey
            if i < self.bytes_match.highlight_start_idx or i > self.bytes_match.highlight_end_idx:
                style = GREY_ADDRESS
            else:
                style = self.bytes_match.highlight_style

            # Brightness varies with the byte's numeric range (unless it's already grey/alert styled)
            if style not in [GREY_ADDRESS, ALERT_STYLE]:
                if b <= 126:
                    style = BYTES_NO_DIM
                elif b <= 192:
                    style = BYTES_BRIGHTER
                else:
                    style = BYTES_BRIGHTEST

            try:
                if unprintable_char_map is not None and b in unprintable_char_map:
                    output.append(unprintable_byte_to_text(unprintable_char_map[b], style=style))
                elif b < 127:
                    output.append(_byte.decode(self.encoding), style=style)
                elif self.encoding != UTF_8:
                    output.append(_byte.decode(self.encoding), style=style)
                # At this point we know it's UTF-8, so it must be a continuation byte
                elif b <= 192:
                    # In UTF-8 continuation bytes are 0x80-0xBF; this branch also catches 0xC0,
                    # which is never valid in UTF-8 anyway. Either way show a placeholder label.
                    output.append(unprintable_byte_to_text(f"CHAR{b}", style=style))
                else:
                    # Lead byte of a UTF-8 multibyte char; width is determined by the lead byte's range.
                    if b <= 223:
                        char_width = 2
                    elif b <= 239:
                        char_width = 3
                    else:
                        char_width = 4

                    wide_char = self.bytes[i:i + char_width].decode(self.encoding)
                    output.append(wide_char, style=style)
                    skip_next = char_width - 1  # Won't be set if there's a decoding exception
                    log.info(f"Skipping next {skip_next} bytes because UTF-8 multibyte char '{wide_char}' used them")
            except UnicodeDecodeError:
                # Couldn't decode this byte; show its cleaned-up raw representation instead.
                output.append(clean_byte_string(_byte), style=style)

        return output

    def _decode_utf_multibyte(self) -> Text:
        """
        UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte
        so we try several offsets until we find one that at least kind of works.

        Returns:
            Text: Rich `Text` object representing the decoded string with highlighting.
        """
        char_width = encoding_width(self.encoding)
        last_exception = None
        decoded_str = None
        bytes_offset = 0

        # Iterate through the possibly byte offsets until we find a valid decoded string (or don't)
        while bytes_offset < char_width:
            try:
                decoded_str = truncate_for_encoding(self.bytes[bytes_offset:], self.encoding).decode(self.encoding)
            except UnicodeDecodeError as e:
                log.info(f"Exception decoding w/offset {bytes_offset} in {self.encoding}: {e}")
                last_exception = e

            # Append the current bytes_offset to the encoding label if we found a valid decoded string
            if decoded_str is not None:
                log.debug(f"Successfully decoded '{self.encoding}' w/offset {bytes_offset}")
                self.start_offset = bytes_offset
                self.start_offset_label = f"offset {self.start_offset} byte" + ('s' if self.start_offset > 1 else '')
                self.encoding_label = f"{self.encoding} ({self.start_offset_label})"
                break

            bytes_offset += 1

        if decoded_str is not None:
            return self._to_rich_text(decoded_str, bytes_offset)
        else:
            # Every offset failed; report the most recent decode error.
            return self._failed_to_decode_msg_txt(last_exception)

    def _to_rich_text(self, _string: str, bytes_offset: int = 0) -> Text:
        """
        Convert a decoded string to highlighted `Text` representation.

        Args:
            _string (str): The decoded string to convert.
            bytes_offset (int): The byte offset used during decoding (for multi-byte encodings).

        Returns:
            Text: The rich `Text` representation of the decoded string with appropriate highlighting.
        """
        # Adjust where we start the highlighting given the multibyte nature of the encodings
        log.debug(f"Stepping through {self.encoding} encoded string...")
        txt = Text('', style=self.bytes_match.style_at_position(0))
        current_byte_idx = 0

        # Prevent unprintable chars other than newline. Some of them disfigure the terminal output permanently
        if self.encoding in SINGLE_BYTE_ENCODINGS:
            is_single_byte_encoding = True
            unprintable_chars = ENCODINGS_TO_ATTEMPT[self.encoding]
        else:
            is_single_byte_encoding = False
            unprintable_chars = {}

        for _i, c in enumerate(_string):
            # Re-encode each char to learn how many bytes it occupied in the original stream
            char_bytes = bytes(c, self.encoding)
            char_width = len(char_bytes)
            style = self.bytes_match.style_at_position(current_byte_idx + bytes_offset)

            # 10 is newline in single byte encodings
            if c.isprintable() or (ord(c) == 10 and is_single_byte_encoding):
                txt.append(c, style)
            elif ord(c) == 9 and is_single_byte_encoding:
                txt.append(unprintable_byte_to_text('\\t', style=style))
            elif ord(c) in unprintable_chars:
                txt.append(unprintable_byte_to_text(unprintable_chars[ord(c)], style=style))
            else:
                txt.append(unprintable_byte_to_text(f"CHAR{ord(c)}", style=style))

            current_byte_idx += char_width

        return txt

    def _failed_to_decode_msg_txt(self, exception: Optional[Exception]) -> Text:
        """Set `self.failed_to_decode` flag and return a `Text` object with the error message."""
        self.failed_to_decode = True
        return prefix_with_style(f"(decode failed: {exception})", style='red dim italic')
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants related to character encodings.
|
|
3
|
+
|
|
4
|
+
Helpful links:
|
|
5
|
+
|
|
6
|
+
* ISO-8859: [www.mit.edu/people/kenta/two/iso8859.html](https://www.mit.edu/people/kenta/two/iso8859.html)
|
|
7
|
+
|
|
8
|
+
* UTF-8: [www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec](https://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec) # noqa: E501
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
# Bytes (TODO: why is this here?)
|
|
12
|
+
NEWLINE_BYTE = b"\n"
|
|
13
|
+
|
|
14
|
+
# String constants
|
|
15
|
+
ENCODING = 'encoding'
|
|
16
|
+
ASCII = 'ascii'
|
|
17
|
+
UTF_8 = 'utf-8'
|
|
18
|
+
UTF_16 = 'utf-16'
|
|
19
|
+
UTF_32 = 'utf-32'
|
|
20
|
+
ISO_8859_1 = 'iso-8859-1'
|
|
21
|
+
WINDOWS_1252 = 'windows-1252'
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Byte order marks
|
|
25
|
+
BOMS = {
|
|
26
|
+
b'\x2b\x2f\x76': 'UTF-7 BOM',
|
|
27
|
+
b'\xef\xbb\xbf': 'UTF-8 BOM',
|
|
28
|
+
b'\xfe\xff': 'UTF-16 BOM big-endian',
|
|
29
|
+
b'\xff\xfe': 'UTF-16 BOM little-endian',
|
|
30
|
+
b'\xff\xfe\x00\x00': 'UTF-32 BOM little-endian',
|
|
31
|
+
b'\x00\x00\xfe\xff': 'UTF-32 BOM big-endian',
|
|
32
|
+
b'\x0e\xfe\xff': 'SCSU BOM',
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ASCII characters that either print nothing, put the cursor in a weird place, or (worst of all) actively
|
|
37
|
+
# delete stuff you already printed
|
|
38
|
+
UNPRINTABLE_ASCII = {
|
|
39
|
+
0: 'NUL', # 'Null',
|
|
40
|
+
1: 'SOH', # 'StartHeading',
|
|
41
|
+
2: 'STX', # 'StartText',
|
|
42
|
+
3: 'ETX',
|
|
43
|
+
4: 'EOT', # End of transmission
|
|
44
|
+
5: 'ENQ', # 'Enquiry',
|
|
45
|
+
6: 'ACK', # 'Acknowledgement',
|
|
46
|
+
7: 'BEL', # 'Bell',
|
|
47
|
+
8: 'BS', # 'BackSpace',
|
|
48
|
+
# 9: 'HT' # 'HorizontalTab',
|
|
49
|
+
# 10: 'LF', # 'LineFeed',
|
|
50
|
+
11: 'VT', # 'VerticalTab',
|
|
51
|
+
12: 'FF', # 'FormFeed', AKA 'NewPage'
|
|
52
|
+
13: 'CR', # 'CarriageReturn',
|
|
53
|
+
14: 'SO', # 'ShiftOut',
|
|
54
|
+
15: 'SI', # 'ShiftIn',
|
|
55
|
+
16: 'DLE', # 'DataLineEscape',
|
|
56
|
+
17: 'DC1', # DeviceControl1',
|
|
57
|
+
18: 'DC2', # 'DeviceControl2',
|
|
58
|
+
19: 'DC3', # 'DeviceControl3',
|
|
59
|
+
20: 'DC4', # 'DeviceControl4',
|
|
60
|
+
21: 'NAK', # NegativeAcknowledgement',
|
|
61
|
+
22: 'SYN', # 'SynchronousIdle',
|
|
62
|
+
23: 'ETB', # 'EndTransmitBlock',
|
|
63
|
+
24: 'CAN', # 'Cancel',
|
|
64
|
+
25: 'EM', # 'EndMedium',
|
|
65
|
+
26: 'SUB', # 'Substitute',
|
|
66
|
+
27: 'ESC', # 'Escape',
|
|
67
|
+
28: 'FS', # 'FileSeparator',
|
|
68
|
+
29: 'GS', # 'GroupSeparator',
|
|
69
|
+
30: 'RS', # 'RecordSeparator',
|
|
70
|
+
31: 'US', # 'UnitSeparator',
|
|
71
|
+
127: 'DEL', # Delete
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def scrub_c1_control_chars(char_map: dict) -> None:
    """
    Fill in a `dict` with integer keys/values corresponding to where a given char encoding has no chars
    because this range is for C1 control chars (AKA the "undefined" part of most character maps).
    Mutates `char_map` in place; existing entries for codes 128-159 are overwritten.
    """
    char_map.update({code: f"C1.CHAR{code}" for code in range(128, 160)})
|
|
83
|
+
|
|
84
|
+
# ISO-8859-1 AKA "Latin-1". Basically ASCII but using more of 128-256 http://www.gammon.com.au/unicode/
|
|
85
|
+
UNPRINTABLE_ISO_8859_1 = UNPRINTABLE_ASCII.copy()
|
|
86
|
+
scrub_c1_control_chars(UNPRINTABLE_ISO_8859_1)
|
|
87
|
+
|
|
88
|
+
UNPRINTABLE_ISO_8859_1.update({
|
|
89
|
+
129: 'HOP',
|
|
90
|
+
141: 'RLF',
|
|
91
|
+
160: 'NBSP',
|
|
92
|
+
173: 'SHY',
|
|
93
|
+
})
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# UTF-8 Makes no use of 128-256 on their own, only as continuation bytes.
|
|
97
|
+
# The C1 bytes can appear but only as continuations
|
|
98
|
+
# https://en.wikipedia.org/wiki/UTF-8
|
|
99
|
+
UNPRINTABLE_UTF_8 = UNPRINTABLE_ASCII.copy()
|
|
100
|
+
|
|
101
|
+
# C0, C1, FE, and FF, etc. *never* appear in UTF-8
|
|
102
|
+
UNPRINTABLE_UTF_8.update({
|
|
103
|
+
192: 'C0',
|
|
104
|
+
193: 'C1',
|
|
105
|
+
245: 'F5',
|
|
106
|
+
246: 'F6',
|
|
107
|
+
247: 'F7',
|
|
108
|
+
248: 'F8',
|
|
109
|
+
249: 'F9',
|
|
110
|
+
250: 'FA',
|
|
111
|
+
251: 'FB',
|
|
112
|
+
252: 'FC',
|
|
113
|
+
253: 'FD',
|
|
114
|
+
254: 'FE',
|
|
115
|
+
255: 'FF',
|
|
116
|
+
})
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Win_1252 is a lot like other 256 char encodings but they colonized the C1 char DMZ in the middle
|
|
120
|
+
UNPRINTABLE_WIN_1252 = UNPRINTABLE_ASCII.copy()
|
|
121
|
+
|
|
122
|
+
UNPRINTABLE_WIN_1252.update({
|
|
123
|
+
129: 'HOP', # High Octet Preset
|
|
124
|
+
141: 'RLF', # Reverse Line Feed
|
|
125
|
+
143: 'SS3', # Single shift 3
|
|
126
|
+
144: 'DCS', # Device Control String
|
|
127
|
+
147: 'STS', # Set transmit state
|
|
128
|
+
160: 'NBSP', # Non-breaking space
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# ISO-8859-7 #http://www.gammon.com.au/unicode/
|
|
133
|
+
UNPRINTABLE_ISO_8859_7 = UNPRINTABLE_ASCII.copy()
|
|
134
|
+
scrub_c1_control_chars(UNPRINTABLE_ISO_8859_7)
|
|
135
|
+
|
|
136
|
+
UNPRINTABLE_ISO_8859_7.update({
|
|
137
|
+
174: 'AE',
|
|
138
|
+
210: 'D2',
|
|
139
|
+
255: 'FF'
|
|
140
|
+
})
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Keys are names of encodings we will attempt to decode with, values are dicts mapping the unprintable bytes
|
|
144
|
+
# in that encoding to appropriate string represenations of those unprintable bytes.
|
|
145
|
+
# Order matters here, as we will attempt the decoding in the order of the keys.
|
|
146
|
+
ENCODINGS_TO_ATTEMPT = {
|
|
147
|
+
ASCII: UNPRINTABLE_ASCII,
|
|
148
|
+
UTF_8: UNPRINTABLE_UTF_8,
|
|
149
|
+
UTF_16: None,
|
|
150
|
+
UTF_32: None, # UTF-16 and 32 are handled differently
|
|
151
|
+
ISO_8859_1: UNPRINTABLE_ISO_8859_1,
|
|
152
|
+
WINDOWS_1252: UNPRINTABLE_WIN_1252,
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
SINGLE_BYTE_ENCODINGS = [
|
|
156
|
+
ASCII,
|
|
157
|
+
ISO_8859_1,
|
|
158
|
+
WINDOWS_1252,
|
|
159
|
+
]
|
|
160
|
+
|
|
161
|
+
# Keys are encodings that use multiple bytes to represent a single character, values are the possible offsets
|
|
162
|
+
# to attempt to use as the starting point for decoding in a given set of bytes.
|
|
163
|
+
WIDE_UTF_ENCODINGS = {
|
|
164
|
+
UTF_16: [0, 1],
|
|
165
|
+
UTF_32: [0, 1, 2, 3],
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def encoding_offsets(encoding: str) -> list:
    """Get possible offsets for a given encoding. If the encoding is not in `WIDE_UTF_ENCODINGS`, return `[0]`."""
    if encoding in WIDE_UTF_ENCODINGS:
        return WIDE_UTF_ENCODINGS[encoding]

    return [0]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def encoding_width(encoding: str) -> int:
    """Get the width of a character in bytes for a given encoding, which is the number of possible offsets."""
    possible_offsets = encoding_offsets(encoding)
    return len(possible_offsets)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def is_wide_utf(encoding: str) -> bool:
    """Return `True` only for the wide (multi-byte fixed width) UTF encodings, i.e. UTF-16 and UTF-32."""
    return encoding in WIDE_UTF_ENCODINGS.keys()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# TODO: this is unused cruft (mostly Asian language encodings)
|
|
185
|
+
ENCODINGS = [
|
|
186
|
+
'big5',
|
|
187
|
+
'big5hkscs',
|
|
188
|
+
'cp950',
|
|
189
|
+
'gb2312',
|
|
190
|
+
'gbk',
|
|
191
|
+
'gb18030',
|
|
192
|
+
'hz',
|
|
193
|
+
'iso2022_jp_2',
|
|
194
|
+
'utf-7',
|
|
195
|
+
'utf-8',
|
|
196
|
+
'utf-16',
|
|
197
|
+
]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Helps with `chardet` library.
|
|
3
|
+
"""
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from rich.text import Text
|
|
7
|
+
|
|
8
|
+
from yaralyzer.encoding_detection.character_encodings import ENCODING
|
|
9
|
+
from yaralyzer.helpers.rich_text_helper import (DIM_COUNTRY_THRESHOLD, meter_style,
|
|
10
|
+
prefix_with_style)
|
|
11
|
+
|
|
12
|
+
CONFIDENCE = 'confidence'
|
|
13
|
+
LANGUAGE = 'language'
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EncodingAssessment:
    """
    Class to smooth some of the rough edges around the `dict`s returned by `chardet.detect_all()`.

    Attributes:
        assessment (dict): The dict returned by `chardet.detect_all()`.
        encoding (str): The encoding detected, in lowercase.
        confidence (float): Confidence score from 0.0 to 100.0.
        confidence_text (Text): Rich `Text` object representing the confidence with styling.
        language (Optional[str]): The detected language, if any.
        encoding_label (Text): Rich `Text` object for displaying the encoding with optional language info.
    """

    def __init__(self, assessment: dict) -> None:
        """
        Args:
            assessment (dict): The `dict` returned by `chardet.detect_all()`.
        """
        self.assessment = assessment
        # NOTE(review): assumes assessment[ENCODING] is a non-None string here — callers appear to
        # filter out the all-None chardet result before constructing assessments; verify at call sites.
        self.encoding = assessment[ENCODING].lower()

        # Shift confidence from 0-1.0 scale to 0-100.0 scale
        self.confidence = 100.0 * (self._get_dict_empty_value_as_None(CONFIDENCE) or 0.0)
        self.confidence_text = prefix_with_style(f"{round(self.confidence, 1)}%", style=meter_style(self.confidence))

        # Add detected language info and label if any language was detected
        self.language = self._get_dict_empty_value_as_None(LANGUAGE)
        self.set_encoding_label(self.language.title() if self.language else None)

    @classmethod
    def dummy_encoding_assessment(cls, encoding: str) -> 'EncodingAssessment':
        """
        Construct an empty `EncodingAssessment` to use as a dummy when `chardet` gives us nothing.

        Args:
            encoding (str): The encoding to use for the dummy assessment.

        Returns:
            EncodingAssessment: Assessment with 0 confidence and a 'none' confidence label.
        """
        assessment = cls({ENCODING: encoding, CONFIDENCE: 0.0})
        assessment.confidence_text = Text('none', 'no_attempt')
        return assessment

    def set_encoding_label(self, alt_text: Optional[str]) -> None:
        """
        Alt text is displayed below the encoding in slightly dimmer font. Rebuilds and replaces
        `self.encoding_label` each time it is called.

        Args:
            alt_text (Optional[str]): Text to display along with the encoding (often the inferred language)
        """
        self.encoding_label = Text(self.encoding, 'encoding.header')

        if alt_text is not None:
            # Dim the alt text when confidence is low so it draws less attention
            dim = 'dim' if (self.confidence or 0.0) < DIM_COUNTRY_THRESHOLD else ''
            self.encoding_label.append(f" ({alt_text})", style=f"color(23) {dim}")

    def __rich__(self) -> Text:
        """Rich console representation: '<Chardet(ENCODING:CONFIDENCE>'."""
        return Text('<Chardet(', 'white') + self.encoding_label + Text(':') + self.confidence_text + Text('>')

    def __str__(self) -> str:
        """Plain-text version of the rich representation."""
        return self.__rich__().plain

    def _get_dict_empty_value_as_None(self, key: str) -> Any:
        """Return `None` if the value at `key` is an empty string, empty list, etc."""
        value = self.assessment.get(key)

        if isinstance(value, (dict, list, str)) and len(value) == 0:
            return None
        else:
            return value
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""
|
|
2
|
+
`EncodingDetector` class for managing chardet encoding detection.
|
|
3
|
+
"""
|
|
4
|
+
from operator import attrgetter
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
import chardet
|
|
8
|
+
from rich import box
|
|
9
|
+
from rich.padding import Padding
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
from yaralyzer.config import YaralyzerConfig
|
|
13
|
+
from yaralyzer.encoding_detection.encoding_assessment import ENCODING, EncodingAssessment
|
|
14
|
+
from yaralyzer.util.logging import log
|
|
15
|
+
|
|
16
|
+
CONFIDENCE_SCORE_RANGE = range(0, 101)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EncodingDetector:
    """
    Manager class to ease dealing with the encoding detection library `chardet`.

    Each instance of this class manages a `chardet.detect_all()` scan on a single set of bytes.

    Attributes:
        bytes (bytes): The bytes to analyze.
        bytes_len (int): The length of the bytes.
        table (Table): A rich `Table` object summarizing the chardet results.
        assessments (List[EncodingAssessment]): List of `EncodingAssessment` objects from `chardet` results.
        unique_assessments (List[EncodingAssessment]): Unique assessments by encoding, highest confidence only.
        raw_chardet_assessments (List[dict]): Raw list of dicts returned by `chardet.detect_all()`.
        force_decode_assessments (List[EncodingAssessment]): Assessments above force decode threshold.
        force_display_assessments (List[EncodingAssessment]): Assessments above force display threshold.
        has_any_idea (Optional[bool]): `True` if `chardet` had any idea what the encoding might be,
            `False` if not, `None` if `chardet` wasn't run yet.
        force_display_threshold (float): `[class variable]` Default confidence threshold for forcing display
            in decoded table.
        force_decode_threshold (float): `[class variable]` Default confidence threshold for forcing a decode attempt.
    """

    # Default value for encodings w/confidences below this will not be displayed in the decoded table
    force_display_threshold = 20.0
    # Default value for what chardet.detect() confidence % should we force a decode with an obscure encoding.
    force_decode_threshold = 50.0

    def __init__(self, _bytes: bytes) -> None:
        """
        Args:
            _bytes (bytes): The bytes to analyze with `chardet`.
        """
        self.bytes = _bytes
        self.bytes_len = len(_bytes)
        self.table = _empty_chardet_results_table()

        # Skip chardet if there's not enough bytes available
        if not self.has_enough_bytes():
            log.debug(f"{self.bytes_len} is not enough bytes to run chardet.detect()")
            self._set_empty_results()
            self.has_any_idea = None  # not false!
            return

        # Unique by encoding, ignoring language. Ordered from highest to lowest confidence
        self.unique_assessments = []
        self.raw_chardet_assessments = chardet.detect_all(self.bytes, ignore_threshold=True)

        # A single all-None result is chardet's way of saying it found nothing at all
        if len(self.raw_chardet_assessments) == 1 and self.raw_chardet_assessments[0][ENCODING] is None:
            log.info(f"chardet.detect() has no idea what the encoding is, result: {self.raw_chardet_assessments}")
            self._set_empty_results()
            self.has_any_idea = False
            return

        self.has_any_idea = True
        self.assessments = [EncodingAssessment(a) for a in self.raw_chardet_assessments]
        self._uniquify_results_and_build_table()
        self.force_decode_assessments = self.assessments_above_confidence(type(self).force_decode_threshold)
        self.force_display_assessments = self.assessments_above_confidence(type(self).force_display_threshold)

    def get_encoding_assessment(self, encoding: str) -> EncodingAssessment:
        """
        Get the `chardet` assessment for a specific encoding.

        Args:
            encoding (str): The encoding to look for.

        Returns:
            EncodingAssessment: Assessment for the given encoding if it exists, otherwise a dummy with 0 confidence.
        """
        assessment = next((r for r in self.unique_assessments if r.encoding == encoding), None)
        return assessment or EncodingAssessment.dummy_encoding_assessment(encoding)

    def has_enough_bytes(self) -> bool:
        """Return `True` if we have enough bytes to run `chardet.detect()`."""
        return self.bytes_len >= YaralyzerConfig.args.min_chardet_bytes

    def assessments_above_confidence(self, cutoff: float) -> List[EncodingAssessment]:
        """Return the assessments above the given confidence cutoff."""
        return [a for a in self.unique_assessments if a.confidence >= cutoff]

    def __rich__(self) -> Padding:
        """Rich console representation: the chardet results table (with zero padding)."""
        return Padding(self.table, (0, 0, 0, 0))

    def _uniquify_results_and_build_table(self) -> None:
        """
        Keep the highest result per encoding, ignoring the language `chardet` has indicated.
        Also populates `self.table` rows as a side effect.

        NOTE(review): results below `min_chardet_table_confidence` are skipped entirely here, which
        excludes them from `unique_assessments` as well as the display table — confirm that's intended.
        """
        already_seen_encodings = {}

        for i, result in enumerate(self.assessments):
            if result.confidence < YaralyzerConfig.args.min_chardet_table_confidence:
                continue

            self.table.add_row(f"{i + 1}", result.encoding_label, result.confidence_text)

            # self.unique_assessments retains one result per encoding possibility (the highest confidence one)
            # Some encodings are not language specific and for those we don't care about the language
            if result.encoding not in already_seen_encodings:
                self.unique_assessments.append(result)
                already_seen_encodings[result.encoding] = result
            else:
                log.debug(f"Skipping chardet result {result} (already saw {already_seen_encodings[result.encoding]})")

        self.unique_assessments.sort(key=attrgetter('confidence'), reverse=True)

    def _set_empty_results(self) -> None:
        """Set empty results for when `chardet` can't help us."""
        self.assessments = []
        self.unique_assessments = []
        self.raw_chardet_assessments = []
        self.force_decode_assessments = []
        self.force_display_assessments = []
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _empty_chardet_results_table() -> Table:
    """Returns an empty `Table` with appropriate columns for `chardet` results."""
    table_options = dict(
        title='chardet.detect results',
        title_style='color(153) italic dim',
        header_style='off_white',
        style='dim',
        box=box.SIMPLE,
        show_edge=False,
        collapse_padding=True,
    )

    chardet_table = Table('Rank', 'Encoding', 'Confidence', **table_options)
    chardet_table.columns[0].justify = 'right'
    return chardet_table