yaralyzer 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ """
2
+ Class to manage attempting to decode a chunk of bytes into strings with a given encoding.
3
+ """
4
+ from sys import byteorder
5
+ from typing import Optional
6
+
7
+ from rich.markup import escape
8
+ from rich.text import Text
9
+
10
+ from yaralyzer.bytes_match import BytesMatch # Formerly caused circular import issues
11
+ from yaralyzer.encoding_detection.character_encodings import (ENCODINGS_TO_ATTEMPT, SINGLE_BYTE_ENCODINGS,
12
+ UTF_8, encoding_width, is_wide_utf)
13
+ from yaralyzer.helpers.bytes_helper import clean_byte_string, truncate_for_encoding
14
+ from yaralyzer.helpers.rich_text_helper import prefix_with_style, unprintable_byte_to_text
15
+ from yaralyzer.output.rich_console import ALERT_STYLE, BYTES_BRIGHTER, BYTES_BRIGHTEST, BYTES_NO_DIM, GREY_ADDRESS
16
+ from yaralyzer.util.logging import log
17
+
18
+
19
class DecodingAttempt:
    """
    Manages the process of attempting to decode a chunk of bytes into a string using a specified encoding.

    This class tries to decode the bytes using the provided encoding, handling both standard and custom decoding
    strategies (including multi-byte encodings and forced decoding attempts). It tracks the outcome, highlights
    the decoded output, and provides information about the decoding process.

    Attributes:
        bytes (bytes): The bytes (including context) to decode.
        bytes_match (BytesMatch): The `BytesMatch` object containing match and context info.
        encoding (str): The encoding to attempt.
        encoding_label (str): Label for the encoding (may include offset info).
        start_offset (int): Byte offset used for decoding (for multi-byte encodings).
        start_offset_label (Optional[str]): String label for the offset, if used.
        was_force_decoded (bool): True if a forced decode was attempted.
        failed_to_decode (bool): True if decoding failed.
        decoded_string (Text): The decoded string as a Rich `Text` object (with highlighting).
    """

    def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
        """
        Initialize a `DecodingAttempt` for a specific `encoding` on a given `BytesMatch`.

        Note that the decode is attempted eagerly, in the constructor itself; by the time `__init__`
        returns, `decoded_string` and the outcome flags are populated.

        Args:
            bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
            encoding (str): The encoding to attempt for decoding the bytes.
        """
        self.bytes = bytes_match.surrounding_bytes
        self.bytes_match = bytes_match
        self.encoding = encoding
        # Inferred / derived values
        self.encoding_label = encoding
        self.start_offset: int = 0  # Offset in bytes to start decoding from
        self.start_offset_label: Optional[str] = None  # String to indicate what offset we were able to decode
        self.was_force_decoded: bool = False
        self.failed_to_decode: bool = False
        self.decoded_string = self._decode_bytes()

    def _decode_bytes(self) -> Text:
        """
        Tries the builtin decode first, then hands off to other methods for harsher treatment (byte
        shifting for UTF-16/32 and custom per-byte decode for the rest) if that fails.

        Side effects: may set `self.was_force_decoded` (when the first pass fails) and, via
        `_failed_to_decode_msg_txt()`, `self.failed_to_decode`. The return value is what `__init__`
        assigns to `self.decoded_string`.

        Returns:
            Text: Rich `Text` representation of the (possibly force-)decoded bytes, or an error message.
        """
        try:
            decoded_string = self._to_rich_text(escape(self.bytes.decode(self.encoding)))
            log.info(f"{self.encoding} auto-decoded {self.bytes_match}")
            return decoded_string
        except UnicodeDecodeError:
            # Expected for many encodings; fall through to the forced decode paths below
            log.info(f"{self.encoding} failed on 1st pass decoding {self.bytes_match} capture; custom decoding...")
        except LookupError as e:
            # Encoding name isn't known to Python's codec registry; nothing more we can do
            log.warning(f"Unknown encoding: {self.encoding}. {e}")
            return self._failed_to_decode_msg_txt(e)

        self.was_force_decoded = True

        # UTF-16/32 are fixed-width so they get the offset-probing treatment; everything else is
        # decoded byte by byte.
        if is_wide_utf(self.encoding):
            return self._decode_utf_multibyte()
        else:
            return self._custom_utf_decode()

    def _custom_utf_decode(self) -> Text:
        """
        Returns a `Text` obj representing an attempt to force a decoding onto an array of bytes,
        working byte by byte and substituting labels (e.g. 'NUL', 'BS') for unprintable bytes.
        """
        log.info(f"Custom decoding {self.bytes_match} with {self.encoding}...")
        # Map of byte value -> display label for this encoding's unprintable chars (None for wide UTFs)
        unprintable_char_map = ENCODINGS_TO_ATTEMPT.get(self.encoding)
        output = Text('', style='bytes.decoded')

        # We use this to skip over bytes consumed by multi-byte UTF-n chars
        skip_next = 0

        for i, b in enumerate(self.bytes):
            if skip_next > 0:
                skip_next -= 1
                continue

            _byte = b.to_bytes(1, byteorder)

            # Color the before and after (context) bytes grey; the match itself gets its highlight style
            if i < self.bytes_match.highlight_start_idx or i > self.bytes_match.highlight_end_idx:
                style = GREY_ADDRESS
            else:
                style = self.bytes_match.highlight_style

            # Within the highlighted region, brighten by byte value: 7-bit bytes dimmest,
            # high-bit bytes progressively brighter
            if style not in [GREY_ADDRESS, ALERT_STYLE]:
                if b <= 126:
                    style = BYTES_NO_DIM
                elif b <= 192:
                    style = BYTES_BRIGHTER
                else:
                    style = BYTES_BRIGHTEST

            try:
                if unprintable_char_map is not None and b in unprintable_char_map:
                    output.append(unprintable_byte_to_text(unprintable_char_map[b], style=style))
                elif b < 127:
                    output.append(_byte.decode(self.encoding), style=style)
                elif self.encoding != UTF_8:
                    # Single byte encodings can decode any single high byte on its own
                    output.append(_byte.decode(self.encoding), style=style)
                # At this point we know it's UTF-8, so it must be a continuation byte
                elif b <= 192:
                    # In UTF-8, 0x80-0xBF (128-191) are continuation bytes and cannot start a char.
                    # (192/0xC0 itself never appears in UTF-8 and is caught by the unprintable map above.)
                    output.append(unprintable_byte_to_text(f"CHAR{b}", style=style))
                else:
                    # UTF-8 lead byte determines char width: <= 0xDF means 2 bytes,
                    # <= 0xEF means 3 bytes, otherwise 4 bytes
                    if b <= 223:
                        char_width = 2
                    elif b <= 239:
                        char_width = 3
                    else:
                        char_width = 4

                    wide_char = self.bytes[i:i + char_width].decode(self.encoding)
                    output.append(wide_char, style=style)
                    skip_next = char_width - 1  # Won't be set if there's a decoding exception
                    log.info(f"Skipping next {skip_next} bytes because UTF-8 multibyte char '{wide_char}' used them")
            except UnicodeDecodeError:
                # Fall back to a cleaned-up representation of the raw byte
                output.append(clean_byte_string(_byte), style=style)

        return output

    def _decode_utf_multibyte(self) -> Text:
        """
        UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte
        so we try several offsets until we find one that at least kind of works.

        Side effects: on success sets `self.start_offset`, `self.start_offset_label` and rewrites
        `self.encoding_label` to include the offset.

        Returns:
            Text: Rich `Text` object representing the decoded string with highlighting.
        """
        char_width = encoding_width(self.encoding)
        last_exception = None
        decoded_str = None
        bytes_offset = 0

        # Iterate through the possible byte offsets until we find a valid decoded string (or don't)
        while bytes_offset < char_width:
            try:
                decoded_str = truncate_for_encoding(self.bytes[bytes_offset:], self.encoding).decode(self.encoding)
            except UnicodeDecodeError as e:
                log.info(f"Exception decoding w/offset {bytes_offset} in {self.encoding}: {e}")
                last_exception = e

            # Append the current bytes_offset to the encoding label if we found a valid decoded string
            if decoded_str is not None:
                log.debug(f"Successfully decoded '{self.encoding}' w/offset {bytes_offset}")
                self.start_offset = bytes_offset
                self.start_offset_label = f"offset {self.start_offset} byte" + ('s' if self.start_offset > 1 else '')
                self.encoding_label = f"{self.encoding} ({self.start_offset_label})"
                break

            bytes_offset += 1

        if decoded_str is not None:
            return self._to_rich_text(decoded_str, bytes_offset)
        else:
            return self._failed_to_decode_msg_txt(last_exception)

    def _to_rich_text(self, _string: str, bytes_offset: int = 0) -> Text:
        """
        Convert a decoded string to highlighted `Text` representation.

        Args:
            _string (str): The decoded string to convert.
            bytes_offset (int): The byte offset used during decoding (for multi-byte encodings).
        Returns:
            Text: The rich `Text` representation of the decoded string with appropriate highlighting.
        """
        # Adjust where we start the highlighting given the multibyte nature of the encodings
        log.debug(f"Stepping through {self.encoding} encoded string...")
        txt = Text('', style=self.bytes_match.style_at_position(0))
        current_byte_idx = 0

        # Prevent unprintable chars other than newline. Some of them disfigure the terminal output permanently
        if self.encoding in SINGLE_BYTE_ENCODINGS:
            is_single_byte_encoding = True
            unprintable_chars = ENCODINGS_TO_ATTEMPT[self.encoding]
        else:
            is_single_byte_encoding = False
            unprintable_chars = {}

        for _i, c in enumerate(_string):
            # Re-encode each char to count how many source bytes it consumed, so highlighting
            # tracks byte positions rather than char positions
            char_bytes = bytes(c, self.encoding)
            char_width = len(char_bytes)
            style = self.bytes_match.style_at_position(current_byte_idx + bytes_offset)

            # 10 is newline in single byte encodings
            if c.isprintable() or (ord(c) == 10 and is_single_byte_encoding):
                txt.append(c, style)
            elif ord(c) == 9 and is_single_byte_encoding:
                # Render tabs as a visible '\t' marker instead of actual whitespace
                txt.append(unprintable_byte_to_text('\\t', style=style))
            elif ord(c) in unprintable_chars:
                txt.append(unprintable_byte_to_text(unprintable_chars[ord(c)], style=style))
            else:
                txt.append(unprintable_byte_to_text(f"CHAR{ord(c)}", style=style))

            current_byte_idx += char_width

        return txt

    def _failed_to_decode_msg_txt(self, exception: Optional[Exception]) -> Text:
        """Set `self.failed_to_decode` flag and return a `Text` object with the error message."""
        self.failed_to_decode = True
        return prefix_with_style(f"(decode failed: {exception})", style='red dim italic')
@@ -0,0 +1,197 @@
1
+ """
2
+ Constants related to character encodings.
3
+
4
+ Helpful links:
5
+
6
+ * ISO-8859: [www.mit.edu/people/kenta/two/iso8859.html](https://www.mit.edu/people/kenta/two/iso8859.html)
7
+
8
+ * UTF-8: [www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec](https://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec) # noqa: E501
9
+ """
10
+
11
# Bytes (TODO: why is this here?)
NEWLINE_BYTE = b"\n"

# String constants: canonical (lowercase) names for the encodings handled in this module
ENCODING = 'encoding'
ASCII = 'ascii'
UTF_8 = 'utf-8'
UTF_16 = 'utf-16'
UTF_32 = 'utf-32'
ISO_8859_1 = 'iso-8859-1'
WINDOWS_1252 = 'windows-1252'


# Byte order marks mapped to human readable labels.
# NOTE(review): b'\xff\xfe' (UTF-16 LE) is a byte prefix of b'\xff\xfe\x00\x00' (UTF-32 LE), so any
# prefix-based matching against BOMS must check the longer keys first — confirm at call sites.
BOMS = {
    b'\x2b\x2f\x76': 'UTF-7 BOM',
    b'\xef\xbb\xbf': 'UTF-8 BOM',
    b'\xfe\xff': 'UTF-16 BOM big-endian',
    b'\xff\xfe': 'UTF-16 BOM little-endian',
    b'\xff\xfe\x00\x00': 'UTF-32 BOM little-endian',
    b'\x00\x00\xfe\xff': 'UTF-32 BOM big-endian',
    b'\x0e\xfe\xff': 'SCSU BOM',
}


# ASCII characters that either print nothing, put the cursor in a weird place, or (worst of all) actively
# delete stuff you already printed. Values are the short labels displayed in place of the raw byte.
# Horizontal tab (9) and line feed (10) are deliberately commented out so they print normally.
UNPRINTABLE_ASCII = {
    0: 'NUL',  # 'Null',
    1: 'SOH',  # 'StartHeading',
    2: 'STX',  # 'StartText',
    3: 'ETX',
    4: 'EOT',  # End of transmission
    5: 'ENQ',  # 'Enquiry',
    6: 'ACK',  # 'Acknowledgement',
    7: 'BEL',  # 'Bell',
    8: 'BS',  # 'BackSpace',
    # 9: 'HT'  # 'HorizontalTab',
    # 10: 'LF',  # 'LineFeed',
    11: 'VT',  # 'VerticalTab',
    12: 'FF',  # 'FormFeed', AKA 'NewPage'
    13: 'CR',  # 'CarriageReturn',
    14: 'SO',  # 'ShiftOut',
    15: 'SI',  # 'ShiftIn',
    16: 'DLE',  # 'DataLineEscape',
    17: 'DC1',  # DeviceControl1',
    18: 'DC2',  # 'DeviceControl2',
    19: 'DC3',  # 'DeviceControl3',
    20: 'DC4',  # 'DeviceControl4',
    21: 'NAK',  # NegativeAcknowledgement',
    22: 'SYN',  # 'SynchronousIdle',
    23: 'ETB',  # 'EndTransmitBlock',
    24: 'CAN',  # 'Cancel',
    25: 'EM',  # 'EndMedium',
    26: 'SUB',  # 'Substitute',
    27: 'ESC',  # 'Escape',
    28: 'FS',  # 'FileSeparator',
    29: 'GS',  # 'GroupSeparator',
    30: 'RS',  # 'RecordSeparator',
    31: 'US',  # 'UnitSeparator',
    127: 'DEL',  # Delete
}
73
+
74
+
75
def scrub_c1_control_chars(char_map: dict) -> None:
    """
    Fill in a `dict` with integer keys/values corresponding to where a given char encoding has no chars
    because this range is for C1 control chars (AKA the "undefined" part of most character maps).

    Mutates `char_map` in place, adding a 'C1.CHAR<n>' label for every code point 128-159.
    """
    char_map.update({code: f"C1.CHAR{code}" for code in range(128, 160)})
82
+
83
+
84
# ISO-8859-1 AKA "Latin-1". Basically ASCII but using more of 128-256 http://www.gammon.com.au/unicode/
UNPRINTABLE_ISO_8859_1 = UNPRINTABLE_ASCII.copy()
scrub_c1_control_chars(UNPRINTABLE_ISO_8859_1)

# Named labels for a few C1-range bytes that scrub_c1_control_chars() labeled generically
UNPRINTABLE_ISO_8859_1.update({
    129: 'HOP',
    141: 'RLF',
    160: 'NBSP',
    173: 'SHY',
})


# UTF-8 Makes no use of 128-256 on their own, only as continuation bytes.
# The C1 bytes can appear but only as continuations
# https://en.wikipedia.org/wiki/UTF-8
UNPRINTABLE_UTF_8 = UNPRINTABLE_ASCII.copy()

# C0, C1, FE, and FF, etc. *never* appear in UTF-8
UNPRINTABLE_UTF_8.update({
    192: 'C0',
    193: 'C1',
    245: 'F5',
    246: 'F6',
    247: 'F7',
    248: 'F8',
    249: 'F9',
    250: 'FA',
    251: 'FB',
    252: 'FC',
    253: 'FD',
    254: 'FE',
    255: 'FF',
})


# Win_1252 is a lot like other 256 char encodings but they colonized the C1 char DMZ in the middle
UNPRINTABLE_WIN_1252 = UNPRINTABLE_ASCII.copy()

UNPRINTABLE_WIN_1252.update({
    129: 'HOP',  # High Octet Preset
    141: 'RLF',  # Reverse Line Feed
    143: 'SS3',  # Single shift 3
    144: 'DCS',  # Device Control String
    147: 'STS',  # Set transmit state
    160: 'NBSP',  # Non-breaking space
})


# ISO-8859-7  http://www.gammon.com.au/unicode/
# NOTE(review): this map is built but never registered in ENCODINGS_TO_ATTEMPT below — confirm
# whether iso-8859-7 was meant to be one of the attempted decodings.
UNPRINTABLE_ISO_8859_7 = UNPRINTABLE_ASCII.copy()
scrub_c1_control_chars(UNPRINTABLE_ISO_8859_7)

UNPRINTABLE_ISO_8859_7.update({
    174: 'AE',
    210: 'D2',
    255: 'FF'
})


# Keys are names of encodings we will attempt to decode with, values are dicts mapping the unprintable bytes
# in that encoding to appropriate string representations of those unprintable bytes.
# Order matters here, as we will attempt the decoding in the order of the keys.
ENCODINGS_TO_ATTEMPT = {
    ASCII: UNPRINTABLE_ASCII,
    UTF_8: UNPRINTABLE_UTF_8,
    UTF_16: None,
    UTF_32: None,  # UTF-16 and 32 are handled differently (offset probing; no per-byte char map)
    ISO_8859_1: UNPRINTABLE_ISO_8859_1,
    WINDOWS_1252: UNPRINTABLE_WIN_1252,
}

# Encodings in which a single byte is always a single character
SINGLE_BYTE_ENCODINGS = [
    ASCII,
    ISO_8859_1,
    WINDOWS_1252,
]

# Keys are encodings that use multiple bytes to represent a single character, values are the possible offsets
# to attempt to use as the starting point for decoding in a given set of bytes.
WIDE_UTF_ENCODINGS = {
    UTF_16: [0, 1],
    UTF_32: [0, 1, 2, 3],
}
167
+
168
+
169
def encoding_offsets(encoding: str) -> list:
    """Get possible offsets for a given encoding. If the encoding is not in `WIDE_UTF_ENCODINGS`, return `[0]`."""
    if encoding in WIDE_UTF_ENCODINGS:
        return WIDE_UTF_ENCODINGS[encoding]

    return [0]
172
+
173
+
174
def encoding_width(encoding: str) -> int:
    """Get the width of a character in bytes for a given encoding, which is the number of possible offsets."""
    possible_offsets = encoding_offsets(encoding)
    return len(possible_offsets)
177
+
178
+
179
def is_wide_utf(encoding: str) -> bool:
    """Check if the encoding is a wide UTF encoding (UTF-16 or UTF-32)."""
    # Values in WIDE_UTF_ENCODINGS are always offset lists, so a non-None lookup means membership
    return WIDE_UTF_ENCODINGS.get(encoding) is not None
182
+
183
+
184
# TODO: this is unused cruft (mostly Asian language encodings); nothing in this module references it
ENCODINGS = [
    'big5',
    'big5hkscs',
    'cp950',
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    'iso2022_jp_2',
    'utf-7',
    'utf-8',
    'utf-16',
]
@@ -0,0 +1,83 @@
1
+ """
2
+ Helps with `chardet` library.
3
+ """
4
+ from typing import Any, Optional
5
+
6
+ from rich.text import Text
7
+
8
+ from yaralyzer.encoding_detection.character_encodings import ENCODING
9
+ from yaralyzer.helpers.rich_text_helper import (DIM_COUNTRY_THRESHOLD, meter_style,
10
+ prefix_with_style)
11
+
12
+ CONFIDENCE = 'confidence'
13
+ LANGUAGE = 'language'
14
+
15
+
16
class EncodingAssessment:
    """
    Wraps one of the result `dict`s produced by `chardet.detect_all()` in a friendlier interface.

    Attributes:
        assessment (dict): The dict returned by `chardet.detect_all()`.
        encoding (str): The encoding detected, in lowercase.
        confidence (float): Confidence score from 0.0 to 100.0.
        confidence_text (Text): Rich `Text` object representing the confidence with styling.
        language (Optional[str]): The detected language, if any.
        encoding_label (Text): Rich `Text` object for displaying the encoding with optional language info.
    """

    def __init__(self, assessment: dict) -> None:
        """
        Args:
            assessment (dict): A single result `dict` from `chardet.detect_all()`.
        """
        self.assessment = assessment
        self.encoding = assessment[ENCODING].lower()

        # chardet reports confidence on a 0-1.0 scale; rescale to 0-100.0 (missing/empty means 0)
        raw_confidence = self._get_dict_empty_value_as_None(CONFIDENCE) or 0.0
        self.confidence = 100.0 * raw_confidence
        self.confidence_text = prefix_with_style(f"{round(self.confidence, 1)}%", style=meter_style(self.confidence))

        # Attach the detected language (if chardet found one) to the display label
        self.language = self._get_dict_empty_value_as_None(LANGUAGE)
        self.set_encoding_label(self.language.title() if self.language else None)

    @classmethod
    def dummy_encoding_assessment(cls, encoding: str) -> 'EncodingAssessment':
        """
        Build a zero-confidence placeholder assessment for when `chardet` offers nothing for `encoding`.

        Args:
            encoding (str): The encoding the placeholder should claim to be about.
        """
        dummy = cls({ENCODING: encoding, CONFIDENCE: 0.0})
        dummy.confidence_text = Text('none', 'no_attempt')
        return dummy

    def set_encoding_label(self, alt_text: Optional[str]) -> None:
        """
        Build `self.encoding_label`, optionally appending `alt_text` in a slightly dimmer font.

        Args:
            alt_text (Optional[str]): Text to display along with the encoding (often the inferred language)
        """
        self.encoding_label = Text(self.encoding, 'encoding.header')

        if alt_text is None:
            return

        # Dim the alt text when confidence is below the display threshold
        dim = 'dim' if (self.confidence or 0.0) < DIM_COUNTRY_THRESHOLD else ''
        self.encoding_label.append(f" ({alt_text})", style=f"color(23) {dim}")

    def __rich__(self) -> Text:
        parts = [Text('<Chardet(', 'white'), self.encoding_label, Text(':'), self.confidence_text, Text('>')]
        combined = parts[0]

        for part in parts[1:]:
            combined = combined + part

        return combined

    def __str__(self) -> str:
        return self.__rich__().plain

    def _get_dict_empty_value_as_None(self, key: str) -> Any:
        """Look up `key` in the assessment dict, coercing empty strings/lists/dicts to `None`."""
        value = self.assessment.get(key)

        if isinstance(value, (dict, list, str)) and not value:
            return None

        return value
@@ -0,0 +1,145 @@
1
+ """
2
+ `EncodingDetector` class for managing chardet encoding detection.
3
+ """
4
+ from operator import attrgetter
5
+ from typing import List
6
+
7
+ import chardet
8
+ from rich import box
9
+ from rich.padding import Padding
10
+ from rich.table import Table
11
+
12
+ from yaralyzer.config import YaralyzerConfig
13
+ from yaralyzer.encoding_detection.encoding_assessment import ENCODING, EncodingAssessment
14
+ from yaralyzer.util.logging import log
15
+
16
+ CONFIDENCE_SCORE_RANGE = range(0, 101)
17
+
18
+
19
class EncodingDetector:
    """
    Manager class to ease dealing with the encoding detection library `chardet`.

    Each instance of this class manages a `chardet.detect_all()` scan on a single set of bytes.

    Attributes:
        bytes (bytes): The bytes to analyze.
        bytes_len (int): The length of the bytes.
        table (Table): A rich `Table` object summarizing the chardet results.
        assessments (List[EncodingAssessment]): List of `EncodingAssessment` objects from `chardet` results.
        unique_assessments (List[EncodingAssessment]): Unique assessments by encoding, highest confidence only.
        raw_chardet_assessments (List[dict]): Raw list of dicts returned by `chardet.detect_all()`.
        force_decode_assessments (List[EncodingAssessment]): Assessments above force decode threshold.
        force_display_assessments (List[EncodingAssessment]): Assessments above force display threshold.
        has_any_idea (Optional[bool]): `True` if `chardet` had any idea what the encoding might be,
            `False` if not, `None` if `chardet` wasn't run yet.
        force_display_threshold (float): `[class variable]` Default confidence threshold for forcing display
            in decoded table.
        force_decode_threshold (float): `[class variable]` Default confidence threshold for forcing a decode attempt.
    """

    # Default value for encodings w/confidences below this will not be displayed in the decoded table
    force_display_threshold = 20.0
    # Default value for what chardet.detect() confidence % should we force a decode with an obscure encoding.
    force_decode_threshold = 50.0

    def __init__(self, _bytes: bytes) -> None:
        """
        Run `chardet.detect_all()` on `_bytes` (unless there are too few of them) and post-process
        the results into assessments, thresholds lists, and a summary table.

        Args:
            _bytes (bytes): The bytes to analyze with `chardet`.
        """
        self.bytes = _bytes
        self.bytes_len = len(_bytes)
        self.table = _empty_chardet_results_table()

        # Skip chardet if there's not enough bytes available
        if not self.has_enough_bytes():
            log.debug(f"{self.bytes_len} is not enough bytes to run chardet.detect()")
            self._set_empty_results()
            self.has_any_idea = None  # not false! None signals "chardet was never run"
            return

        # Unique by encoding, ignoring language. Ordered from highest to lowest confidence
        self.unique_assessments = []
        self.raw_chardet_assessments = chardet.detect_all(self.bytes, ignore_threshold=True)

        # A single result whose encoding is None is chardet's way of saying "no clue"
        if len(self.raw_chardet_assessments) == 1 and self.raw_chardet_assessments[0][ENCODING] is None:
            log.info(f"chardet.detect() has no idea what the encoding is, result: {self.raw_chardet_assessments}")
            self._set_empty_results()
            self.has_any_idea = False
            return

        self.has_any_idea = True
        self.assessments = [EncodingAssessment(a) for a in self.raw_chardet_assessments]
        self._uniquify_results_and_build_table()
        self.force_decode_assessments = self.assessments_above_confidence(type(self).force_decode_threshold)
        self.force_display_assessments = self.assessments_above_confidence(type(self).force_display_threshold)

    def get_encoding_assessment(self, encoding: str) -> EncodingAssessment:
        """
        Get the `chardet` assessment for a specific encoding.

        Args:
            encoding (str): The encoding to look for.

        Returns:
            EncodingAssessment: Assessment for the given encoding if it exists, otherwise a dummy with 0 confidence.
        """
        assessment = next((r for r in self.unique_assessments if r.encoding == encoding), None)
        return assessment or EncodingAssessment.dummy_encoding_assessment(encoding)

    def has_enough_bytes(self) -> bool:
        """Return `True` if we have enough bytes to run `chardet.detect()` (per configured minimum)."""
        return self.bytes_len >= YaralyzerConfig.args.min_chardet_bytes

    def assessments_above_confidence(self, cutoff: float) -> List[EncodingAssessment]:
        """Return the unique assessments whose confidence is at or above `cutoff`."""
        return [a for a in self.unique_assessments if a.confidence >= cutoff]

    def __rich__(self) -> Padding:
        return Padding(self.table, (0, 0, 0, 0))

    def _uniquify_results_and_build_table(self) -> None:
        """Keep the highest result per encoding, ignoring the language `chardet` has indicated."""
        already_seen_encodings = {}

        for i, result in enumerate(self.assessments):
            # NOTE(review): results below the table confidence threshold are skipped entirely here,
            # which also keeps them out of unique_assessments (not just out of the table) — confirm intended.
            if result.confidence < YaralyzerConfig.args.min_chardet_table_confidence:
                continue

            # Rank column reflects chardet's original result ordering
            self.table.add_row(f"{i + 1}", result.encoding_label, result.confidence_text)

            # self.unique_assessments retains one result per encoding possibility (the highest confidence one)
            # Some encodings are not language specific and for those we don't care about the language
            if result.encoding not in already_seen_encodings:
                self.unique_assessments.append(result)
                already_seen_encodings[result.encoding] = result
            else:
                log.debug(f"Skipping chardet result {result} (already saw {already_seen_encodings[result.encoding]})")

        self.unique_assessments.sort(key=attrgetter('confidence'), reverse=True)

    def _set_empty_results(self) -> None:
        """Set empty results for when `chardet` can't help us (too few bytes or no detection)."""
        self.assessments = []
        self.unique_assessments = []
        self.raw_chardet_assessments = []
        self.force_decode_assessments = []
        self.force_display_assessments = []
129
+
130
+
131
def _empty_chardet_results_table() -> Table:
    """Build and return an empty rich `Table` with appropriate columns for `chardet` results."""
    # Presentation options shared by the whole chardet summary table
    table_options = dict(
        title='chardet.detect results',
        title_style='color(153) italic dim',
        header_style='off_white',
        style='dim',
        box=box.SIMPLE,
        show_edge=False,
        collapse_padding=True,
    )

    table = Table('Rank', 'Encoding', 'Confidence', **table_options)
    table.columns[0].justify = 'right'  # Right-align the rank numbers
    return table