yaralyzer 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of yaralyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,8 +1,17 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.0.9
4
+ * Raise `FileNotFoundError` instead of `ValueError` if provided YARA rules files or dirs don't exist
5
+
6
+ ### 1.0.8
7
+ * Bump `python-dotenv` to v1.1.1
8
+ * Use `mkdocs` and `lazydocs` to build automatic API documentation at https://michelcrypt4d4mus.github.io/yaralyzer/
9
+ * Drop python 3.9 support (required by `mkdocs-awesome-nav` package)
10
+
3
11
  ### 1.0.7
4
12
  * Add `Changelog` to PyPI URLs, add some more PyPI classifiers
5
13
  * Add `.flake8` config file and fix style errors
14
+ * Rename `prefix_with_plain_text_obj()` to `prefix_with_style()`
6
15
 
7
16
  ### 1.0.6
8
17
  * Add `Environment :: Console` and `Programming Language :: Python` to PyPI classifiers
yaralyzer/__init__.py CHANGED
@@ -24,6 +24,11 @@ PDFALYZER_MSG_TXT.append('https://github.com/michelcrypt4d4mus/pdfalyzer\n', sty
24
24
 
25
25
 
26
26
  def yaralyze():
27
+ """
28
+ Entry point for yaralyzer when invoked as a script.
29
+
30
+ Args are parsed from the command line and environment variables. See `yaralyze --help` for details.
31
+ """
27
32
  args = parse_arguments()
28
33
  output_basepath = None
29
34
 
@@ -50,13 +55,10 @@ def yaralyze():
50
55
 
51
56
  if args.export_txt:
52
57
  invoke_rich_export(console.save_text, output_basepath)
53
-
54
58
  if args.export_html:
55
59
  invoke_rich_export(console.save_html, output_basepath)
56
-
57
60
  if args.export_svg:
58
61
  invoke_rich_export(console.save_svg, output_basepath)
59
-
60
62
  if args.export_json:
61
63
  export_json(yaralyzer, output_basepath)
62
64
 
yaralyzer/bytes_match.py CHANGED
@@ -1,9 +1,5 @@
1
1
  """
2
- Simple class to keep track of regex matches against binary data. Basically an re.match object with
3
- some (not many) extra bells and whistles, most notably the surrounding_bytes property.
4
-
5
- pre_capture_len and post_capture_len refer to the regex sections before and after the capture group,
6
- e.g. a regex like '123(.*)x:' would have pre_capture_len of 3 and post_capture_len of 2.
2
+ `BytesMatch` class for tracking regex and YARA matches against binary data.
7
3
  """
8
4
  import re
9
5
  from typing import Iterator, Optional
@@ -19,6 +15,13 @@ from yaralyzer.output.rich_console import ALERT_STYLE, GREY_ADDRESS
19
15
 
20
16
 
21
17
  class BytesMatch:
18
+ """
19
+ Simple class to keep track of regex matches against binary data.
20
+
21
+ Basically an `re.Match` object with some (not many) extra bells and whistles, most notably
22
+ the `surrounding_bytes` property.
23
+ """
24
+
22
25
  def __init__(
23
26
  self,
24
27
  matched_against: bytes,
@@ -30,8 +33,16 @@ class BytesMatch:
30
33
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
31
34
  ) -> None:
32
35
  """
33
- Ordinal means it's the Nth match with this regex (not super important but useful)
34
- YARA makes it a little rouch to get the actual regex that matched. Can be done with plyara eventually.
36
+ Initialize a `BytesMatch` object representing a match against binary data.
37
+
38
+ Args:
39
+ matched_against (bytes): The full byte sequence that was searched.
40
+ start_idx (int): Start index of the match in the byte sequence.
41
+ length (int): Length of the match in bytes.
42
+ label (str): Label for the match (e.g., regex or YARA rule name).
43
+ ordinal (int): This was the Nth match for this pattern (used for labeling only).
44
+ match (Optional[re.Match]): Regex `match` object, if available.
45
+ highlight_style (str): Style to use for highlighting the match.
35
46
  """
36
47
  self.matched_against: bytes = matched_against
37
48
  self.start_idx: int = start_idx
@@ -58,6 +69,18 @@ class BytesMatch:
58
69
  ordinal: int,
59
70
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
60
71
  ) -> 'BytesMatch':
72
+ """
73
+ Alternate constructor to build a `BytesMatch` from a regex match object.
74
+
75
+ Args:
76
+ matched_against (bytes): The bytes searched.
77
+ match (re.Match): The regex `match` object.
78
+ ordinal (int): This was the Nth match for this pattern (used for labeling only).
79
+ highlight_style (str): Style for highlighting.
80
+
81
+ Returns:
82
+ BytesMatch: The constructed `BytesMatch` instance.
83
+ """
61
84
  return cls(matched_against, match.start(), len(match[0]), match.re.pattern, ordinal, match, highlight_style)
62
85
 
63
86
  @classmethod
@@ -70,7 +93,20 @@ class BytesMatch:
70
93
  ordinal: int,
71
94
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
72
95
  ) -> 'BytesMatch':
73
- """Build a BytesMatch from a yara string match. 'matched_against' is the set of bytes yara was run against."""
96
+ """
97
+ Alternate constructor to build a `BytesMatch` from a YARA string match instance.
98
+
99
+ Args:
100
+ matched_against (bytes): The bytes searched.
101
+ rule_name (str): Name of the YARA rule.
102
+ yara_str_match (StringMatch): YARA string match object.
103
+ yara_str_match_instance (StringMatchInstance): Instance of the string match.
104
+ ordinal (int): The Nth match for this pattern.
105
+ highlight_style (str): Style for highlighting.
106
+
107
+ Returns:
108
+ BytesMatch: The constructed BytesMatch instance.
109
+ """
74
110
  pattern_label = yara_str_match.identifier
75
111
 
76
112
  # Don't duplicate the labeling if rule_name and yara_str are the same
@@ -94,7 +130,17 @@ class BytesMatch:
94
130
  yara_match: dict,
95
131
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
96
132
  ) -> Iterator['BytesMatch']:
97
- """Iterator w/a BytesMatch for each string returned as part of a YARA match result dict."""
133
+ """
134
+ Yield a `BytesMatch` for each string returned as part of a YARA match result dict.
135
+
136
+ Args:
137
+ matched_against (bytes): The bytes searched.
138
+ yara_match (dict): YARA match result dictionary.
139
+ highlight_style (str): Style for highlighting.
140
+
141
+ Yields:
142
+ BytesMatch: For each string match in the YARA result.
143
+ """
98
144
  i = 0 # For numbered labeling
99
145
 
100
146
  # yara-python's internals changed with 4.3.0: https://github.com/VirusTotal/yara-python/releases/tag/v4.3.0
@@ -112,14 +158,27 @@ class BytesMatch:
112
158
  )
113
159
 
114
160
  def style_at_position(self, idx) -> str:
115
- """Get the style for the byte at position idx within the matched bytes"""
161
+ """
162
+ Get the style for the byte at position `idx` within the matched bytes.
163
+
164
+ Args:
165
+ idx (int): Index within the surrounding bytes.
166
+
167
+ Returns:
168
+ str: The style to use for this byte (highlight or greyed out).
169
+ """
116
170
  if idx < self.highlight_start_idx or idx >= self.highlight_end_idx:
117
171
  return GREY_ADDRESS
118
172
  else:
119
173
  return self.highlight_style
120
174
 
121
175
  def location(self) -> Text:
122
- """Returns a Text obj like '(start idx: 348190, end idx: 348228)'"""
176
+ """
177
+ Get a styled `Text` object describing the start and end index of the match.
178
+
179
+ Returns:
180
+ Text: Rich Text object like '(start idx: 348190, end idx: 348228)'.
181
+ """
123
182
  location_txt = prefix_with_style(
124
183
  f"(start idx: ",
125
184
  style='off_white',
@@ -133,13 +192,26 @@ class BytesMatch:
133
192
  return location_txt
134
193
 
135
194
  def is_decodable(self) -> bool:
136
- """True if SUPPRESS_DECODES_TABLE is false and length of self.bytes is between MIN/MAX_DECODE_LENGTH"""
195
+ """
196
+ Determine if the matched bytes should be decoded.
197
+
198
+ Whether the bytes are decodable depends on whether `SUPPRESS_DECODES_TABLE` is set
199
+ and whether the match length is between `MIN`/`MAX_DECODE_LENGTH`.
200
+
201
+ Returns:
202
+ bool: `True` if decodable, `False` otherwise.
203
+ """
137
204
  return self.match_length >= YaralyzerConfig.args.min_decode_length \
138
205
  and self.match_length <= YaralyzerConfig.args.max_decode_length \
139
206
  and not YaralyzerConfig.args.suppress_decodes_table
140
207
 
141
208
  def bytes_hashes_table(self) -> Table:
142
- """Helper function to build the MD5/SHA table for self.bytes"""
209
+ """
210
+ Build a table of MD5/SHA hashes for the matched bytes.
211
+
212
+ Returns:
213
+ Table: Rich `Table` object with hashes.
214
+ """
143
215
  return bytes_hashes_table(
144
216
  self.bytes,
145
217
  self.location().plain,
@@ -147,7 +219,12 @@ class BytesMatch:
147
219
  )
148
220
 
149
221
  def suppression_notice(self) -> Text:
150
- """Generate a message for when there are too few/too many bytes"""
222
+ """
223
+ Generate a message for when the match is too short or too long to decode.
224
+
225
+ Returns:
226
+ Text: Rich `Text` object with the suppression notice.
227
+ """
151
228
  txt = self.__rich__()
152
229
 
153
230
  if self.match_length < YaralyzerConfig.args.min_decode_length:
@@ -159,7 +236,12 @@ class BytesMatch:
159
236
  return txt
160
237
 
161
238
  def to_json(self) -> dict:
162
- """Convert this BytesMatch to a JSON-serializable dict."""
239
+ """
240
+ Convert this `BytesMatch` to a JSON-serializable dictionary.
241
+
242
+ Returns:
243
+ dict: Dictionary representation of the match, suitable for JSON serialization.
244
+ """
163
245
  json_dict = {
164
246
  'label': self.label,
165
247
  'match_length': self.match_length,
@@ -178,7 +260,13 @@ class BytesMatch:
178
260
  return json_dict
179
261
 
180
262
  def _find_surrounding_bytes(self, num_before: Optional[int] = None, num_after: Optional[int] = None) -> None:
181
- """Find the surrounding bytes, making sure not to step off the beginning or end"""
263
+ """
264
+ Find and set the bytes surrounding the match, ensuring indices stay within bounds.
265
+
266
+ Args:
267
+ num_before (Optional[int]): Number of bytes before the match to include.
268
+ num_after (Optional[int]): Number of bytes after the match to include.
269
+ """
182
270
  num_after = num_after or num_before or YaralyzerConfig.args.surrounding_bytes
183
271
  num_before = num_before or YaralyzerConfig.args.surrounding_bytes
184
272
  self.surrounding_start_idx: int = max(self.start_idx - num_before, 0)
@@ -186,6 +274,7 @@ class BytesMatch:
186
274
  self.surrounding_bytes: bytes = self.matched_against[self.surrounding_start_idx:self.surrounding_end_idx]
187
275
 
188
276
  def __rich__(self) -> Text:
277
+ """Get a rich `Text` representation of the match for display."""
189
278
  headline = prefix_with_style(str(self.match_length), style='number', root_style='decode.subheading')
190
279
  headline.append(f" bytes matching ")
191
280
  headline.append(f"{self.label} ", style=ALERT_STYLE if self.highlight_style == ALERT_STYLE else 'regex')
@@ -193,4 +282,5 @@ class BytesMatch:
193
282
  return headline + self.location()
194
283
 
195
284
  def __str__(self):
285
+ """Plain text (no rich colors) representation of the match for display."""
196
286
  return self.__rich__().plain
yaralyzer/config.py CHANGED
@@ -1,3 +1,6 @@
1
+ """
2
+ Configuration management for Yaralyzer.
3
+ """
1
4
  import logging
2
5
  from argparse import ArgumentParser, Namespace
3
6
  from os import environ
@@ -15,16 +18,20 @@ MEGABYTE = 1024 * KILOBYTE
15
18
 
16
19
  def config_var_name(env_var: str) -> str:
17
20
  """
18
- Get the name of env_var and strip off 'YARALYZER_', e.g.:
21
+ Get the name of `env_var` and strip off `YARALYZER_` prefix.
22
+
23
+ Example:
24
+ ```
19
25
  SURROUNDING_BYTES_ENV_VAR = 'YARALYZER_SURROUNDING_BYTES'
20
26
  config_var_name(SURROUNDING_BYTES_ENV_VAR) => 'SURROUNDING_BYTES'
27
+ ```
21
28
  """
22
29
  env_var = env_var.removeprefix("YARALYZER_")
23
30
  return f'{env_var=}'.partition('=')[0]
24
31
 
25
32
 
26
- def is_env_var_set_and_not_false(var_name):
27
- """Returns True if var_name is not empty and set to anything other than 'false' (capitalization agnostic)"""
33
+ def is_env_var_set_and_not_false(var_name: str) -> bool:
34
+ """Return `True` if `var_name` is not empty and set to anything other than "false" (capitalization agnostic)."""
28
35
  if var_name in environ:
29
36
  var_value = environ[var_name]
30
37
  return var_value is not None and len(var_value) > 0 and var_value.lower() != 'false'
@@ -32,12 +39,14 @@ def is_env_var_set_and_not_false(var_name):
32
39
  return False
33
40
 
34
41
 
35
- def is_invoked_by_pytest():
36
- """Return true if pytest is running"""
42
+ def is_invoked_by_pytest() -> bool:
43
+ """Return `True` if invoked in a `pytest` context."""
37
44
  return is_env_var_set_and_not_false(PYTEST_FLAG)
38
45
 
39
46
 
40
47
  class YaralyzerConfig:
48
+ """Handles parsing of command line args and environment variables for Yaralyzer."""
49
+
41
50
  # Passed through to yara.set_config()
42
51
  DEFAULT_MAX_MATCH_LENGTH = 100 * KILOBYTE
43
52
  DEFAULT_YARA_STACK_SIZE = 2 * 65536
@@ -76,11 +85,13 @@ class YaralyzerConfig:
76
85
 
77
86
  @classmethod
78
87
  def set_argument_parser(cls, parser: ArgumentParser) -> None:
88
+ """Set the `_argument_parser` and `_argparse_keys` class variables used to parse command line args."""
79
89
  cls._argument_parser: ArgumentParser = parser
80
90
  cls._argparse_keys: List[str] = sorted([action.dest for action in parser._actions])
81
91
 
82
92
  @classmethod
83
93
  def set_args(cls, args: Namespace) -> None:
94
+ """Set the `args` class variable and update args with any environment variable overrides."""
84
95
  cls.args = args
85
96
 
86
97
  for option in cls._argparse_keys:
@@ -105,9 +116,11 @@ class YaralyzerConfig:
105
116
 
106
117
  @classmethod
107
118
  def set_default_args(cls):
119
+ """Set `cls.args` to the default values, as if parsed from a command line with no options."""
108
120
  cls.set_args(cls._argument_parser.parse_args(['dummy']))
109
121
 
110
122
  @classmethod
111
123
  def get_default_arg(cls, arg: str) -> Any:
124
+ """Return the default value for `arg` as defined by a `DEFAULT_` style class variable."""
112
125
  default_var = f"DEFAULT_{arg.upper()}"
113
126
  return vars(cls).get(default_var)
@@ -1,7 +1,5 @@
1
1
  """
2
- Class to handle attempting to decode a chunk of bytes into strings with various possible encodings.
3
- Leverages the chardet library to both guide what encodings are attempted as well as to rank decodings
4
- in the results.
2
+ `BytesDecoder` class for attempting to decode bytes with various encodings.
5
3
  """
6
4
  from collections import defaultdict
7
5
  from copy import deepcopy
@@ -34,7 +32,34 @@ SCORE_SCALER = 100.0
34
32
 
35
33
 
36
34
  class BytesDecoder:
35
+ """
36
+ Handles decoding a chunk of bytes into strings using various possible encodings, ranking and displaying results.
37
+
38
+ This class leverages the `chardet` library and custom logic to try multiple encodings, track decoding outcomes,
39
+ and present the results in a rich, user-friendly format. It is used to analyze and display the possible
40
+ interpretations of a byte sequence, especially in the context of YARA matches or binary analysis.
41
+
42
+ Attributes:
43
+ bytes_match (BytesMatch): The `BytesMatch` instance being decoded.
44
+ bytes (bytes): The bytes (including surrounding context) to decode.
45
+ label (str): Label for this decoding attempt.
46
+ was_match_decodable (dict): Tracks successful decodes per encoding.
47
+ was_match_force_decoded (dict): Tracks forced decodes per encoding.
48
+ was_match_undecodable (dict): Tracks failed decodes per encoding.
49
+ decoded_strings (dict): Maps encoding to decoded string.
50
+ undecoded_rows (list): Stores undecoded table rows.
51
+ decodings (list): List of DecodingAttempt objects for each encoding tried.
52
+ encoding_detector (EncodingDetector): Used to detect and assess possible encodings.
53
+ """
54
+
37
55
  def __init__(self, bytes_match: 'BytesMatch', label: Optional[str] = None) -> None:
56
+ """
57
+ Initialize a `BytesDecoder` for attempting to decode a chunk of bytes using various encodings.
58
+
59
+ Args:
60
+ bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
61
+ label (Optional[str], optional): Optional label for this decoding attempt. Defaults to the match label.
62
+ """
38
63
  self.bytes_match = bytes_match
39
64
  self.bytes = bytes_match.surrounding_bytes
40
65
  self.label = label or bytes_match.label
@@ -51,7 +76,7 @@ class BytesDecoder:
51
76
  self.encoding_detector = EncodingDetector(self.bytes)
52
77
 
53
78
  def __rich_console__(self, _console: Console, options: ConsoleOptions) -> RenderResult:
54
- """Rich object generator (see Rich console docs)"""
79
+ """Rich object generator (see Rich console docs)."""
55
80
  yield NewLine(2)
56
81
  yield Align(self._decode_attempt_subheading(), CENTER)
57
82
 
@@ -70,7 +95,12 @@ class BytesDecoder:
70
95
  yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')
71
96
 
72
97
  def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
73
- """First rows are the raw / hex views of the bytes, next rows are the attempted decodings"""
98
+ """
99
+ First rows are the raw / hex views of the bytes, next rows are the attempted decodings.
100
+
101
+ Args:
102
+ suppress_decodes (bool, optional): If `True` don't add decoding attempts to the table. Defaults to `False`.
103
+ """
74
104
  self.table = new_decoding_attempts_table(self.bytes_match)
75
105
 
76
106
  # Add the encoding rows to the table if not suppressed
@@ -102,20 +132,20 @@ class BytesDecoder:
102
132
  return self._undecoded_assessments(self.encoding_detector.force_display_assessments)
103
133
 
104
134
  def _undecoded_assessments(self, assessments: List[EncodingAssessment]) -> List[EncodingAssessment]:
105
- """Filter out the already decoded assessments from a set of assessments"""
135
+ """Filter out the already decoded assessments from a set of assessments."""
106
136
  return [a for a in assessments if not self._was_decoded(a.encoding)]
107
137
 
108
138
  def _was_decoded(self, encoding: str) -> bool:
109
- """Check whether a given encoding is in the table already"""
139
+ """Check whether a given encoding is in the table already."""
110
140
  return any(row.encoding == encoding for row in self.decodings)
111
141
 
112
142
  def _decode_attempt_subheading(self) -> Panel:
113
- """Generate a rich.Panel for displaying decode attempts"""
143
+ """Generate a rich.Panel for displaying decode attempts."""
114
144
  headline = Text(f"Found ", style='decode.subheading') + self.bytes_match.__rich__()
115
145
  return Panel(headline, style='decode.subheading', expand=False)
116
146
 
117
147
  def _track_decode_stats(self) -> None:
118
- """Track stats about successful vs. forced vs. failed decode attempts"""
148
+ """Track stats about successful vs. forced vs. failed decode attempts."""
119
149
  for decoding in self.decodings:
120
150
  if decoding.failed_to_decode:
121
151
  self.was_match_undecodable[decoding.encoding] += 1
@@ -127,7 +157,7 @@ class BytesDecoder:
127
157
  self.was_match_force_decoded[decoding.encoding] += 1
128
158
 
129
159
  def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
130
- """Create a DecodingAttemptTable row from a DecodingAttempt."""
160
+ """Create a `DecodingTableRow` from a `DecodingAttempt`."""
131
161
  assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)
132
162
 
133
163
  # If the decoding can have a start offset add an appropriate extension to the encoding label
@@ -162,7 +192,7 @@ class BytesDecoder:
162
192
 
163
193
 
164
194
  def _build_encodings_metric_dict():
165
- """One key for each key in ENCODINGS_TO_ATTEMPT, values are all 0"""
195
+ """One key for each key in `ENCODINGS_TO_ATTEMPT`, values are all 0."""
166
196
  metrics_dict = defaultdict(lambda: 0)
167
197
 
168
198
  for encoding in ENCODINGS_TO_ATTEMPT.keys():
@@ -7,7 +7,7 @@ from typing import Optional
7
7
  from rich.markup import escape
8
8
  from rich.text import Text
9
9
 
10
- from yaralyzer.bytes_match import BytesMatch # Used to cause circular import issues
10
+ from yaralyzer.bytes_match import BytesMatch # Formerly caused circular import issues
11
11
  from yaralyzer.encoding_detection.character_encodings import (ENCODINGS_TO_ATTEMPT, SINGLE_BYTE_ENCODINGS,
12
12
  UTF_8, encoding_width, is_wide_utf)
13
13
  from yaralyzer.helpers.bytes_helper import clean_byte_string, truncate_for_encoding
@@ -17,8 +17,33 @@ from yaralyzer.util.logging import log
17
17
 
18
18
 
19
19
  class DecodingAttempt:
20
+ """
21
+ Manages the process of attempting to decode a chunk of bytes into a string using a specified encoding.
22
+
23
+ This class tries to decode the bytes using the provided encoding, handling both standard and custom decoding
24
+ strategies (including multi-byte encodings and forced decoding attempts). It tracks the outcome, highlights
25
+ the decoded output, and provides information about the decoding process.
26
+
27
+ Attributes:
28
+ bytes (bytes): The bytes (including context) to decode.
29
+ bytes_match (BytesMatch): The `BytesMatch` object containing match and context info.
30
+ encoding (str): The encoding to attempt.
31
+ encoding_label (str): Label for the encoding (may include offset info).
32
+ start_offset (int): Byte offset used for decoding (for multi-byte encodings).
33
+ start_offset_label (Optional[str]): String label for the offset, if used.
34
+ was_force_decoded (bool): True if a forced decode was attempted.
35
+ failed_to_decode (bool): True if decoding failed.
36
+ decoded_string (Text): The decoded string as a Rich `Text` object (with highlighting).
37
+ """
38
+
20
39
  def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
21
- # Args
40
+ """
41
+ Initialize a `DecodingAttempt` for a specific `encoding` on a given `BytesMatch`.
42
+
43
+ Args:
44
+ bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
45
+ encoding (str): The encoding to attempt for decoding the bytes.
46
+ """
22
47
  self.bytes = bytes_match.surrounding_bytes
23
48
  self.bytes_match = bytes_match
24
49
  self.encoding = encoding
@@ -30,15 +55,11 @@ class DecodingAttempt:
30
55
  self.failed_to_decode = False
31
56
  self.decoded_string = self._decode_bytes()
32
57
 
33
- def is_wide_utf_encoding(self) -> bool:
34
- """Returns True if the encoding is UTF-16 or UTF-32"""
35
- return is_wide_utf(self.encoding)
36
-
37
58
  def _decode_bytes(self) -> Text:
38
59
  """
39
- Tries builtin decode, hands off to other methods for harsher treatement
40
- (byte shifting for UTF-16/32 and custom decode for the rest) if that fails.
41
- Has side effect of setting 'self.decoded_string' value.
60
+ Tries builtin decode, hands off to other methods for harsher treatment (byte shifting for
61
+ UTF-16/32 and custom decode for the rest) if that fails. Has side effect of setting
62
+ `self.decoded_string` value.
42
63
  """
43
64
  try:
44
65
  decoded_string = self._to_rich_text(escape(self.bytes.decode(self.encoding)))
@@ -52,13 +73,15 @@ class DecodingAttempt:
52
73
 
53
74
  self.was_force_decoded = True
54
75
 
55
- if self.is_wide_utf_encoding():
76
+ if is_wide_utf(self.encoding):
56
77
  return self._decode_utf_multibyte()
57
78
  else:
58
- return self._custom_decode()
79
+ return self._custom_utf_decode()
59
80
 
60
- def _custom_decode(self) -> Text:
61
- """Returns a Text obj representing an attempt to force a UTF-8 encoding upon an array of bytes"""
81
+ def _custom_utf_decode(self) -> Text:
82
+ """
83
+ Returns a `Text` obj representing an attempt to force a UTF-8 encoding onto an array of bytes.
84
+ """
62
85
  log.info(f"Custom decoding {self.bytes_match} with {self.encoding}...")
63
86
  unprintable_char_map = ENCODINGS_TO_ATTEMPT.get(self.encoding)
64
87
  output = Text('', style='bytes.decoded')
@@ -116,7 +139,13 @@ class DecodingAttempt:
116
139
  return output
117
140
 
118
141
  def _decode_utf_multibyte(self) -> Text:
119
- """UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte."""
142
+ """
143
+ UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte
144
+ so we try several offsets until we find one that at least kind of works.
145
+
146
+ Returns:
147
+ Text: Rich `Text` object representing the decoded string with highlighting.
148
+ """
120
149
  char_width = encoding_width(self.encoding)
121
150
  last_exception = None
122
151
  decoded_str = None
@@ -146,7 +175,15 @@ class DecodingAttempt:
146
175
  return self._failed_to_decode_msg_txt(last_exception)
147
176
 
148
177
  def _to_rich_text(self, _string: str, bytes_offset: int = 0) -> Text:
149
- """Convert a decoded string to highlighted Text representation"""
178
+ """
179
+ Convert a decoded string to highlighted `Text` representation.
180
+
181
+ Args:
182
+ _string (str): The decoded string to convert.
183
+ bytes_offset (int): The byte offset used during decoding (for multi-byte encodings).
184
+ Returns:
185
+ Text: The rich `Text` representation of the decoded string with appropriate highlighting.
186
+ """
150
187
  # Adjust where we start the highlighting given the multibyte nature of the encodings
151
188
  log.debug(f"Stepping through {self.encoding} encoded string...")
152
189
  txt = Text('', style=self.bytes_match.style_at_position(0))
@@ -160,7 +197,7 @@ class DecodingAttempt:
160
197
  is_single_byte_encoding = False
161
198
  unprintable_chars = {}
162
199
 
163
- for i, c in enumerate(_string):
200
+ for _i, c in enumerate(_string):
164
201
  char_bytes = bytes(c, self.encoding)
165
202
  char_width = len(char_bytes)
166
203
  style = self.bytes_match.style_at_position(current_byte_idx + bytes_offset)
@@ -180,6 +217,6 @@ class DecodingAttempt:
180
217
  return txt
181
218
 
182
219
  def _failed_to_decode_msg_txt(self, exception: Optional[Exception]) -> Text:
183
- """Set failed_to_decode flag and return a Text object with the error message."""
220
+ """Set `self.failed_to_decode` flag and return a `Text` object with the error message."""
184
221
  self.failed_to_decode = True
185
222
  return prefix_with_style(f"(decode failed: {exception})", style='red dim italic')
@@ -1,7 +1,11 @@
1
1
  """
2
- Constants related to character encodings
3
- * https://www.mit.edu/people/kenta/two/iso8859.html
4
- * https://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec
2
+ Constants related to character encodings.
3
+
4
+ Helpful links:
5
+
6
+ * ISO-8859: [www.mit.edu/people/kenta/two/iso8859.html](https://www.mit.edu/people/kenta/two/iso8859.html)
7
+
8
+ * UTF-8: [www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec](https://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec) # noqa: E501
5
9
  """
6
10
 
7
11
  # Bytes (TODO: why is this here?)
@@ -68,10 +72,10 @@ UNPRINTABLE_ASCII = {
68
72
  }
69
73
 
70
74
 
71
- def scrub_c1_control_chars(char_map):
75
+ def scrub_c1_control_chars(char_map: dict) -> None:
72
76
  """
73
- Fill in a dict with integer keys/values corresponding to where a given char encoding has no chars
74
- because this range is for C1 control chars (AKA the 'undefined' part of the character map)
77
+ Fill in a `dict` with integer keys and placeholder string values where a given char encoding has no chars
78
+ because this range is for C1 control chars (AKA the "undefined" part of most character maps).
75
79
  """
76
80
  for i in range(128, 160):
77
81
  char_map[i] = f"C1.CHAR{i}"
@@ -163,7 +167,7 @@ WIDE_UTF_ENCODINGS = {
163
167
 
164
168
 
165
169
  def encoding_offsets(encoding: str) -> list:
166
- """Get possible offsets for a given encoding. If the encoding is not in WIDE_UTF_ENCODINGS, return [0]."""
170
+ """Get possible offsets for a given encoding. If the encoding is not in `WIDE_UTF_ENCODINGS`, return `[0]`."""
167
171
  return WIDE_UTF_ENCODINGS.get(encoding, [0])
168
172
 
169
173