yaralyzer 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of yaralyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,8 +1,14 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.0.8
4
+ * Bump `python-dotenv` to v1.1.1
5
+ * Use `mkdocs` and `lazydocs` to build automatic API documentation at https://michelcrypt4d4mus.github.io/yaralyzer/
6
+ * Drop python 3.9 support (required by `mkdocs-awesome-nav` package)
7
+
3
8
  ### 1.0.7
4
9
  * Add `Changelog` to PyPi URLs, add some more PyPi classifiers
5
10
  * Add `.flake8` config file and fix style errors
11
+ * Rename `prefix_with_plain_text_obj()` to `prefix_with_style()`
6
12
 
7
13
  ### 1.0.6
8
14
  * Add `Environment :: Console` and `Programming Language :: Python` to PyPi classifiers
yaralyzer/__init__.py CHANGED
@@ -24,6 +24,11 @@ PDFALYZER_MSG_TXT.append('https://github.com/michelcrypt4d4mus/pdfalyzer\n', sty
24
24
 
25
25
 
26
26
  def yaralyze():
27
+ """
28
+ Entry point for yaralyzer when invoked as a script.
29
+
30
+ Args are parsed from the command line and environment variables. See yaralyzer --help for details.
31
+ """
27
32
  args = parse_arguments()
28
33
  output_basepath = None
29
34
 
yaralyzer/bytes_match.py CHANGED
@@ -1,10 +1,4 @@
1
- """
2
- Simple class to keep track of regex matches against binary data. Basically an re.match object with
3
- some (not many) extra bells and whistles, most notably the surrounding_bytes property.
4
-
5
- pre_capture_len and post_capture_len refer to the regex sections before and after the capture group,
6
- e.g. a regex like '123(.*)x:' would have pre_capture_len of 3 and post_capture_len of 2.
7
- """
1
+ """BytesMatch class for tracking regex and YARA matches against binary data."""
8
2
  import re
9
3
  from typing import Iterator, Optional
10
4
 
@@ -19,6 +13,16 @@ from yaralyzer.output.rich_console import ALERT_STYLE, GREY_ADDRESS
19
13
 
20
14
 
21
15
  class BytesMatch:
16
+ """
17
+ Simple class to keep track of regex matches against binary data.
18
+
19
+ Basically an re.match object with some (not many) extra bells and whistles, most notably
20
+ the surrounding_bytes property.
21
+
22
+ pre_capture_len and post_capture_len refer to the regex sections before and after the capture group,
23
+ e.g. a regex like '123(.*)x:' would have pre_capture_len of 3 and post_capture_len of 2.
24
+ """
25
+
22
26
  def __init__(
23
27
  self,
24
28
  matched_against: bytes,
@@ -30,8 +34,16 @@ class BytesMatch:
30
34
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
31
35
  ) -> None:
32
36
  """
33
- Ordinal means it's the Nth match with this regex (not super important but useful)
34
- YARA makes it a little rouch to get the actual regex that matched. Can be done with plyara eventually.
37
+ Initialize a BytesMatch object representing a match against binary data.
38
+
39
+ Args:
40
+ matched_against (bytes): The full byte sequence that was searched.
41
+ start_idx (int): Start index of the match in the byte sequence.
42
+ length (int): Length of the match in bytes.
43
+ label (str): Label for the match (e.g., regex or YARA rule name).
44
+ ordinal (int): The Nth match for this pattern.
45
+ match (Optional[re.Match]): Regex match object, if available.
46
+ highlight_style (str): Style to use for highlighting the match.
35
47
  """
36
48
  self.matched_against: bytes = matched_against
37
49
  self.start_idx: int = start_idx
@@ -58,6 +70,18 @@ class BytesMatch:
58
70
  ordinal: int,
59
71
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
60
72
  ) -> 'BytesMatch':
73
+ """
74
+ Create a BytesMatch from a regex match object.
75
+
76
+ Args:
77
+ matched_against (bytes): The bytes searched.
78
+ match (re.Match): The regex match object.
79
+ ordinal (int): The Nth match for this pattern.
80
+ highlight_style (str): Style for highlighting.
81
+
82
+ Returns:
83
+ BytesMatch: The constructed BytesMatch instance.
84
+ """
61
85
  return cls(matched_against, match.start(), len(match[0]), match.re.pattern, ordinal, match, highlight_style)
62
86
 
63
87
  @classmethod
@@ -70,7 +94,20 @@ class BytesMatch:
70
94
  ordinal: int,
71
95
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
72
96
  ) -> 'BytesMatch':
73
- """Build a BytesMatch from a yara string match. 'matched_against' is the set of bytes yara was run against."""
97
+ """
98
+ Build a BytesMatch from a YARA string match instance.
99
+
100
+ Args:
101
+ matched_against (bytes): The bytes searched.
102
+ rule_name (str): Name of the YARA rule.
103
+ yara_str_match (StringMatch): YARA string match object.
104
+ yara_str_match_instance (StringMatchInstance): Instance of the string match.
105
+ ordinal (int): The Nth match for this pattern.
106
+ highlight_style (str): Style for highlighting.
107
+
108
+ Returns:
109
+ BytesMatch: The constructed BytesMatch instance.
110
+ """
74
111
  pattern_label = yara_str_match.identifier
75
112
 
76
113
  # Don't duplicate the labeling if rule_name and yara_str are the same
@@ -94,7 +131,17 @@ class BytesMatch:
94
131
  yara_match: dict,
95
132
  highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
96
133
  ) -> Iterator['BytesMatch']:
97
- """Iterator w/a BytesMatch for each string returned as part of a YARA match result dict."""
134
+ """
135
+ Yield a BytesMatch for each string returned as part of a YARA match result dict.
136
+
137
+ Args:
138
+ matched_against (bytes): The bytes searched.
139
+ yara_match (dict): YARA match result dictionary.
140
+ highlight_style (str): Style for highlighting.
141
+
142
+ Yields:
143
+ BytesMatch: For each string match in the YARA result.
144
+ """
98
145
  i = 0 # For numbered labeling
99
146
 
100
147
  # yara-python's internals changed with 4.3.0: https://github.com/VirusTotal/yara-python/releases/tag/v4.3.0
@@ -112,14 +159,27 @@ class BytesMatch:
112
159
  )
113
160
 
114
161
  def style_at_position(self, idx) -> str:
115
- """Get the style for the byte at position idx within the matched bytes"""
162
+ """
163
+ Get the style for the byte at position idx within the matched bytes.
164
+
165
+ Args:
166
+ idx (int): Index within the surrounding bytes.
167
+
168
+ Returns:
169
+ str: The style to use for this byte (highlight or greyed out).
170
+ """
116
171
  if idx < self.highlight_start_idx or idx >= self.highlight_end_idx:
117
172
  return GREY_ADDRESS
118
173
  else:
119
174
  return self.highlight_style
120
175
 
121
176
  def location(self) -> Text:
122
- """Returns a Text obj like '(start idx: 348190, end idx: 348228)'"""
177
+ """
178
+ Get a styled Text object describing the start and end index of the match.
179
+
180
+ Returns:
181
+ Text: Rich Text object like '(start idx: 348190, end idx: 348228)'.
182
+ """
123
183
  location_txt = prefix_with_style(
124
184
  f"(start idx: ",
125
185
  style='off_white',
@@ -133,13 +193,26 @@ class BytesMatch:
133
193
  return location_txt
134
194
 
135
195
  def is_decodable(self) -> bool:
136
- """True if SUPPRESS_DECODES_TABLE is false and length of self.bytes is between MIN/MAX_DECODE_LENGTH"""
196
+ """
197
+ Determine if the matched bytes should be decoded.
198
+
199
+ Whether the bytes are decodable depends on whether SUPPRESS_DECODES_TABLE is set
200
+ and whether the match length is between MIN/MAX_DECODE_LENGTH.
201
+
202
+ Returns:
203
+ bool: True if decodable, False otherwise.
204
+ """
137
205
  return self.match_length >= YaralyzerConfig.args.min_decode_length \
138
206
  and self.match_length <= YaralyzerConfig.args.max_decode_length \
139
207
  and not YaralyzerConfig.args.suppress_decodes_table
140
208
 
141
209
  def bytes_hashes_table(self) -> Table:
142
- """Helper function to build the MD5/SHA table for self.bytes"""
210
+ """
211
+ Build a table of MD5/SHA hashes for the matched bytes.
212
+
213
+ Returns:
214
+ Table: Rich Table object with hashes.
215
+ """
143
216
  return bytes_hashes_table(
144
217
  self.bytes,
145
218
  self.location().plain,
@@ -147,7 +220,12 @@ class BytesMatch:
147
220
  )
148
221
 
149
222
  def suppression_notice(self) -> Text:
150
- """Generate a message for when there are too few/too many bytes"""
223
+ """
224
+ Generate a message for when the match is too short or too long to decode.
225
+
226
+ Returns:
227
+ Text: Rich Text object with the suppression notice.
228
+ """
151
229
  txt = self.__rich__()
152
230
 
153
231
  if self.match_length < YaralyzerConfig.args.min_decode_length:
@@ -159,7 +237,12 @@ class BytesMatch:
159
237
  return txt
160
238
 
161
239
  def to_json(self) -> dict:
162
- """Convert this BytesMatch to a JSON-serializable dict."""
240
+ """
241
+ Convert this BytesMatch to a JSON-serializable dictionary.
242
+
243
+ Returns:
244
+ dict: Dictionary representation of the match, suitable for JSON serialization.
245
+ """
163
246
  json_dict = {
164
247
  'label': self.label,
165
248
  'match_length': self.match_length,
@@ -178,7 +261,13 @@ class BytesMatch:
178
261
  return json_dict
179
262
 
180
263
  def _find_surrounding_bytes(self, num_before: Optional[int] = None, num_after: Optional[int] = None) -> None:
181
- """Find the surrounding bytes, making sure not to step off the beginning or end"""
264
+ """
265
+ Find and set the bytes surrounding the match, ensuring indices stay within bounds.
266
+
267
+ Args:
268
+ num_before (Optional[int]): Number of bytes before the match to include.
269
+ num_after (Optional[int]): Number of bytes after the match to include.
270
+ """
182
271
  num_after = num_after or num_before or YaralyzerConfig.args.surrounding_bytes
183
272
  num_before = num_before or YaralyzerConfig.args.surrounding_bytes
184
273
  self.surrounding_start_idx: int = max(self.start_idx - num_before, 0)
@@ -186,6 +275,7 @@ class BytesMatch:
186
275
  self.surrounding_bytes: bytes = self.matched_against[self.surrounding_start_idx:self.surrounding_end_idx]
187
276
 
188
277
  def __rich__(self) -> Text:
278
+ """Get a rich Text representation of the match for display."""
189
279
  headline = prefix_with_style(str(self.match_length), style='number', root_style='decode.subheading')
190
280
  headline.append(f" bytes matching ")
191
281
  headline.append(f"{self.label} ", style=ALERT_STYLE if self.highlight_style == ALERT_STYLE else 'regex')
@@ -193,4 +283,5 @@ class BytesMatch:
193
283
  return headline + self.location()
194
284
 
195
285
  def __str__(self):
286
+ """Plain text (no rich colors) representation of the match for display."""
196
287
  return self.__rich__().plain
yaralyzer/config.py CHANGED
@@ -1,3 +1,6 @@
1
+ """
2
+ Configuration management for Yaralyzer.
3
+ """
1
4
  import logging
2
5
  from argparse import ArgumentParser, Namespace
3
6
  from os import environ
@@ -15,16 +18,19 @@ MEGABYTE = 1024 * KILOBYTE
15
18
 
16
19
  def config_var_name(env_var: str) -> str:
17
20
  """
18
- Get the name of env_var and strip off 'YARALYZER_', e.g.:
19
- SURROUNDING_BYTES_ENV_VAR = 'YARALYZER_SURROUNDING_BYTES'
20
- config_var_name(SURROUNDING_BYTES_ENV_VAR) => 'SURROUNDING_BYTES'
21
+ Get the name of env_var and strip off 'YARALYZER_' prefix.
22
+
23
+ Example:
24
+ $ SURROUNDING_BYTES_ENV_VAR = 'YARALYZER_SURROUNDING_BYTES'
25
+ $ config_var_name(SURROUNDING_BYTES_ENV_VAR) => 'SURROUNDING_BYTES'
26
+
21
27
  """
22
28
  env_var = env_var.removeprefix("YARALYZER_")
23
29
  return f'{env_var=}'.partition('=')[0]
24
30
 
25
31
 
26
32
  def is_env_var_set_and_not_false(var_name):
27
- """Returns True if var_name is not empty and set to anything other than 'false' (capitalization agnostic)"""
33
+ """Return True if var_name is not empty and set to anything other than 'false' (capitalization agnostic)."""
28
34
  if var_name in environ:
29
35
  var_value = environ[var_name]
30
36
  return var_value is not None and len(var_value) > 0 and var_value.lower() != 'false'
@@ -33,11 +39,13 @@ def is_env_var_set_and_not_false(var_name):
33
39
 
34
40
 
35
41
  def is_invoked_by_pytest():
36
- """Return true if pytest is running"""
42
+ """Return true if pytest is running."""
37
43
  return is_env_var_set_and_not_false(PYTEST_FLAG)
38
44
 
39
45
 
40
46
  class YaralyzerConfig:
47
+ """Handles parsing of command line args and environment variables for Yaralyzer."""
48
+
41
49
  # Passed through to yara.set_config()
42
50
  DEFAULT_MAX_MATCH_LENGTH = 100 * KILOBYTE
43
51
  DEFAULT_YARA_STACK_SIZE = 2 * 65536
@@ -76,11 +84,13 @@ class YaralyzerConfig:
76
84
 
77
85
  @classmethod
78
86
  def set_argument_parser(cls, parser: ArgumentParser) -> None:
87
+ """Set the _argument_parser class variable that will be used to parse command line args."""
79
88
  cls._argument_parser: ArgumentParser = parser
80
89
  cls._argparse_keys: List[str] = sorted([action.dest for action in parser._actions])
81
90
 
82
91
  @classmethod
83
92
  def set_args(cls, args: Namespace) -> None:
93
+ """Set the args class variable and update args with any environment variable overrides."""
84
94
  cls.args = args
85
95
 
86
96
  for option in cls._argparse_keys:
@@ -105,9 +115,11 @@ class YaralyzerConfig:
105
115
 
106
116
  @classmethod
107
117
  def set_default_args(cls):
118
+ """Set args to their defaults as if parsed from the command line."""
108
119
  cls.set_args(cls._argument_parser.parse_args(['dummy']))
109
120
 
110
121
  @classmethod
111
122
  def get_default_arg(cls, arg: str) -> Any:
123
+ """Return the default value for arg as defined by a DEFAULT_ style class variable."""
112
124
  default_var = f"DEFAULT_{arg.upper()}"
113
125
  return vars(cls).get(default_var)
@@ -1,8 +1,4 @@
1
- """
2
- Class to handle attempting to decode a chunk of bytes into strings with various possible encodings.
3
- Leverages the chardet library to both guide what encodings are attempted as well as to rank decodings
4
- in the results.
5
- """
1
+ """BytesDecoder class for attempting to decode bytes with various encodings."""
6
2
  from collections import defaultdict
7
3
  from copy import deepcopy
8
4
  from operator import attrgetter
@@ -34,7 +30,33 @@ SCORE_SCALER = 100.0
34
30
 
35
31
 
36
32
  class BytesDecoder:
33
+ """
34
+ Class to handle attempting to decode a chunk of bytes into strings with various possible encodings.
35
+
36
+ Leverages the chardet library to both guide what encodings are attempted as well as to rank decodings
37
+ in the results.
38
+ """
39
+
37
40
  def __init__(self, bytes_match: 'BytesMatch', label: Optional[str] = None) -> None:
41
+ """
42
+ Initialize a BytesDecoder for attempting to decode a chunk of bytes using various encodings.
43
+
44
+ Args:
45
+ bytes_match (BytesMatch): The BytesMatch object containing the bytes to decode and match metadata.
46
+ label (Optional[str], optional): Optional label for this decoding attempt. Defaults to the match label.
47
+
48
+ Attributes:
49
+ bytes_match (BytesMatch): The BytesMatch instance being decoded.
50
+ bytes (bytes): The bytes (including surrounding context) to decode.
51
+ label (str): Label for this decoding attempt.
52
+ was_match_decodable (dict): Tracks successful decodes per encoding.
53
+ was_match_force_decoded (dict): Tracks forced decodes per encoding.
54
+ was_match_undecodable (dict): Tracks failed decodes per encoding.
55
+ decoded_strings (dict): Maps encoding to decoded string.
56
+ undecoded_rows (list): Stores undecoded table rows.
57
+ decodings (list): List of DecodingAttempt objects for each encoding tried.
58
+ encoding_detector (EncodingDetector): Used to detect and assess possible encodings.
59
+ """
38
60
  self.bytes_match = bytes_match
39
61
  self.bytes = bytes_match.surrounding_bytes
40
62
  self.label = label or bytes_match.label
@@ -51,7 +73,7 @@ class BytesDecoder:
51
73
  self.encoding_detector = EncodingDetector(self.bytes)
52
74
 
53
75
  def __rich_console__(self, _console: Console, options: ConsoleOptions) -> RenderResult:
54
- """Rich object generator (see Rich console docs)"""
76
+ """Rich object generator (see Rich console docs)."""
55
77
  yield NewLine(2)
56
78
  yield Align(self._decode_attempt_subheading(), CENTER)
57
79
 
@@ -70,7 +92,7 @@ class BytesDecoder:
70
92
  yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')
71
93
 
72
94
  def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
73
- """First rows are the raw / hex views of the bytes, next rows are the attempted decodings"""
95
+ """First rows are the raw / hex views of the bytes, next rows are the attempted decodings."""
74
96
  self.table = new_decoding_attempts_table(self.bytes_match)
75
97
 
76
98
  # Add the encoding rows to the table if not suppressed
@@ -115,7 +137,7 @@ class BytesDecoder:
115
137
  return Panel(headline, style='decode.subheading', expand=False)
116
138
 
117
139
  def _track_decode_stats(self) -> None:
118
- """Track stats about successful vs. forced vs. failed decode attempts"""
140
+ """Track stats about successful vs. forced vs. failed decode attempts."""
119
141
  for decoding in self.decodings:
120
142
  if decoding.failed_to_decode:
121
143
  self.was_match_undecodable[decoding.encoding] += 1
@@ -162,7 +184,7 @@ class BytesDecoder:
162
184
 
163
185
 
164
186
  def _build_encodings_metric_dict():
165
- """One key for each key in ENCODINGS_TO_ATTEMPT, values are all 0"""
187
+ """One key for each key in ENCODINGS_TO_ATTEMPT, values are all 0."""
166
188
  metrics_dict = defaultdict(lambda: 0)
167
189
 
168
190
  for encoding in ENCODINGS_TO_ATTEMPT.keys():
@@ -1,13 +1,11 @@
1
- """
2
- Class to manage attempting to decode a chunk of bytes into strings with a given encoding.
3
- """
1
+ """Class to manage attempting to decode a chunk of bytes into strings with a given encoding."""
4
2
  from sys import byteorder
5
3
  from typing import Optional
6
4
 
7
5
  from rich.markup import escape
8
6
  from rich.text import Text
9
7
 
10
- from yaralyzer.bytes_match import BytesMatch # Used to cause circular import issues
8
+ from yaralyzer.bytes_match import BytesMatch # Formerly caused circular import issues
11
9
  from yaralyzer.encoding_detection.character_encodings import (ENCODINGS_TO_ATTEMPT, SINGLE_BYTE_ENCODINGS,
12
10
  UTF_8, encoding_width, is_wide_utf)
13
11
  from yaralyzer.helpers.bytes_helper import clean_byte_string, truncate_for_encoding
@@ -17,6 +15,8 @@ from yaralyzer.util.logging import log
17
15
 
18
16
 
19
17
  class DecodingAttempt:
18
+ """Class to manage attempting to decode a chunk of bytes into strings with a given encoding."""
19
+
20
20
  def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
21
21
  # Args
22
22
  self.bytes = bytes_match.surrounding_bytes
@@ -31,7 +31,7 @@ class DecodingAttempt:
31
31
  self.decoded_string = self._decode_bytes()
32
32
 
33
33
  def is_wide_utf_encoding(self) -> bool:
34
- """Returns True if the encoding is UTF-16 or UTF-32"""
34
+ """Returns True if the encoding is UTF-16 or UTF-32."""
35
35
  return is_wide_utf(self.encoding)
36
36
 
37
37
  def _decode_bytes(self) -> Text:
@@ -58,7 +58,7 @@ class DecodingAttempt:
58
58
  return self._custom_decode()
59
59
 
60
60
  def _custom_decode(self) -> Text:
61
- """Returns a Text obj representing an attempt to force a UTF-8 encoding upon an array of bytes"""
61
+ """Returns a Text obj representing an attempt to force a UTF-8 encoding upon an array of bytes."""
62
62
  log.info(f"Custom decoding {self.bytes_match} with {self.encoding}...")
63
63
  unprintable_char_map = ENCODINGS_TO_ATTEMPT.get(self.encoding)
64
64
  output = Text('', style='bytes.decoded')
@@ -146,7 +146,7 @@ class DecodingAttempt:
146
146
  return self._failed_to_decode_msg_txt(last_exception)
147
147
 
148
148
  def _to_rich_text(self, _string: str, bytes_offset: int = 0) -> Text:
149
- """Convert a decoded string to highlighted Text representation"""
149
+ """Convert a decoded string to highlighted Text representation."""
150
150
  # Adjust where we start the highlighting given the multibyte nature of the encodings
151
151
  log.debug(f"Stepping through {self.encoding} encoded string...")
152
152
  txt = Text('', style=self.bytes_match.style_at_position(0))
@@ -1,5 +1,6 @@
1
1
  """
2
- Constants related to character encodings
2
+ Constants related to character encodings.
3
+
3
4
  * https://www.mit.edu/people/kenta/two/iso8859.html
4
5
  * https://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec
5
6
  """
@@ -1,5 +1,5 @@
1
1
  """
2
- Class to smooth some of the rough edges around the dicts returned by chardet.detect_all()
2
+ Helpers for working with the chardet library.
3
3
  """
4
4
  from typing import Any, Optional
5
5
 
@@ -14,7 +14,13 @@ LANGUAGE = 'language'
14
14
 
15
15
 
16
16
  class EncodingAssessment:
17
+ """Class to smooth some of the rough edges around the dicts returned by chardet.detect_all()."""
18
+
17
19
  def __init__(self, assessment: dict) -> None:
20
+ """
21
+ Args:
22
+ assessment (dict): The dict returned by chardet.detect_all().
23
+ """
18
24
  self.assessment = assessment
19
25
  self.encoding = assessment[ENCODING].lower()
20
26
 
@@ -27,7 +33,7 @@ class EncodingAssessment:
27
33
  self.set_encoding_label(self.language.title() if self.language else None)
28
34
 
29
35
  @classmethod
30
- def dummy_encoding_assessment(cls, encoding) -> 'EncodingAssessment':
36
+ def dummy_encoding_assessment(cls, encoding: str) -> 'EncodingAssessment':
31
37
  """Generate an empty EncodingAssessment to use as a dummy when chardet gives us nothing."""
32
38
  assessment = cls({ENCODING: encoding, CONFIDENCE: 0.0})
33
39
  assessment.confidence_text = Text('none', 'no_attempt')
@@ -1,7 +1,4 @@
1
- """
2
- Manager class to ease dealing with the chardet encoding detection library 'chardet'.
3
- Each instance of this class manages a chardet.detect_all() scan on a single set of bytes.
4
- """
1
+ """EncodingDetector class for managing chardet encoding detection."""
5
2
  from operator import attrgetter
6
3
  from typing import List
7
4
 
@@ -18,10 +15,15 @@ CONFIDENCE_SCORE_RANGE = range(0, 101)
18
15
 
19
16
 
20
17
  class EncodingDetector:
21
- # 10 as in 10%, 0.02, etc. Encodings w/confidences below this will not be displayed in the decoded table
18
+ """
19
+ Manager class to ease dealing with the chardet encoding detection library 'chardet'.
20
+ Each instance of this class manages a chardet.detect_all() scan on a single set of bytes.
21
+ """
22
+
23
+ # Default threshold: encodings w/confidences below this will not be displayed in the decoded table
22
24
  force_display_threshold = 20.0
23
25
 
24
- # At what chardet.detect() confidence % should we force a decode with an obscure encoding?
26
+ # Default chardet.detect() confidence % at which we should force a decode with an obscure encoding.
25
27
  force_decode_threshold = 50.0
26
28
 
27
29
  def __init__(self, _bytes: bytes) -> None:
@@ -53,21 +55,23 @@ class EncodingDetector:
53
55
  self.force_display_assessments = self.assessments_above_confidence(type(self).force_display_threshold)
54
56
 
55
57
  def get_encoding_assessment(self, encoding: str) -> EncodingAssessment:
56
- """If chardet produced one, return it, otherwise return a dummy node with confidence of 0"""
58
+ """If chardet produced one, return it, otherwise return a dummy node with confidence of 0."""
57
59
  assessment = next((r for r in self.unique_assessments if r.encoding == encoding), None)
58
60
  return assessment or EncodingAssessment.dummy_encoding_assessment(encoding)
59
61
 
60
62
  def has_enough_bytes(self) -> bool:
63
+ """Return true if we have enough bytes to run chardet.detect()."""
61
64
  return self.bytes_len >= YaralyzerConfig.args.min_chardet_bytes
62
65
 
63
66
  def assessments_above_confidence(self, cutoff: float) -> List[EncodingAssessment]:
67
+ """Return the assessments above the given confidence cutoff."""
64
68
  return [a for a in self.unique_assessments if a.confidence >= cutoff]
65
69
 
66
70
  def __rich__(self) -> Padding:
67
71
  return Padding(self.table, (0, 0, 0, 0))
68
72
 
69
73
  def _uniquify_results_and_build_table(self) -> None:
70
- """Keep the highest result per encoding, ignoring the language chardet has indicated"""
74
+ """Keep the highest result per encoding, ignoring the language chardet has indicated."""
71
75
  already_seen_encodings = {}
72
76
 
73
77
  for i, result in enumerate(self.assessments):
@@ -87,6 +91,7 @@ class EncodingDetector:
87
91
  self.unique_assessments.sort(key=attrgetter('confidence'), reverse=True)
88
92
 
89
93
  def _set_empty_results(self) -> None:
94
+ """Set empty results for when chardet can't help us."""
90
95
  self.assessments = []
91
96
  self.unique_assessments = []
92
97
  self.raw_chardet_assessments = []
@@ -95,7 +100,7 @@ class EncodingDetector:
95
100
 
96
101
 
97
102
  def _empty_chardet_results_table():
98
- """Returns a fresh table"""
103
+ """Returns a fresh table."""
99
104
  table = Table(
100
105
  'Rank', 'Encoding', 'Confidence',
101
106
  title='chardet.detect results',