yaralyzer 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yaralyzer/__init__.py ADDED
@@ -0,0 +1,76 @@
1
+ import code
2
+ import yara as python_yara
3
+ from os import environ, getcwd, path
4
+
5
+ from dotenv import load_dotenv
6
+ from rich.text import Text
7
+
8
# Load a '.yaralyzer' dotenv file before the local yaralyzer modules below are imported.
# The current working directory is checked first, then the home directory, and only the
# first file found is loaded. Skipped when INVOKED_BY_PYTEST is set to any non-empty value.
if not environ.get('INVOKED_BY_PYTEST', False):
    for dotenv_file in (path.join(folder, '.yaralyzer') for folder in (getcwd(), path.expanduser('~'))):
        if not path.exists(dotenv_file):
            continue

        load_dotenv(dotenv_path=dotenv_file)
        break
14
+
15
+ from yaralyzer.helpers.rich_text_helper import print_fatal_error_and_exit
16
+ from yaralyzer.output.file_export import export_json, invoke_rich_export
17
+ from yaralyzer.output.rich_console import console
18
+ from yaralyzer.util.argument_parser import get_export_basepath, parse_arguments
19
+ from yaralyzer.yara.error import yara_error_msg
20
+ from yaralyzer.yara.yara_rule_builder import HEX, REGEX
21
+ from yaralyzer.yaralyzer import Yaralyzer
22
+
23
# Message (plain string and styled rich Text) pointing PDF users at The Pdfalyzer project.
PDFALYZER_MSG = "\nIf you are analyzing a PDF you may be interested in The Pdfalyzer, birthplace of The Yaralyzer:"
PDFALYZER_MSG_TXT = (
    Text(PDFALYZER_MSG, style='bright_white bold')
    .append('\n -> ', style='bright_white')
    .append('https://github.com/michelcrypt4d4mus/pdfalyzer\n', style='bright_cyan underline')
)
27
+
28
+
29
def _build_yaralyzer(args):
    """Construct the `Yaralyzer` for whichever rule source (files, dirs, or patterns) was supplied."""
    if args.yara_rules_files:
        return Yaralyzer.for_rules_files(args.yara_rules_files, args.file_to_scan_path)

    if args.yara_rules_dirs:
        return Yaralyzer.for_rules_dirs(args.yara_rules_dirs, args.file_to_scan_path)

    if not (args.regex_patterns or args.hex_patterns):
        raise RuntimeError("No pattern or YARA file to scan against.")

    # Regex patterns take precedence as the pattern list; the pattern type is HEX whenever hex patterns exist.
    return Yaralyzer.for_patterns(
        args.regex_patterns or args.hex_patterns,
        HEX if args.hex_patterns else REGEX,
        args.file_to_scan_path,
        pattern_label=args.patterns_label,
        regex_modifier=args.regex_modifier,
    )


def _write_requested_exports(args, yaralyzer, output_basepath):
    """Write each export format (txt, html, svg, json) whose command line flag is set."""
    rich_exports = (
        ('export_txt', console.save_text),
        ('export_html', console.save_html),
        ('export_svg', console.save_svg),
    )

    for flag_name, save_method in rich_exports:
        if getattr(args, flag_name):
            invoke_rich_export(save_method, output_basepath)

    if args.export_json:
        export_json(yaralyzer, output_basepath)


def yaralyze():
    """
    Command line entry point: scan a file, print the results, and optionally export them.

    Args are parsed from the command line and environment variables. See `yaralyze --help` for details.

    Raises:
        RuntimeError: If no YARA rules files, rules dirs, or regex/hex patterns were provided.
    """
    args = parse_arguments()
    output_basepath = None
    yaralyzer = _build_yaralyzer(args)

    # Console recording is only switched on when an output dir is configured.
    if args.output_dir:
        output_basepath = get_export_basepath(args, yaralyzer)
        console.print(f"Will render yaralyzer data to '{output_basepath}'...", style='yellow')
        console.record = True

    try:
        yaralyzer.yaralyze()
    except python_yara.Error as yara_error:
        print_fatal_error_and_exit(yara_error_msg(yara_error))

    _write_requested_exports(args, yaralyzer, output_basepath)

    if args.file_to_scan_path.endswith('.pdf'):
        console.print(PDFALYZER_MSG_TXT)

    # Drop into interactive shell if requested; the shell sees this function's local variables.
    if args.interact:
        code.interact(local=locals())
@@ -0,0 +1,276 @@
1
+ """
2
+ `BytesMatch` class for tracking regex and YARA matches against binary data.
3
+ """
4
+ import re
5
+ from dataclasses import dataclass, field
6
+ from typing import Iterator, Optional
7
+
8
+ from rich.table import Table
9
+ from rich.text import Text
10
+ from yara import StringMatch, StringMatchInstance
11
+
12
+ from yaralyzer.config import YaralyzerConfig
13
+ from yaralyzer.helpers.rich_text_helper import prefix_with_style
14
+ from yaralyzer.output.file_hashes_table import bytes_hashes_table
15
+ from yaralyzer.output.rich_console import ALERT_STYLE, GREY_ADDRESS
16
+
17
+
18
@dataclass
class BytesMatch:
    """
    Simple class to keep track of regex matches against binary data.

    Basically a Regex `re.match` object with some (not many) extra bells and whistles, most notably
    the `surrounding_bytes` property.

    Args:
        matched_against (bytes): The full byte sequence that was searched.
        start_idx (int): Start index of the match in the byte sequence.
        match_length (int): Length of the match in bytes.
        label (str): Label for the match (e.g., regex or YARA rule name).
        ordinal (int): This was the Nth match for this pattern (used for labeling only).
        match (Optional[re.Match]): Regex `match` object, if available.
        highlight_style (str): Style to use for highlighting the match.

    Attributes:
        end_idx (int): End index (exclusive) of the match in the byte sequence.
        bytes (bytes): The bytes that matched. Assigned in `__post_init__`; not a declared dataclass field.
        match_groups (Optional[tuple]): `match.groups()` when a regex `match` was supplied, otherwise `None`.
        surrounding_start_idx (int): Index in `matched_against` where the surrounding context starts.
        surrounding_end_idx (int): Index in `matched_against` where the surrounding context ends (exclusive).
        surrounding_bytes (bytes): The matched bytes plus the configured number of bytes before and after.
        highlight_start_idx (int): Offset of the first matched byte within `surrounding_bytes`.
        highlight_end_idx (int): Offset just past the last matched byte within `surrounding_bytes`.
    """
    matched_against: bytes
    start_idx: int
    match_length: int
    label: str
    ordinal: int
    match: re.Match | None = None  # It's rough to get the regex from yara :(
    highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    end_idx: int = field(init=False)
    # The name must match the attribute assigned in __post_init__: the dataclass-generated
    # __repr__ and __eq__ read every declared field, so a field that is never set raises AttributeError.
    match_groups: Optional[tuple] = field(init=False)
    highlight_start_idx: int = field(init=False)
    highlight_end_idx: int = field(init=False)
    surrounding_start_idx: int = field(init=False)
    surrounding_end_idx: int = field(init=False)
    surrounding_bytes: bytes = field(init=False)

    def __post_init__(self):
        """Compute the derived indexes and byte slices from the constructor arguments."""
        self.end_idx: int = self.start_idx + self.match_length
        self.bytes = self.matched_against[self.start_idx:self.end_idx]  # TODO: Maybe should be called "matched_bytes"
        self.match_groups: Optional[tuple] = self.match.groups() if self.match else None
        num_after = YaralyzerConfig.args.surrounding_bytes
        num_before = YaralyzerConfig.args.surrounding_bytes
        # Clamp the surrounding window in case this match is very early or late in the stream
        self.surrounding_start_idx: int = max(self.start_idx - num_before, 0)
        self.surrounding_end_idx: int = min(self.end_idx + num_after, len(self.matched_against))
        self.surrounding_bytes: bytes = self.matched_against[self.surrounding_start_idx:self.surrounding_end_idx]
        # Highlight indexes are relative to surrounding_bytes, not to matched_against
        self.highlight_start_idx = self.start_idx - self.surrounding_start_idx
        self.highlight_end_idx = self.highlight_start_idx + self.match_length

    @classmethod
    def from_regex_match(
        cls,
        matched_against: bytes,
        match: re.Match,
        ordinal: int,
        highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    ) -> 'BytesMatch':
        """
        Alternate constructor to build a `BytesMatch` from a regex match object.

        The label is the pattern of the regex that produced `match`.

        Args:
            matched_against (bytes): The bytes searched.
            match (re.Match): The regex `match` object.
            ordinal (int): This was the Nth match for this pattern (used for labeling only).
            highlight_style (str): Style for highlighting.

        Returns:
            BytesMatch: The constructed `BytesMatch` instance.
        """
        return cls(matched_against, match.start(), len(match[0]), match.re.pattern, ordinal, match, highlight_style)

    @classmethod
    def from_yara_str(
        cls,
        matched_against: bytes,
        rule_name: str,
        yara_str_match: StringMatch,
        yara_str_match_instance: StringMatchInstance,
        ordinal: int,
        highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    ) -> 'BytesMatch':
        """
        Alternate constructor to build a `BytesMatch` from a YARA string match instance.

        Args:
            matched_against (bytes): The bytes searched.
            rule_name (str): Name of the YARA rule.
            yara_str_match (StringMatch): YARA string match object.
            yara_str_match_instance (StringMatchInstance): Instance of the string match.
            ordinal (int): The Nth match for this pattern.
            highlight_style (str): Style for highlighting.

        Returns:
            BytesMatch: The constructed BytesMatch instance.
        """
        pattern_label = yara_str_match.identifier

        # Don't duplicate the labeling if rule_name and yara_str are the same
        if pattern_label == '$' + rule_name:
            label = pattern_label
        else:
            label = rule_name + ': ' + pattern_label

        return cls(
            matched_against=matched_against,
            start_idx=yara_str_match_instance.offset,
            match_length=yara_str_match_instance.matched_length,
            label=label,
            ordinal=ordinal,
            highlight_style=highlight_style
        )

    @classmethod
    def from_yara_match(
        cls,
        matched_against: bytes,
        yara_match: dict,
        highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    ) -> Iterator['BytesMatch']:
        """
        Yield a `BytesMatch` for each string returned as part of a YARA match result dict.

        Ordinals are numbered from 1 and run across all strings in the match, not per string.

        Args:
            matched_against (bytes): The bytes searched.
            yara_match (dict): YARA match result dictionary; its 'rule' and 'strings' keys are read.
            highlight_style (str): Style for highlighting.

        Yields:
            BytesMatch: For each string match in the YARA result.
        """
        i = 0  # For numbered labeling

        # yara-python's internals changed with 4.3.0: https://github.com/VirusTotal/yara-python/releases/tag/v4.3.0
        for yara_str_match in yara_match['strings']:
            for yara_str_match_instance in yara_str_match.instances:
                i += 1

                yield cls.from_yara_str(
                    matched_against,
                    yara_match['rule'],
                    yara_str_match,
                    yara_str_match_instance,
                    i,
                    highlight_style
                )

    def style_at_position(self, idx) -> str:
        """
        Get the style for the byte at position `idx` within the surrounding bytes.

        Args:
            idx (int): Index within the surrounding bytes.

        Returns:
            str: `highlight_style` if `idx` falls inside the match, otherwise the greyed out style.
        """
        if idx < self.highlight_start_idx or idx >= self.highlight_end_idx:
            return GREY_ADDRESS
        else:
            return self.highlight_style

    def location(self) -> Text:
        """
        Get a styled `Text` object describing the start and end index of the match.

        Returns:
            Text: Rich Text object like '(start idx: 348190, end idx: 348228)'.
        """
        location_txt = prefix_with_style(
            "(start idx: ",
            style='off_white',
            root_style='decode.subheading'
        )

        location_txt.append(str(self.start_idx), style='number')
        location_txt.append(', end idx: ', style='off_white')
        location_txt.append(str(self.end_idx), style='number')
        location_txt.append(')', style='off_white')
        return location_txt

    def is_decodable(self) -> bool:
        """
        Determine if the matched bytes should be decoded.

        The match is decodable when its length is between the configured `min_decode_length` and
        `max_decode_length` (inclusive) and the `suppress_decodes_table` option is not set.

        Returns:
            bool: `True` if decodable, `False` otherwise.
        """
        config = YaralyzerConfig.args
        length_in_range = config.min_decode_length <= self.match_length <= config.max_decode_length
        return length_in_range and not config.suppress_decodes_table

    def bytes_hashes_table(self) -> Table:
        """
        Build a table of MD5/SHA hashes for the matched bytes.

        Returns:
            Table: Rich `Table` object with hashes.
        """
        return bytes_hashes_table(
            self.bytes,
            self.location().plain,
            'center'
        )

    def suppression_notice(self) -> Text:
        """
        Generate a message for when the match is too short or too long to decode.

        Returns:
            Text: Rich `Text` object with the suppression notice.
        """
        txt = self.__rich__()

        if self.match_length < YaralyzerConfig.args.min_decode_length:
            txt = Text('Too little to actually attempt decode at ', style='grey') + txt
        else:
            txt.append(" too long to decode ")
            txt.append(f"(--max-decode-length is {YaralyzerConfig.args.max_decode_length} bytes)", style='grey')

        return txt

    def to_json(self) -> dict:
        """
        Convert this `BytesMatch` to a dictionary for JSON export.

        Byte sequences are emitted as hex strings. A 'pattern' key holding the regex pattern is
        added only when a regex `match` object is present.

        Returns:
            dict: Dictionary representation of the match.
        """
        json_dict = {
            'label': self.label,
            'match_length': self.match_length,
            'matched_bytes': self.bytes.hex(),
            'ordinal': self.ordinal,
            'start_idx': self.start_idx,
            'end_idx': self.end_idx,
            'surrounding_bytes': self.surrounding_bytes.hex(),
            'surrounding_start_idx': self.surrounding_start_idx,
            'surrounding_end_idx': self.surrounding_end_idx,
        }

        if self.match:
            json_dict['pattern'] = self.match.re.pattern

        return json_dict

    def __rich__(self) -> Text:
        """Get a rich `Text` representation of the match for display."""
        headline = prefix_with_style(str(self.match_length), style='number', root_style='decode.subheading')
        headline.append(" bytes matching ")
        headline.append(f"{self.label} ", style=ALERT_STYLE if self.highlight_style == ALERT_STYLE else 'regex')
        headline.append('at ')
        return headline + self.location()

    def __str__(self):
        """Plain text (no rich colors) representation of the match for display."""
        return self.__rich__().plain
yaralyzer/config.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Configuration management for Yaralyzer.
3
+ """
4
+ import logging
5
+ from argparse import ArgumentParser, Namespace
6
+ from os import environ
7
+ from typing import Any, List
8
+
9
+ from rich.console import Console
10
+
11
+ YARALYZE = 'yaralyze'
12
+ YARALYZER = f"{YARALYZE}r".upper()
13
+ PYTEST_FLAG = 'INVOKED_BY_PYTEST'
14
+
15
+ KILOBYTE = 1024
16
+ MEGABYTE = 1024 * KILOBYTE
17
+
18
+
19
def config_var_name(env_var: str) -> str:
    """
    Strip the `YARALYZER_` prefix from the name of an environment variable.

    Example:
        ```
        SURROUNDING_BYTES_ENV_VAR = 'YARALYZER_SURROUNDING_BYTES'
        config_var_name(SURROUNDING_BYTES_ENV_VAR) => 'SURROUNDING_BYTES'
        ```

    Args:
        env_var (str): Environment variable name, e.g. 'YARALYZER_SURROUNDING_BYTES'.

    Returns:
        str: `env_var` without its leading 'YARALYZER_'; returned unchanged if the prefix is absent.
    """
    # Return the stripped value itself. Formatting with f'{env_var=}' and splitting on '='
    # yields the literal text 'env_var' (the variable's own name) rather than its contents.
    return env_var.removeprefix("YARALYZER_")
31
+
32
+
33
def is_env_var_set_and_not_false(var_name: str) -> bool:
    """
    Check whether an environment variable is switched on.

    Args:
        var_name (str): Name of the environment variable to look up.

    Returns:
        bool: `True` if the variable exists, is non-empty, and is not "false" (compared case-insensitively).
    """
    value = environ.get(var_name)

    # Covers both "not set" (None) and "set to the empty string"
    if not value:
        return False

    return value.lower() != 'false'
40
+
41
+
42
def is_invoked_by_pytest() -> bool:
    """Return `True` when the env var named by `PYTEST_FLAG` is set to a non-empty value other than "false"."""
    return is_env_var_set_and_not_false(var_name=PYTEST_FLAG)
45
+
46
+
47
class YaralyzerConfig:
    """
    Handles parsing of command line args and environment variables for Yaralyzer.

    All state lives on the class itself: `set_argument_parser()` stores the parser, and `set_args()`
    stores the parsed `Namespace` as `YaralyzerConfig.args` after applying environment variable
    overrides. Environment variables are named `YARALYZER_<OPTION>` (option name upper-cased).
    """

    # Passed through to yara.set_config()
    DEFAULT_MAX_MATCH_LENGTH = 100 * KILOBYTE
    DEFAULT_YARA_STACK_SIZE = 2 * 65536

    # Skip decoding binary matches under/over these lengths
    DEFAULT_MIN_DECODE_LENGTH = 1
    DEFAULT_MAX_DECODE_LENGTH = 256

    # chardet.detect() related
    DEFAULT_MIN_CHARDET_TABLE_CONFIDENCE = 2
    DEFAULT_MIN_CHARDET_BYTES = 9

    # Number of bytes to show before/after byte previews and decodes. Configured by command line or env var
    DEFAULT_SURROUNDING_BYTES = 64

    LOG_DIR_ENV_VAR = 'YARALYZER_LOG_DIR'
    LOG_DIR = environ.get(LOG_DIR_ENV_VAR)
    LOG_LEVEL_ENV_VAR = f"{YARALYZER}_LOG_LEVEL"
    LOG_LEVEL = logging.getLevelName(environ.get(LOG_LEVEL_ENV_VAR, 'WARN'))

    # Runs once, when the class body is executed at import time
    if LOG_DIR and not is_invoked_by_pytest():
        Console(color_system='256').print(f"Writing logs to '{LOG_DIR}' instead of stderr/stdout...", style='dim')

    HIGHLIGHT_STYLE = 'orange1'

    # Options that are never overridden from environment variables
    ONLY_CLI_ARGS = [
        'debug',
        'help',
        'hex_patterns',
        'interact',
        'patterns_label',
        'regex_patterns',
        'regex_modifier',
        'version'
    ]

    @classmethod
    def set_argument_parser(cls, parser: ArgumentParser) -> None:
        """
        Store the parser used to parse command line args, along with the sorted names of its options.

        Args:
            parser (ArgumentParser): The fully configured argument parser.
        """
        cls._argument_parser: ArgumentParser = parser
        cls._argparse_keys: List[str] = sorted([action.dest for action in parser._actions])

    @classmethod
    def set_args(cls, args: Namespace) -> None:
        """
        Set the `args` class attribute and update `args` in place with environment variable overrides.

        Options whose name starts with 'export' or that are listed in `ONLY_CLI_ARGS` are left untouched.
        `set_argument_parser()` must have been called first.

        Args:
            args (Namespace): Parsed command line arguments; mutated in place.

        Raises:
            ValueError: If the env var for a numeric option cannot be converted to that option's type.
        """
        cls.args = args

        for option in cls._argparse_keys:
            if option.startswith('export') or option in cls.ONLY_CLI_ARGS:
                continue

            arg_value = vars(args)[option]
            env_var = f"{YARALYZER}_{option.upper()}"
            env_value = environ.get(env_var)
            default_value = cls.get_default_arg(option)

            # TODO: as is you can't override env vars with CLI args
            if isinstance(arg_value, bool):
                # Boolean flags are on if set on the command line OR enabled via env var
                setattr(args, option, arg_value or is_env_var_set_and_not_false(env_var))
            elif isinstance(arg_value, (int, float)):
                # Only apply the env var while the CLI value is still the default, so an
                # explicit command line value is not clobbered.
                if arg_value == default_value and env_value is not None:
                    # Convert with the option's own numeric type so float options work and
                    # an explicit 0 from the environment is honored instead of discarded.
                    setattr(args, option, type(arg_value)(env_value))
            else:
                setattr(args, option, arg_value or env_value)

    @classmethod
    def set_default_args(cls):
        """Set `cls.args` as if `['dummy']` had been parsed from the command line (i.e. option defaults)."""
        cls.set_args(cls._argument_parser.parse_args(['dummy']))

    @classmethod
    def get_default_arg(cls, arg: str) -> Any:
        """
        Return the default value for `arg` as defined by a `DEFAULT_` style class variable.

        Args:
            arg (str): Option name, e.g. 'surrounding_bytes'.

        Returns:
            Any: Value of `DEFAULT_<ARG>` defined directly on this class, or `None` if there is none.
        """
        default_var = f"DEFAULT_{arg.upper()}"
        return vars(cls).get(default_var)
@@ -0,0 +1,207 @@
1
+ """
2
+ `BytesDecoder` class for attempting to decode bytes with various encodings.
3
+ """
4
+ from collections import defaultdict
5
+ from copy import deepcopy
6
+ from operator import attrgetter
7
+ from typing import List, Optional
8
+
9
+ from rich.align import Align
10
+ from rich.console import Console, ConsoleOptions, NewLine, RenderResult
11
+ from rich.panel import Panel
12
+ from rich.table import Table
13
+ from rich.text import Text
14
+
15
+ from yaralyzer.bytes_match import BytesMatch # Used to cause circular import issues
16
+ from yaralyzer.config import YaralyzerConfig
17
+ from yaralyzer.decoding.decoding_attempt import DecodingAttempt
18
+ from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT
19
+ from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
20
+ from yaralyzer.encoding_detection.encoding_detector import EncodingDetector
21
+ from yaralyzer.helpers.dict_helper import get_dict_key_by_value
22
+ from yaralyzer.helpers.rich_text_helper import CENTER, DECODING_ERRORS_MSG, NO_DECODING_ERRORS_MSG
23
+ from yaralyzer.output.decoding_attempts_table import new_decoding_attempts_table
24
+ from yaralyzer.output.decoding_table_row import DecodingTableRow
25
+ from yaralyzer.util.logging import log
26
+
27
+ # Two-element list of messages shown in the table, indexed with int(bool): False -> index 0, True -> index 1
28
+ WAS_DECODABLE_YES_NO = [NO_DECODING_ERRORS_MSG, DECODING_ERRORS_MSG]
29
+
30
+ # Multiply chardet scores by 100 (again) to make sorting the table easy
31
+ SCORE_SCALER = 100.0
32
+
33
+
34
class BytesDecoder:
    """
    Tries to decode a chunk of bytes with many encodings, then ranks and renders the outcomes.

    Uses `EncodingDetector` assessments plus the encodings in `ENCODINGS_TO_ATTEMPT` to decide which
    decodings to attempt, keeps counters of how each attempt went, and renders everything as rich
    console output (subheading, encoding detection results, decodings table, hashes table).

    Attributes:
        bytes_match (BytesMatch): The `BytesMatch` instance being decoded.
        bytes (bytes): The bytes (including surrounding context) to decode.
        label (str): Label for this decoding attempt.
        was_match_decodable (dict): Count of successful decodes per encoding.
        was_match_force_decoded (dict): Count of forced decodes per encoding.
        was_match_undecodable (dict): Count of failed decodes per encoding.
        decoded_strings (dict): Maps encoding label to the plain decoded string.
        undecoded_rows (list): Stores undecoded table rows.
        decodings (list): `DecodingAttempt` objects, one per encoding tried.
        encoding_detector (EncodingDetector): Used to detect and assess possible encodings.
    """

    def __init__(self, bytes_match: 'BytesMatch', label: Optional[str] = None) -> None:
        """
        Set up the decoder state for one match.

        Args:
            bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
            label (Optional[str], optional): Optional label for this decoding attempt. Defaults to the match label.
        """
        self.bytes_match = bytes_match
        self.bytes = bytes_match.surrounding_bytes
        self.label = label or bytes_match.label

        # Per-encoding counters, all starting at zero
        self.was_match_decodable = _build_encodings_metric_dict()
        self.was_match_force_decoded = _build_encodings_metric_dict()
        self.was_match_undecodable = _build_encodings_metric_dict()

        # Results accumulated while building the decodings table
        self.decoded_strings = {}  # dict[encoding: decoded string]
        self.undecoded_rows = []
        self.decodings = []

        # Encoding detection runs over the match together with its surrounding bytes
        self.encoding_detector = EncodingDetector(self.bytes)

    def __rich_console__(self, _console: Console, options: ConsoleOptions) -> RenderResult:
        """Rich object generator (see Rich console docs)."""
        yield NewLine(2)
        yield Align(self._decode_attempt_subheading(), CENTER)

        if not YaralyzerConfig.args.suppress_chardet:
            yield NewLine()
            yield Align(self.encoding_detector, CENTER)
            yield NewLine()

        # Decodable matches get the full table; in standalone mode undecodable matches
        # still get the table but without any decoding attempts.
        can_decode = self.bytes_match.is_decodable()

        if can_decode or YaralyzerConfig.args.standalone_mode:
            yield self._build_decodings_table(not can_decode)

        yield NewLine()
        yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')

    def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
        """
        Build the table for this match: a fresh decoding attempts table plus one row per decoding.

        Args:
            suppress_decodes (bool, optional): If `True` don't add decoding attempts to the table. Defaults to `False`.

        Returns:
            Table: The table, which is also stored as `self.table`.
        """
        self.table = new_decoding_attempts_table(self.bytes_match)

        # No decoding rows at all when suppressed by config or by the caller
        if YaralyzerConfig.args.suppress_decoding_attempts or suppress_decodes:
            return self.table

        self.decodings = [DecodingAttempt(self.bytes_match, encoding) for encoding in ENCODINGS_TO_ATTEMPT]

        # Attempt decodings we don't usually attempt if chardet is insistent enough
        insisted_on = self._undecoded_assessments(self.encoding_detector.force_decode_assessments)
        self.decodings.extend([DecodingAttempt(self.bytes_match, a.encoding) for a in insisted_on])

        # If we still haven't decoded chardet's top choice, decode it
        awaiting_display = self._forced_displays()

        if awaiting_display and not self._was_decoded(awaiting_display[0].encoding):
            chardet_top_encoding = awaiting_display[0].encoding
            log.info(f"Decoding {chardet_top_encoding} because it's chardet top choice...")
            self.decodings.append(DecodingAttempt(self.bytes_match, chardet_top_encoding))

        # One row per decoding attempt...
        rows = [self._row_from_decoding_attempt(attempt) for attempt in self.decodings]

        # ...plus one row per assessment that still has no decode attempt (recomputed, as
        # self.decodings may have grown since the lookup above)
        rows.extend([
            DecodingTableRow.from_undecoded_assessment(a, a.confidence * SCORE_SCALER)
            for a in self._forced_displays()
        ])

        self._track_decode_stats()
        ranked_rows = sorted(rows, key=attrgetter('sort_score', 'encoding_label_plain'), reverse=True)

        for ranked_row in ranked_rows:
            self.table.add_row(*ranked_row.to_row_list())

        return self.table

    # TODO: rename this to something that makes more sense, maybe assessments_over_display_threshold()?
    def _forced_displays(self) -> List[EncodingAssessment]:
        """Return the detector's `force_display_assessments` that have not been decoded yet."""
        return self._undecoded_assessments(self.encoding_detector.force_display_assessments)

    def _undecoded_assessments(self, assessments: List[EncodingAssessment]) -> List[EncodingAssessment]:
        """Return only those `assessments` whose encoding has no decoding attempt yet."""
        remaining = []

        for assessment in assessments:
            if not self._was_decoded(assessment.encoding):
                remaining.append(assessment)

        return remaining

    def _was_decoded(self, encoding: str) -> bool:
        """Check whether a decoding attempt for `encoding` already exists in `self.decodings`."""
        for attempt in self.decodings:
            if attempt.encoding == encoding:
                return True

        return False

    def _decode_attempt_subheading(self) -> Panel:
        """Generate a rich.Panel for displaying decode attempts."""
        found_txt = Text("Found ", style='decode.subheading')
        return Panel(found_txt + self.bytes_match.__rich__(), style='decode.subheading', expand=False)

    def _track_decode_stats(self) -> None:
        """Update the per-encoding counters of successful, forced, and failed decode attempts."""
        for attempt in self.decodings:
            if attempt.failed_to_decode:
                self.was_match_undecodable[attempt.encoding] += 1
            else:
                self.was_match_decodable[attempt.encoding] += 1

                # Forced decodes count as decodable AND as force decoded
                if attempt.was_force_decoded:
                    self.was_match_force_decoded[attempt.encoding] += 1

    def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
        """
        Build the table row for one `DecodingAttempt`.

        Args:
            decoding (DecodingAttempt): The attempt to turn into a row.

        Returns:
            DecodingTableRow: Row holding the assessment, status message, display text, and sort score.
        """
        assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)

        # If the decoding can have a start offset add an appropriate extension to the encoding label
        if decoding.start_offset_label:
            if not assessment.language:
                # Work on a copy so the detector's own assessment keeps its label
                assessment = deepcopy(assessment)
                assessment.set_encoding_label(decoding.start_offset_label)
            else:
                log.warning(f"{decoding.encoding} offset {decoding.start_offset} AND language '{assessment.language}'")

        plain_decoded_string = decoding.decoded_string.plain
        sort_score = assessment.confidence * SCORE_SCALER

        # An output identical to an earlier decoding is shown as "same output as X", where X is
        # the earlier encoding; otherwise the output is recorded and shown as is.
        if plain_decoded_string not in self.decoded_strings.values():
            self.decoded_strings[decoding.encoding_label] = plain_decoded_string
            display_text = decoding.decoded_string
        else:
            encoding_with_same_output = get_dict_key_by_value(self.decoded_strings, plain_decoded_string)
            display_text = Text('same output as ', style='color(66) dim italic')
            display_text.append(encoding_with_same_output, style=ENCODING)
            display_text.append('...', style='white')

        # Set failures negative, shave off a little for forced decodes
        if decoding.failed_to_decode:
            sort_score = (sort_score * -1) - 100
        elif decoding.was_force_decoded:
            sort_score -= 10

        status_msg = WAS_DECODABLE_YES_NO[int(decoding.was_force_decoded)]
        return DecodingTableRow.from_decoded_assessment(assessment, status_msg, display_text, sort_score)
198
+
199
+
200
def _build_encodings_metric_dict():
    """Return a zero-defaulting `defaultdict` pre-seeded with a 0 entry for each key in `ENCODINGS_TO_ATTEMPT`."""
    zeroed_metrics = defaultdict(lambda: 0)
    zeroed_metrics.update(dict.fromkeys(ENCODINGS_TO_ATTEMPT.keys(), 0))
    return zeroed_metrics