yaralyzer 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .yaralyzer.example +65 -0
- CHANGELOG.md +128 -0
- LICENSE +674 -0
- yaralyzer/__init__.py +76 -0
- yaralyzer/bytes_match.py +276 -0
- yaralyzer/config.py +126 -0
- yaralyzer/decoding/bytes_decoder.py +207 -0
- yaralyzer/decoding/decoding_attempt.py +222 -0
- yaralyzer/encoding_detection/character_encodings.py +197 -0
- yaralyzer/encoding_detection/encoding_assessment.py +83 -0
- yaralyzer/encoding_detection/encoding_detector.py +145 -0
- yaralyzer/helpers/bytes_helper.py +268 -0
- yaralyzer/helpers/dict_helper.py +8 -0
- yaralyzer/helpers/file_helper.py +49 -0
- yaralyzer/helpers/list_helper.py +16 -0
- yaralyzer/helpers/rich_text_helper.py +150 -0
- yaralyzer/helpers/string_helper.py +34 -0
- yaralyzer/output/decoding_attempts_table.py +82 -0
- yaralyzer/output/decoding_table_row.py +60 -0
- yaralyzer/output/file_export.py +111 -0
- yaralyzer/output/file_hashes_table.py +82 -0
- yaralyzer/output/regex_match_metrics.py +97 -0
- yaralyzer/output/rich_console.py +114 -0
- yaralyzer/util/argument_parser.py +297 -0
- yaralyzer/util/logging.py +135 -0
- yaralyzer/yara/error.py +90 -0
- yaralyzer/yara/yara_match.py +160 -0
- yaralyzer/yara/yara_rule_builder.py +164 -0
- yaralyzer/yaralyzer.py +304 -0
- yaralyzer-1.0.11.dist-info/LICENSE +674 -0
- yaralyzer-1.0.11.dist-info/METADATA +151 -0
- yaralyzer-1.0.11.dist-info/RECORD +34 -0
- yaralyzer-1.0.11.dist-info/WHEEL +4 -0
- yaralyzer-1.0.11.dist-info/entry_points.txt +4 -0
yaralyzer/__init__.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import code
|
|
2
|
+
import yara as python_yara
|
|
3
|
+
from os import environ, getcwd, path
|
|
4
|
+
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
from rich.text import Text
|
|
7
|
+
|
|
8
|
+
# load_dotenv() should be called as soon as possible (before parsing local classes) but not for pytest.
# The flag check mirrors is_env_var_set_and_not_false() in yaralyzer.config (empty or "false" in any
# capitalization means "not set"); it is inlined here because importing yaralyzer.config would
# evaluate its environment-dependent class attributes before the dotenv file is loaded.
if environ.get('INVOKED_BY_PYTEST', '').lower() in ('', 'false'):
    # The current working directory takes precedence over the home directory; only the first
    # .yaralyzer file found is loaded.
    for config_dir in (getcwd(), path.expanduser('~')):
        dotenv_file = path.join(config_dir, '.yaralyzer')

        if path.exists(dotenv_file):
            load_dotenv(dotenv_path=dotenv_file)
            break
|
|
14
|
+
|
|
15
|
+
from yaralyzer.helpers.rich_text_helper import print_fatal_error_and_exit
|
|
16
|
+
from yaralyzer.output.file_export import export_json, invoke_rich_export
|
|
17
|
+
from yaralyzer.output.rich_console import console
|
|
18
|
+
from yaralyzer.util.argument_parser import get_export_basepath, parse_arguments
|
|
19
|
+
from yaralyzer.yara.error import yara_error_msg
|
|
20
|
+
from yaralyzer.yara.yara_rule_builder import HEX, REGEX
|
|
21
|
+
from yaralyzer.yaralyzer import Yaralyzer
|
|
22
|
+
|
|
23
|
+
# Hint printed by yaralyze() after a PDF file has been scanned.
PDFALYZER_MSG = "\nIf you are analyzing a PDF you may be interested in The Pdfalyzer, birthplace of The Yaralyzer:"
# Rich Text version of the hint; the arrow and the project URL are appended as separately styled spans.
PDFALYZER_MSG_TXT = Text(PDFALYZER_MSG, style='bright_white bold')
PDFALYZER_MSG_TXT.append('\n -> ', style='bright_white')
PDFALYZER_MSG_TXT.append('https://github.com/michelcrypt4d4mus/pdfalyzer\n', style='bright_cyan underline')
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def yaralyze():
    """
    Entry point for yaralyzer when invoked as a script.

    Args are parsed from the command line and environment variables. See `yaralyze --help` for details.

    Raises:
        RuntimeError: If no YARA rules files, YARA rules dirs, or regex/hex patterns were supplied.
    """
    args = parse_arguments()
    output_basepath = None

    # Pick the Yaralyzer constructor matching whichever kind of rule source was supplied.
    if args.yara_rules_files:
        yaralyzer = Yaralyzer.for_rules_files(args.yara_rules_files, args.file_to_scan_path)
    elif args.yara_rules_dirs:
        yaralyzer = Yaralyzer.for_rules_dirs(args.yara_rules_dirs, args.file_to_scan_path)
    elif args.regex_patterns or args.hex_patterns:
        yaralyzer = Yaralyzer.for_patterns(
            args.regex_patterns or args.hex_patterns,
            HEX if args.hex_patterns else REGEX,
            args.file_to_scan_path,
            pattern_label=args.patterns_label,
            regex_modifier=args.regex_modifier)
    else:
        raise RuntimeError("No pattern or YARA file to scan against.")

    if args.output_dir:
        output_basepath = get_export_basepath(args, yaralyzer)
        console.print(f"Will render yaralyzer data to '{output_basepath}'...", style='yellow')
        # Recording must be switched on before the scan so the console.save_*() exports below have content.
        console.record = True

    try:
        yaralyzer.yaralyze()
    except python_yara.Error as e:
        print_fatal_error_and_exit(yara_error_msg(e))

    if args.export_txt:
        invoke_rich_export(console.save_text, output_basepath)
    if args.export_html:
        invoke_rich_export(console.save_html, output_basepath)
    if args.export_svg:
        invoke_rich_export(console.save_svg, output_basepath)
    if args.export_json:
        export_json(yaralyzer, output_basepath)

    # Compare case-insensitively so files named e.g. 'REPORT.PDF' also get the hint.
    if args.file_to_scan_path.lower().endswith('.pdf'):
        console.print(PDFALYZER_MSG_TXT)

    # Drop into interactive shell if requested. The local variable names above (args, yaralyzer,
    # output_basepath) are what the shell exposes, so they should not be renamed casually.
    if args.interact:
        code.interact(local=locals())
|
yaralyzer/bytes_match.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""
|
|
2
|
+
`BytesMatch` class for tracking regex and YARA matches against binary data.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Iterator, Optional
|
|
7
|
+
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
from rich.text import Text
|
|
10
|
+
from yara import StringMatch, StringMatchInstance
|
|
11
|
+
|
|
12
|
+
from yaralyzer.config import YaralyzerConfig
|
|
13
|
+
from yaralyzer.helpers.rich_text_helper import prefix_with_style
|
|
14
|
+
from yaralyzer.output.file_hashes_table import bytes_hashes_table
|
|
15
|
+
from yaralyzer.output.rich_console import ALERT_STYLE, GREY_ADDRESS
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class BytesMatch:
    """
    Simple class to keep track of regex matches against binary data.

    Basically a Regex `re.match` object with some (not many) extra bells and whistles, most notably
    the `surrounding_bytes` property.

    Args:
        matched_against (bytes): The full byte sequence that was searched.
        start_idx (int): Start index of the match in the byte sequence.
        match_length (int): Length of the match in bytes.
        label (str): Label for the match (e.g., regex or YARA rule name).
        ordinal (int): This was the Nth match for this pattern (used for labeling only).
        match (Optional[re.Match]): Regex `match` object, if available.
        highlight_style (str): Style to use for highlighting the match.

    Attributes:
        end_idx (int): End index of the match in the byte sequence.
        bytes (bytes): The bytes that matched the regex.
        match_groups (Optional[tuple]): `match.groups()` if a regex `match` was provided, otherwise `None`.
        highlight_start_idx (int): Start of the matched bytes relative to `surrounding_bytes`.
        highlight_end_idx (int): End of the matched bytes relative to `surrounding_bytes`.
        surrounding_start_idx (int): Start index of `surrounding_bytes` within `matched_against`.
        surrounding_end_idx (int): End index of `surrounding_bytes` within `matched_against`.
        surrounding_bytes (bytes): The matched bytes plus the configured number of bytes before and after.
    """
    matched_against: bytes
    start_idx: int
    match_length: int
    label: str
    ordinal: int
    match: re.Match | None = None  # It's rough to get the regex from yara :(
    highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    end_idx: int = field(init=False)
    # This field name must match the attribute assigned in __post_init__(); a field that is declared
    # but never assigned makes the dataclass generated __repr__() / __eq__() raise AttributeError.
    match_groups: Optional[tuple] = field(init=False)
    highlight_start_idx: int = field(init=False)
    highlight_end_idx: int = field(init=False)
    surrounding_start_idx: int = field(init=False)
    surrounding_end_idx: int = field(init=False)
    surrounding_bytes: bytes = field(init=False)

    def __post_init__(self):
        """Compute the derived indexes and byte slices from the constructor arguments."""
        self.end_idx: int = self.start_idx + self.match_length
        self.bytes = self.matched_against[self.start_idx:self.end_idx]  # TODO: Maybe should be called "matched_bytes"
        self.match_groups: Optional[tuple] = self.match.groups() if self.match else None
        num_after = YaralyzerConfig.args.surrounding_bytes
        num_before = YaralyzerConfig.args.surrounding_bytes
        # Adjust the highlighting start point in case this match is very early or late in the stream
        self.surrounding_start_idx: int = max(self.start_idx - num_before, 0)
        self.surrounding_end_idx: int = min(self.end_idx + num_after, len(self.matched_against))
        self.surrounding_bytes: bytes = self.matched_against[self.surrounding_start_idx:self.surrounding_end_idx]
        # Highlight indexes are relative to surrounding_bytes, not to matched_against
        self.highlight_start_idx = self.start_idx - self.surrounding_start_idx
        self.highlight_end_idx = self.highlight_start_idx + self.match_length

    @classmethod
    def from_regex_match(
        cls,
        matched_against: bytes,
        match: re.Match,
        ordinal: int,
        highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    ) -> 'BytesMatch':
        """
        Alternate constructor to build a `BytesMatch` from a regex match object.

        Args:
            matched_against (bytes): The bytes searched.
            match (re.Match): The regex `match` object.
            ordinal (int): This was the Nth match for this pattern (used for labeling only).
            highlight_style (str): Style for highlighting.

        Returns:
            BytesMatch: The constructed `BytesMatch` instance.
        """
        # The compiled pattern itself is used as the label
        return cls(matched_against, match.start(), len(match[0]), match.re.pattern, ordinal, match, highlight_style)

    @classmethod
    def from_yara_str(
        cls,
        matched_against: bytes,
        rule_name: str,
        yara_str_match: StringMatch,
        yara_str_match_instance: StringMatchInstance,
        ordinal: int,
        highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    ) -> 'BytesMatch':
        """
        Alternate constructor to build a `BytesMatch` from a YARA string match instance.

        Args:
            matched_against (bytes): The bytes searched.
            rule_name (str): Name of the YARA rule.
            yara_str_match (StringMatch): YARA string match object.
            yara_str_match_instance (StringMatchInstance): Instance of the string match.
            ordinal (int): The Nth match for this pattern.
            highlight_style (str): Style for highlighting.

        Returns:
            BytesMatch: The constructed BytesMatch instance.
        """
        pattern_label = yara_str_match.identifier

        # Don't duplicate the labeling if rule_name and yara_str are the same
        if pattern_label == '$' + rule_name:
            label = pattern_label
        else:
            label = rule_name + ': ' + pattern_label

        return cls(
            matched_against=matched_against,
            start_idx=yara_str_match_instance.offset,
            match_length=yara_str_match_instance.matched_length,
            label=label,
            ordinal=ordinal,
            highlight_style=highlight_style
        )

    @classmethod
    def from_yara_match(
        cls,
        matched_against: bytes,
        yara_match: dict,
        highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
    ) -> Iterator['BytesMatch']:
        """
        Yield a `BytesMatch` for each string returned as part of a YARA match result dict.

        Args:
            matched_against (bytes): The bytes searched.
            yara_match (dict): YARA match result dictionary.
            highlight_style (str): Style for highlighting.

        Yields:
            BytesMatch: For each string match in the YARA result.
        """
        i = 0  # For numbered labeling; counts instances across all strings in this match

        # yara-python's internals changed with 4.3.0: https://github.com/VirusTotal/yara-python/releases/tag/v4.3.0
        for yara_str_match in yara_match['strings']:
            for yara_str_match_instance in yara_str_match.instances:
                i += 1

                yield cls.from_yara_str(
                    matched_against,
                    yara_match['rule'],
                    yara_str_match,
                    yara_str_match_instance,
                    i,
                    highlight_style
                )

    def style_at_position(self, idx) -> str:
        """
        Get the style for the byte at position `idx` within the surrounding bytes.

        Args:
            idx (int): Index within the surrounding bytes.

        Returns:
            str: The style to use for this byte (highlight or greyed out).
        """
        if idx < self.highlight_start_idx or idx >= self.highlight_end_idx:
            return GREY_ADDRESS
        else:
            return self.highlight_style

    def location(self) -> Text:
        """
        Get a styled `Text` object describing the start and end index of the match.

        Returns:
            Text: Rich Text object like '(start idx: 348190, end idx: 348228)'.
        """
        location_txt = prefix_with_style(
            "(start idx: ",
            style='off_white',
            root_style='decode.subheading'
        )

        location_txt.append(str(self.start_idx), style='number')
        location_txt.append(', end idx: ', style='off_white')
        location_txt.append(str(self.end_idx), style='number')
        location_txt.append(')', style='off_white')
        return location_txt

    def is_decodable(self) -> bool:
        """
        Determine if the matched bytes should be decoded.

        Whether the bytes are decodable depends on whether `SUPPRESS_DECODES_TABLE` is set
        and whether the match length is between `MIN`/`MAX_DECODE_LENGTH`.

        Returns:
            bool: `True` if decodable, `False` otherwise.
        """
        return self.match_length >= YaralyzerConfig.args.min_decode_length \
            and self.match_length <= YaralyzerConfig.args.max_decode_length \
            and not YaralyzerConfig.args.suppress_decodes_table

    def bytes_hashes_table(self) -> Table:
        """
        Build a table of MD5/SHA hashes for the matched bytes.

        Returns:
            Table: Rich `Table` object with hashes.
        """
        # Resolves to the module level bytes_hashes_table() helper, not to this method
        return bytes_hashes_table(
            self.bytes,
            self.location().plain,
            'center'
        )

    def suppression_notice(self) -> Text:
        """
        Generate a message for when the match is too short or too long to decode.

        Returns:
            Text: Rich `Text` object with the suppression notice.
        """
        txt = self.__rich__()

        if self.match_length < YaralyzerConfig.args.min_decode_length:
            txt = Text('Too little to actually attempt decode at ', style='grey') + txt
        else:
            txt.append(" too long to decode ")
            txt.append(f"(--max-decode-length is {YaralyzerConfig.args.max_decode_length} bytes)", style='grey')

        return txt

    def to_json(self) -> dict:
        """
        Convert this `BytesMatch` to a JSON-serializable dictionary.

        Returns:
            dict: Dictionary representation of the match, suitable for JSON serialization.
        """
        json_dict = {
            'label': self.label,
            'match_length': self.match_length,
            'matched_bytes': self.bytes.hex(),
            'ordinal': self.ordinal,
            'start_idx': self.start_idx,
            'end_idx': self.end_idx,
            'surrounding_bytes': self.surrounding_bytes.hex(),
            'surrounding_start_idx': self.surrounding_start_idx,
            'surrounding_end_idx': self.surrounding_end_idx,
        }

        # NOTE(review): for a pattern compiled from bytes `re.pattern` is bytes; confirm the JSON
        # exporter can serialize that.
        if self.match:
            json_dict['pattern'] = self.match.re.pattern

        return json_dict

    def __rich__(self) -> Text:
        """Get a rich `Text` representation of the match for display."""
        headline = prefix_with_style(str(self.match_length), style='number', root_style='decode.subheading')
        headline.append(" bytes matching ")
        headline.append(f"{self.label} ", style=ALERT_STYLE if self.highlight_style == ALERT_STYLE else 'regex')
        headline.append('at ')
        return headline + self.location()

    def __str__(self):
        """Plain text (no rich colors) representation of the match for display."""
        return self.__rich__().plain
|
yaralyzer/config.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration management for Yaralyzer.
|
|
3
|
+
"""
|
|
4
|
+
import logging
|
|
5
|
+
from argparse import ArgumentParser, Namespace
|
|
6
|
+
from os import environ
|
|
7
|
+
from typing import Any, List
|
|
8
|
+
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
# Base name; YARALYZER (below) is derived from it and used as the prefix for environment variable names.
YARALYZE = 'yaralyze'
YARALYZER = f"{YARALYZE}r".upper()  # Evaluates to 'YARALYZER'
# Name of the environment variable checked by is_invoked_by_pytest()
PYTEST_FLAG = 'INVOKED_BY_PYTEST'

# Byte size units
KILOBYTE = 1024
MEGABYTE = 1024 * KILOBYTE
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def config_var_name(env_var: str) -> str:
    """
    Strip the `YARALYZER_` prefix (if present) from the environment variable name `env_var`.

    Example:
        ```
        SURROUNDING_BYTES_ENV_VAR = 'YARALYZER_SURROUNDING_BYTES'
        config_var_name(SURROUNDING_BYTES_ENV_VAR) => 'SURROUNDING_BYTES'
        ```

    Args:
        env_var (str): Name of an environment variable, with or without the `YARALYZER_` prefix.

    Returns:
        str: `env_var` without its leading `YARALYZER_`; unchanged if the prefix is absent.
    """
    # The previous f'{env_var=}' debugging-specifier trick rendered the *parameter's* name, so every
    # call returned the literal string 'env_var' instead of the stripped variable name.
    return env_var.removeprefix("YARALYZER_")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def is_env_var_set_and_not_false(var_name: str) -> bool:
    """
    Check whether the environment variable `var_name` holds a "truthy" setting.

    Args:
        var_name (str): Name of the environment variable to inspect.

    Returns:
        bool: `False` if the variable is missing, empty, or equal to "false" in any capitalization;
            `True` for every other value.
    """
    current_setting = environ.get(var_name)

    # Guard clause: missing and empty values both count as "not set"
    if current_setting is None or len(current_setting) == 0:
        return False

    return current_setting.lower() != 'false'
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def is_invoked_by_pytest() -> bool:
    """Report whether the `PYTEST_FLAG` environment variable is set to something other than "false"."""
    pytest_flag_is_set = is_env_var_set_and_not_false(PYTEST_FLAG)
    return pytest_flag_is_set
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class YaralyzerConfig:
    """
    Handles parsing of command line args and environment variables for Yaralyzer.

    All state lives on the class itself: `set_argument_parser()` must be called before `set_args()`
    or `set_default_args()`, and the merged options are then available as `YaralyzerConfig.args`.
    """

    # Passed through to yara.set_config()
    DEFAULT_MAX_MATCH_LENGTH = 100 * KILOBYTE
    DEFAULT_YARA_STACK_SIZE = 2 * 65536

    # Skip decoding binary matches under/over these lengths
    DEFAULT_MIN_DECODE_LENGTH = 1
    DEFAULT_MAX_DECODE_LENGTH = 256

    # chardet.detect() related
    DEFAULT_MIN_CHARDET_TABLE_CONFIDENCE = 2
    DEFAULT_MIN_CHARDET_BYTES = 9

    # Number of bytes to show before/after byte previews and decodes. Configured by command line or env var
    DEFAULT_SURROUNDING_BYTES = 64

    # Logging configuration is read from the environment once, when this class body is evaluated
    LOG_DIR_ENV_VAR = 'YARALYZER_LOG_DIR'
    LOG_DIR = environ.get(LOG_DIR_ENV_VAR)
    LOG_LEVEL_ENV_VAR = f"{YARALYZER}_LOG_LEVEL"
    LOG_LEVEL = logging.getLevelName(environ.get(LOG_LEVEL_ENV_VAR, 'WARN'))

    if LOG_DIR and not is_invoked_by_pytest():
        Console(color_system='256').print(f"Writing logs to '{LOG_DIR}' instead of stderr/stdout...", style='dim')

    HIGHLIGHT_STYLE = 'orange1'

    # Options that can only be set on the command line; set_args() never reads env vars for these
    ONLY_CLI_ARGS = [
        'debug',
        'help',
        'hex_patterns',
        'interact',
        'patterns_label',
        'regex_patterns',
        'regex_modifier',
        'version'
    ]

    @classmethod
    def set_argument_parser(cls, parser: ArgumentParser) -> None:
        """
        Store `parser` as the class attribute `_argument_parser` used to parse command line args.

        Also records the sorted `dest` names of all the parser's actions as `_argparse_keys`.

        Args:
            parser (ArgumentParser): The fully configured argument parser.
        """
        cls._argument_parser: ArgumentParser = parser
        cls._argparse_keys: List[str] = sorted([action.dest for action in parser._actions])

    @classmethod
    def set_args(cls, args: Namespace) -> None:
        """
        Set the `args` class attribute and update `args` in place with any environment variable overrides.

        For each parser option (except `export*` options and those in `ONLY_CLI_ARGS`) the env var
        `YARALYZER_<OPTION>` is consulted:

        * boolean options are `True` if set on the command line or if the env var is set and not "false"
        * numeric options still at their `DEFAULT_<OPTION>` value take the env var's value, cast to the
          option's own numeric type
        * all other options fall back to the env var's value when the command line value is falsy

        Args:
            args (Namespace): Parsed command line arguments; mutated in place.

        Raises:
            ValueError: If an env var for a numeric option cannot be parsed as that option's type.
        """
        cls.args = args

        for option in cls._argparse_keys:
            if option.startswith('export') or option in cls.ONLY_CLI_ARGS:
                continue

            arg_value = vars(args)[option]
            env_var = f"{YARALYZER}_{option.upper()}"
            env_value = environ.get(env_var)
            default_value = cls.get_default_arg(option)

            # TODO: as is you can't override env vars with CLI args
            if isinstance(arg_value, bool):
                setattr(args, option, arg_value or is_env_var_set_and_not_false(env_var))
            elif isinstance(arg_value, (int, float)):
                # Check against defaults to avoid overriding env var configured options
                if arg_value == default_value and env_value is not None:
                    # Cast with the option's own type so float options accept values like "0.5"
                    # (an unconditional int() cast raises on those).
                    number_type = type(arg_value)

                    try:
                        env_number = number_type(env_value)
                    except ValueError as e:
                        msg = f"Environment variable {env_var}='{env_value}' is not a valid {number_type.__name__}"
                        raise ValueError(msg) from e

                    # NOTE(review): an env var value of 0 is falsy so the default is kept here;
                    # confirm whether an explicit 0 should be honored instead.
                    setattr(args, option, env_number or arg_value)
            else:
                setattr(args, option, arg_value or env_value)

    @classmethod
    def set_default_args(cls):
        """Set `cls.args` to the defaults, as if only a placeholder positional arg was given on the command line."""
        cls.set_args(cls._argument_parser.parse_args(['dummy']))

    @classmethod
    def get_default_arg(cls, arg: str) -> Any:
        """
        Return the default value for `arg` as defined by a `DEFAULT_` style class variable.

        Args:
            arg (str): Option name, e.g. `surrounding_bytes`.

        Returns:
            Any: Value of `DEFAULT_<ARG>` if this class defines it, otherwise `None`.
        """
        default_var = f"DEFAULT_{arg.upper()}"
        return vars(cls).get(default_var)
|
yaralyzer/decoding/bytes_decoder.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
`BytesDecoder` class for attempting to decode bytes with various encodings.
|
|
3
|
+
"""
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from copy import deepcopy
|
|
6
|
+
from operator import attrgetter
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
from rich.align import Align
|
|
10
|
+
from rich.console import Console, ConsoleOptions, NewLine, RenderResult
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
from rich.table import Table
|
|
13
|
+
from rich.text import Text
|
|
14
|
+
|
|
15
|
+
from yaralyzer.bytes_match import BytesMatch # Used to cause circular import issues
|
|
16
|
+
from yaralyzer.config import YaralyzerConfig
|
|
17
|
+
from yaralyzer.decoding.decoding_attempt import DecodingAttempt
|
|
18
|
+
from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT
|
|
19
|
+
from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
|
|
20
|
+
from yaralyzer.encoding_detection.encoding_detector import EncodingDetector
|
|
21
|
+
from yaralyzer.helpers.dict_helper import get_dict_key_by_value
|
|
22
|
+
from yaralyzer.helpers.rich_text_helper import CENTER, DECODING_ERRORS_MSG, NO_DECODING_ERRORS_MSG
|
|
23
|
+
from yaralyzer.output.decoding_attempts_table import new_decoding_attempts_table
|
|
24
|
+
from yaralyzer.output.decoding_table_row import DecodingTableRow
|
|
25
|
+
from yaralyzer.util.logging import log
|
|
26
|
+
|
|
27
|
+
# A two element list of the messages used in the table to show true vs. false. BytesDecoder indexes
# it with int(<bool>): index 0 (False) is NO_DECODING_ERRORS_MSG, index 1 (True) is DECODING_ERRORS_MSG.
WAS_DECODABLE_YES_NO = [NO_DECODING_ERRORS_MSG, DECODING_ERRORS_MSG]

# Multiply chardet scores by 100 (again) to make sorting the table easy
SCORE_SCALER = 100.0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class BytesDecoder:
    """
    Handles decoding a chunk of bytes into strings using various possible encodings, ranking and displaying results.

    This class leverages the `chardet` library and custom logic to try multiple encodings, track decoding outcomes,
    and present the results in a rich, user-friendly format. It is used to analyze and display the possible
    interpretations of a byte sequence, especially in the context of YARA matches or binary analysis.

    Attributes:
        bytes_match (BytesMatch): The `BytesMatch` instance being decoded.
        bytes (bytes): The bytes (including surrounding context) to decode.
        label (str): Label for this decoding attempt.
        was_match_decodable (dict): Tracks successful decodes per encoding.
        was_match_force_decoded (dict): Tracks forced decodes per encoding.
        was_match_undecodable (dict): Tracks failed decodes per encoding.
        decoded_strings (dict): Maps encoding label to the plain decoded string.
        undecoded_rows (list): Stores undecoded table rows.
        decodings (list): List of DecodingAttempt objects for each encoding tried.
        encoding_detector (EncodingDetector): Used to detect and assess possible encodings.
        table (Table): The decoding attempts table; only exists after `_build_decodings_table()` has run.
    """

    def __init__(self, bytes_match: 'BytesMatch', label: Optional[str] = None) -> None:
        """
        Initialize a `BytesDecoder` for attempting to decode a chunk of bytes using various encodings.

        Args:
            bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
            label (Optional[str], optional): Optional label for this decoding attempt. Defaults to the match label.
        """
        self.bytes_match = bytes_match
        self.bytes = bytes_match.surrounding_bytes
        self.label = label or bytes_match.label

        # Empty table/metrics/etc
        self.was_match_decodable = _build_encodings_metric_dict()
        self.was_match_force_decoded = _build_encodings_metric_dict()
        self.was_match_undecodable = _build_encodings_metric_dict()
        self.decoded_strings = {}  # dict[encoding: decoded string]
        self.undecoded_rows = []
        self.decodings = []

        # Note we send both the match and surrounding bytes used when detecting the encoding
        self.encoding_detector = EncodingDetector(self.bytes)

    def __rich_console__(self, _console: Console, options: ConsoleOptions) -> RenderResult:
        """Rich object generator (see Rich console docs)."""
        yield NewLine(2)
        yield Align(self._decode_attempt_subheading(), CENTER)

        if not YaralyzerConfig.args.suppress_chardet:
            yield NewLine()
            yield Align(self.encoding_detector, CENTER)
            yield NewLine()

        # In standalone mode we always print the hex/raw bytes
        if self.bytes_match.is_decodable():
            yield self._build_decodings_table()
        elif YaralyzerConfig.args.standalone_mode:
            yield self._build_decodings_table(True)

        yield NewLine()
        yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')

    def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
        """
        First rows are the raw / hex views of the bytes, next rows are the attempted decodings.

        Args:
            suppress_decodes (bool, optional): If `True` don't add decoding attempts to the table. Defaults to `False`.

        Returns:
            Table: The table, which is also stored as `self.table`.
        """
        self.table = new_decoding_attempts_table(self.bytes_match)

        # NOTE(review): the nesting of the statements below was reconstructed from a flattened listing;
        # confirm against the upstream file that everything through add_row() belongs inside this `if`.
        # Add the encoding rows to the table if not suppressed
        if not (YaralyzerConfig.args.suppress_decoding_attempts or suppress_decodes):
            self.decodings = [DecodingAttempt(self.bytes_match, encoding) for encoding in ENCODINGS_TO_ATTEMPT]
            # Attempt decodings we don't usually attempt if chardet is insistent enough
            forced_decodes = self._undecoded_assessments(self.encoding_detector.force_decode_assessments)
            self.decodings += [DecodingAttempt(self.bytes_match, a.encoding) for a in forced_decodes]

            # If we still haven't decoded chardet's top choice, decode it
            if len(self._forced_displays()) > 0 and not self._was_decoded(self._forced_displays()[0].encoding):
                chardet_top_encoding = self._forced_displays()[0].encoding
                log.info(f"Decoding {chardet_top_encoding} because it's chardet top choice...")
                self.decodings.append(DecodingAttempt(self.bytes_match, chardet_top_encoding))

            # Build the table rows from the decoding attempts
            rows = [self._row_from_decoding_attempt(decoding) for decoding in self.decodings]

            # Add assessments with no decode attempt. _forced_displays() is re-evaluated here on purpose:
            # it filters on self.decodings, which may have grown in the step above.
            rows += [
                DecodingTableRow.from_undecoded_assessment(a, a.confidence * SCORE_SCALER)
                for a in self._forced_displays()
            ]

            self._track_decode_stats()

            # Highest sort_score first; ties are broken by the plain encoding label (also descending)
            for row in sorted(rows, key=attrgetter('sort_score', 'encoding_label_plain'), reverse=True):
                self.table.add_row(*row.to_row_list())

        return self.table

    # TODO: rename this to something that makes more sense, maybe assessments_over_display_threshold()?
    def _forced_displays(self) -> List[EncodingAssessment]:
        """Returns assessments over the display threshold that are not yet decoded."""
        return self._undecoded_assessments(self.encoding_detector.force_display_assessments)

    def _undecoded_assessments(self, assessments: List[EncodingAssessment]) -> List[EncodingAssessment]:
        """Filter out the already decoded assessments from a set of assessments."""
        return [a for a in assessments if not self._was_decoded(a.encoding)]

    def _was_decoded(self, encoding: str) -> bool:
        """Check whether a decoding attempt for `encoding` is already in `self.decodings`."""
        return any(row.encoding == encoding for row in self.decodings)

    def _decode_attempt_subheading(self) -> Panel:
        """Generate a rich.Panel for displaying decode attempts."""
        headline = Text(f"Found ", style='decode.subheading') + self.bytes_match.__rich__()
        return Panel(headline, style='decode.subheading', expand=False)

    def _track_decode_stats(self) -> None:
        """Track stats about successful vs. forced vs. failed decode attempts."""
        for decoding in self.decodings:
            if decoding.failed_to_decode:
                self.was_match_undecodable[decoding.encoding] += 1
                continue

            # Forced decodes are counted as decodable AND as force decoded
            self.was_match_decodable[decoding.encoding] += 1

            if decoding.was_force_decoded:
                self.was_match_force_decoded[decoding.encoding] += 1

    def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
        """
        Create a `DecodingTableRow` from a `DecodingAttempt`.

        Side effect: the first time a given decoded string is seen it is recorded in `self.decoded_strings`.

        Args:
            decoding (DecodingAttempt): The decoding attempt to turn into a table row.

        Returns:
            DecodingTableRow: Row for the decoding attempts table.
        """
        assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)

        # If the decoding can have a start offset add an appropriate extension to the encoding label
        if decoding.start_offset_label:
            if assessment.language:
                log.warning(f"{decoding.encoding} offset {decoding.start_offset} AND language '{assessment.language}'")
            else:
                # Work on a copy so the detector's own assessment object is not relabeled
                assessment = deepcopy(assessment)
                assessment.set_encoding_label(decoding.start_offset_label)

        plain_decoded_string = decoding.decoded_string.plain
        sort_score = assessment.confidence * SCORE_SCALER

        # If the decoding result is a duplicate of a previous decoding, replace the decoded text
        # with "same output as X" where X is the previous encoding that gave the same result.
        if plain_decoded_string in self.decoded_strings.values():
            encoding_with_same_output = get_dict_key_by_value(self.decoded_strings, plain_decoded_string)
            display_text = Text('same output as ', style='color(66) dim italic')
            display_text.append(encoding_with_same_output, style=ENCODING).append('...', style='white')
        else:
            self.decoded_strings[decoding.encoding_label] = plain_decoded_string
            display_text = decoding.decoded_string

        # Set failures negative, shave off a little for forced decodes
        if decoding.failed_to_decode:
            sort_score = (sort_score * -1) - 100
        elif decoding.was_force_decoded:
            sort_score -= 10

        # int(False) == 0 / int(True) == 1 picks the matching message from the two element list
        was_forced = WAS_DECODABLE_YES_NO[int(decoding.was_force_decoded)]
        return DecodingTableRow.from_decoded_assessment(assessment, was_forced, display_text, sort_score)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _build_encodings_metric_dict():
    """Build a counter dict pre-seeded with a 0 for every key in `ENCODINGS_TO_ATTEMPT`; other keys default to 0."""
    zeroed_counts = {encoding: 0 for encoding in ENCODINGS_TO_ATTEMPT.keys()}
    return defaultdict(lambda: 0, zeroed_counts)
|