yaralyzer 1.0.8__tar.gz → 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of yaralyzer might be problematic. Click here for more details.
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/CHANGELOG.md +3 -0
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/PKG-INFO +5 -6
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/README.md +4 -5
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/pyproject.toml +1 -1
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/__init__.py +1 -4
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/bytes_match.py +23 -24
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/config.py +13 -12
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/decoding/bytes_decoder.py +33 -25
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/decoding/decoding_attempt.py +55 -18
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/encoding_detection/character_encodings.py +9 -6
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/encoding_detection/encoding_assessment.py +26 -6
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/encoding_detection/encoding_detector.py +39 -10
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/helpers/bytes_helper.py +19 -18
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/helpers/file_helper.py +20 -13
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/helpers/rich_text_helper.py +10 -9
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/output/decoding_attempts_table.py +43 -9
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/output/file_export.py +23 -7
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/output/file_hashes_table.py +9 -8
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/output/regex_match_metrics.py +28 -6
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/output/rich_console.py +19 -17
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/util/argument_parser.py +11 -3
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/util/logging.py +31 -16
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/yara/yara_match.py +40 -17
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/yara/yara_rule_builder.py +55 -11
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/yaralyzer.py +90 -20
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/.yaralyzer.example +0 -0
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/LICENSE +0 -0
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/helpers/dict_helper.py +0 -0
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/helpers/list_helper.py +0 -0
- {yaralyzer-1.0.8 → yaralyzer-1.0.9}/yaralyzer/helpers/string_helper.py +0 -0
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.0.9
|
|
4
|
+
* Raise `FileNotFoundError` instead of `ValueError` if provided YARA rules files or dirs don't exist
|
|
5
|
+
|
|
3
6
|
### 1.0.8
|
|
4
7
|
* Bump `python-dotenv` to v1.1.1
|
|
5
8
|
* Use `mkdocs` and `lazydocs` to build automatic API documentation at https://michelcrypt4d4mus.github.io/yaralyzer/
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: yaralyzer
|
|
3
|
-
Version: 1.0.8
|
|
3
|
+
Version: 1.0.9
|
|
4
4
|
Summary: Visualize and force decode YARA and regex matches found in a file or byte stream with colors. Lots of colors.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/yaralyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -78,7 +78,7 @@ YARA just tells you the byte position and the matched string but it can't tell y
|
|
|
78
78
|
|
|
79
79
|
Enter **The Yaralyzer**, which lets you quickly scan the regions around matches while also showing you what those regions would look like if they were forced into various character encodings.
|
|
80
80
|
|
|
81
|
-
|
|
81
|
+
**The Yaralyzer** isn't a malware reversing tool. It can't do all the things a tool like [CyberChef](https://gchq.github.io/CyberChef/) does and it doesn't try to. It's more intended to give you a quick visual overview of suspect regions in the binary so you can hone in on the areas you might want to inspect with a more serious tool like [CyberChef](https://gchq.github.io/CyberChef/).
|
|
82
82
|
|
|
83
83
|
# Installation
|
|
84
84
|
Install it with [`pipx`](https://pypa.github.io/pipx/) or `pip3`. `pipx` is a marginally better solution as it guarantees any packages installed with it will be isolated from the rest of your local python environment. Of course if you don't really have a local python environment this is a moot point and you can feel free to install with `pip`/`pip3`.
|
|
@@ -86,6 +86,7 @@ Install it with [`pipx`](https://pypa.github.io/pipx/) or `pip3`. `pipx` is a ma
|
|
|
86
86
|
pipx install yaralyzer
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
+
|
|
89
90
|
# Usage
|
|
90
91
|
Run `yaralyze -h` to see the command line options (screenshot below).
|
|
91
92
|
|
|
@@ -99,7 +100,7 @@ If you place a file called `.yaralyzer` in your home directory or the current wo
|
|
|
99
100
|
Only one `.yaralyzer` file will be loaded and the working directory's `.yaralyzer` takes precedence over the home directory's `.yaralyzer`.
|
|
100
101
|
|
|
101
102
|
### As A Library
|
|
102
|
-
[`Yaralyzer`](yaralyzer/yaralyzer.py) is the main class. It has a variety of constructors supporting:
|
|
103
|
+
[`Yaralyzer`](yaralyzer/yaralyzer.py) is the main class. Auto generated documentation for `Yaralyzer`'s various classes and methods can be found [here](https://michelcrypt4d4mus.github.io/yaralyzer/). It has a variety of [alternate constructors](https://michelcrypt4d4mus.github.io/yaralyzer/api/yaralyzer/) supporting:
|
|
103
104
|
|
|
104
105
|
1. Precompiled YARA rules
|
|
105
106
|
1. Creating a YARA rule from a string
|
|
@@ -108,7 +109,7 @@ Only one `.yaralyzer` file will be loaded and the working directory's `.yaralyze
|
|
|
108
109
|
1. Scanning `bytes`
|
|
109
110
|
1. Scanning a file
|
|
110
111
|
|
|
111
|
-
Should you want to iterate over the `BytesMatch` (like a `re.Match` object for a YARA match) and `BytesDecoder` (tracks decoding attempt stats) objects
|
|
112
|
+
Should you want to iterate over the [`BytesMatch`](https://michelcrypt4d4mus.github.io/yaralyzer/api/bytes_match/) (like a `re.Match` object for a YARA match) and [`BytesDecoder`](https://michelcrypt4d4mus.github.io/yaralyzer/api/bytes_decoder/) (tracks decoding attempt stats) objects used by The Yaralyzer, you can do so like this:
|
|
112
113
|
|
|
113
114
|
```python
|
|
114
115
|
from yaralyzer.yaralyzer import Yaralyzer
|
|
@@ -119,8 +120,6 @@ for bytes_match, bytes_decoder in yaralyzer.match_iterator():
|
|
|
119
120
|
do_stuff()
|
|
120
121
|
```
|
|
121
122
|
|
|
122
|
-
#### API Documentation
|
|
123
|
-
Auto generated documentation for Yaralyzer's various classes and methods can be found [here](https://michelcrypt4d4mus.github.io/yaralyzer/).
|
|
124
123
|
|
|
125
124
|
# Example Output
|
|
126
125
|
The Yaralyzer can export visualizations to HTML, ANSI colored text, and SVG vector images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich) as well as a (somewhat limited) plain text JSON format. SVGs can be turned into `png` format images with a tool like [Inkscape](https://inkscape.org/) or `cairosvg`. In our experience they both work though we've seen some glitchiness with `cairosvg`.
|
|
@@ -45,7 +45,7 @@ YARA just tells you the byte position and the matched string but it can't tell y
|
|
|
45
45
|
|
|
46
46
|
Enter **The Yaralyzer**, which lets you quickly scan the regions around matches while also showing you what those regions would look like if they were forced into various character encodings.
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
**The Yaralyzer** isn't a malware reversing tool. It can't do all the things a tool like [CyberChef](https://gchq.github.io/CyberChef/) does and it doesn't try to. It's more intended to give you a quick visual overview of suspect regions in the binary so you can hone in on the areas you might want to inspect with a more serious tool like [CyberChef](https://gchq.github.io/CyberChef/).
|
|
49
49
|
|
|
50
50
|
# Installation
|
|
51
51
|
Install it with [`pipx`](https://pypa.github.io/pipx/) or `pip3`. `pipx` is a marginally better solution as it guarantees any packages installed with it will be isolated from the rest of your local python environment. Of course if you don't really have a local python environment this is a moot point and you can feel free to install with `pip`/`pip3`.
|
|
@@ -53,6 +53,7 @@ Install it with [`pipx`](https://pypa.github.io/pipx/) or `pip3`. `pipx` is a ma
|
|
|
53
53
|
pipx install yaralyzer
|
|
54
54
|
```
|
|
55
55
|
|
|
56
|
+
|
|
56
57
|
# Usage
|
|
57
58
|
Run `yaralyze -h` to see the command line options (screenshot below).
|
|
58
59
|
|
|
@@ -66,7 +67,7 @@ If you place a file called `.yaralyzer` in your home directory or the current wo
|
|
|
66
67
|
Only one `.yaralyzer` file will be loaded and the working directory's `.yaralyzer` takes precedence over the home directory's `.yaralyzer`.
|
|
67
68
|
|
|
68
69
|
### As A Library
|
|
69
|
-
[`Yaralyzer`](yaralyzer/yaralyzer.py) is the main class. It has a variety of constructors supporting:
|
|
70
|
+
[`Yaralyzer`](yaralyzer/yaralyzer.py) is the main class. Auto generated documentation for `Yaralyzer`'s various classes and methods can be found [here](https://michelcrypt4d4mus.github.io/yaralyzer/). It has a variety of [alternate constructors](https://michelcrypt4d4mus.github.io/yaralyzer/api/yaralyzer/) supporting:
|
|
70
71
|
|
|
71
72
|
1. Precompiled YARA rules
|
|
72
73
|
1. Creating a YARA rule from a string
|
|
@@ -75,7 +76,7 @@ Only one `.yaralyzer` file will be loaded and the working directory's `.yaralyze
|
|
|
75
76
|
1. Scanning `bytes`
|
|
76
77
|
1. Scanning a file
|
|
77
78
|
|
|
78
|
-
Should you want to iterate over the `BytesMatch` (like a `re.Match` object for a YARA match) and `BytesDecoder` (tracks decoding attempt stats) objects
|
|
79
|
+
Should you want to iterate over the [`BytesMatch`](https://michelcrypt4d4mus.github.io/yaralyzer/api/bytes_match/) (like a `re.Match` object for a YARA match) and [`BytesDecoder`](https://michelcrypt4d4mus.github.io/yaralyzer/api/bytes_decoder/) (tracks decoding attempt stats) objects used by The Yaralyzer, you can do so like this:
|
|
79
80
|
|
|
80
81
|
```python
|
|
81
82
|
from yaralyzer.yaralyzer import Yaralyzer
|
|
@@ -86,8 +87,6 @@ for bytes_match, bytes_decoder in yaralyzer.match_iterator():
|
|
|
86
87
|
do_stuff()
|
|
87
88
|
```
|
|
88
89
|
|
|
89
|
-
#### API Documentation
|
|
90
|
-
Auto generated documentation for Yaralyzer's various classes and methods can be found [here](https://michelcrypt4d4mus.github.io/yaralyzer/).
|
|
91
90
|
|
|
92
91
|
# Example Output
|
|
93
92
|
The Yaralyzer can export visualizations to HTML, ANSI colored text, and SVG vector images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich) as well as a (somewhat limited) plain text JSON format. SVGs can be turned into `png` format images with a tool like [Inkscape](https://inkscape.org/) or `cairosvg`. In our experience they both work though we've seen some glitchiness with `cairosvg`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "yaralyzer"
|
|
3
|
-
version = "1.0.8"
|
|
3
|
+
version = "1.0.9"
|
|
4
4
|
description = "Visualize and force decode YARA and regex matches found in a file or byte stream with colors. Lots of colors."
|
|
5
5
|
authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
|
|
6
6
|
readme = "README.md"
|
|
@@ -27,7 +27,7 @@ def yaralyze():
|
|
|
27
27
|
"""
|
|
28
28
|
Entry point for yaralyzer when invoked as a script.
|
|
29
29
|
|
|
30
|
-
Args are parsed from the command line and environment variables. See
|
|
30
|
+
Args are parsed from the command line and environment variables. See `yaralyze --help` for details.
|
|
31
31
|
"""
|
|
32
32
|
args = parse_arguments()
|
|
33
33
|
output_basepath = None
|
|
@@ -55,13 +55,10 @@ def yaralyze():
|
|
|
55
55
|
|
|
56
56
|
if args.export_txt:
|
|
57
57
|
invoke_rich_export(console.save_text, output_basepath)
|
|
58
|
-
|
|
59
58
|
if args.export_html:
|
|
60
59
|
invoke_rich_export(console.save_html, output_basepath)
|
|
61
|
-
|
|
62
60
|
if args.export_svg:
|
|
63
61
|
invoke_rich_export(console.save_svg, output_basepath)
|
|
64
|
-
|
|
65
62
|
if args.export_json:
|
|
66
63
|
export_json(yaralyzer, output_basepath)
|
|
67
64
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
`BytesMatch` class for tracking regex and YARA matches against binary data.
|
|
3
|
+
"""
|
|
2
4
|
import re
|
|
3
5
|
from typing import Iterator, Optional
|
|
4
6
|
|
|
@@ -16,11 +18,8 @@ class BytesMatch:
|
|
|
16
18
|
"""
|
|
17
19
|
Simple class to keep track of regex matches against binary data.
|
|
18
20
|
|
|
19
|
-
Basically
|
|
20
|
-
the surrounding_bytes property.
|
|
21
|
-
|
|
22
|
-
pre_capture_len and post_capture_len refer to the regex sections before and after the capture group,
|
|
23
|
-
e.g. a regex like '123(.*)x:' would have pre_capture_len of 3 and post_capture_len of 2.
|
|
21
|
+
Basically a Regex `re.match` object with some (not many) extra bells and whistles, most notably
|
|
22
|
+
the `surrounding_bytes` property.
|
|
24
23
|
"""
|
|
25
24
|
|
|
26
25
|
def __init__(
|
|
@@ -34,15 +33,15 @@ class BytesMatch:
|
|
|
34
33
|
highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
|
|
35
34
|
) -> None:
|
|
36
35
|
"""
|
|
37
|
-
Initialize a BytesMatch object representing a match against binary data.
|
|
36
|
+
Initialize a `BytesMatch` object representing a match against binary data.
|
|
38
37
|
|
|
39
38
|
Args:
|
|
40
39
|
matched_against (bytes): The full byte sequence that was searched.
|
|
41
40
|
start_idx (int): Start index of the match in the byte sequence.
|
|
42
41
|
length (int): Length of the match in bytes.
|
|
43
42
|
label (str): Label for the match (e.g., regex or YARA rule name).
|
|
44
|
-
ordinal (int):
|
|
45
|
-
match (Optional[re.Match]): Regex match object, if available.
|
|
43
|
+
ordinal (int): This was the Nth match for this pattern (used for labeling only).
|
|
44
|
+
match (Optional[re.Match]): Regex `match` object, if available.
|
|
46
45
|
highlight_style (str): Style to use for highlighting the match.
|
|
47
46
|
"""
|
|
48
47
|
self.matched_against: bytes = matched_against
|
|
@@ -71,16 +70,16 @@ class BytesMatch:
|
|
|
71
70
|
highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
|
|
72
71
|
) -> 'BytesMatch':
|
|
73
72
|
"""
|
|
74
|
-
|
|
73
|
+
Alternate constructor to build a `BytesMatch` from a regex match object.
|
|
75
74
|
|
|
76
75
|
Args:
|
|
77
76
|
matched_against (bytes): The bytes searched.
|
|
78
|
-
match (re.Match): The regex match object.
|
|
79
|
-
ordinal (int):
|
|
77
|
+
match (re.Match): The regex `match` object.
|
|
78
|
+
ordinal (int): This was the Nth match for this pattern (used for labeling only).
|
|
80
79
|
highlight_style (str): Style for highlighting.
|
|
81
80
|
|
|
82
81
|
Returns:
|
|
83
|
-
BytesMatch: The constructed BytesMatch instance.
|
|
82
|
+
BytesMatch: The constructed `BytesMatch` instance.
|
|
84
83
|
"""
|
|
85
84
|
return cls(matched_against, match.start(), len(match[0]), match.re.pattern, ordinal, match, highlight_style)
|
|
86
85
|
|
|
@@ -95,7 +94,7 @@ class BytesMatch:
|
|
|
95
94
|
highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
|
|
96
95
|
) -> 'BytesMatch':
|
|
97
96
|
"""
|
|
98
|
-
|
|
97
|
+
Alternate constructor to build a `BytesMatch` from a YARA string match instance.
|
|
99
98
|
|
|
100
99
|
Args:
|
|
101
100
|
matched_against (bytes): The bytes searched.
|
|
@@ -132,7 +131,7 @@ class BytesMatch:
|
|
|
132
131
|
highlight_style: str = YaralyzerConfig.HIGHLIGHT_STYLE
|
|
133
132
|
) -> Iterator['BytesMatch']:
|
|
134
133
|
"""
|
|
135
|
-
Yield a BytesMatch for each string returned as part of a YARA match result dict.
|
|
134
|
+
Yield a `BytesMatch` for each string returned as part of a YARA match result dict.
|
|
136
135
|
|
|
137
136
|
Args:
|
|
138
137
|
matched_against (bytes): The bytes searched.
|
|
@@ -160,7 +159,7 @@ class BytesMatch:
|
|
|
160
159
|
|
|
161
160
|
def style_at_position(self, idx) -> str:
|
|
162
161
|
"""
|
|
163
|
-
Get the style for the byte at position idx within the matched bytes.
|
|
162
|
+
Get the style for the byte at position `idx` within the matched bytes.
|
|
164
163
|
|
|
165
164
|
Args:
|
|
166
165
|
idx (int): Index within the surrounding bytes.
|
|
@@ -175,7 +174,7 @@ class BytesMatch:
|
|
|
175
174
|
|
|
176
175
|
def location(self) -> Text:
|
|
177
176
|
"""
|
|
178
|
-
Get a styled Text object describing the start and end index of the match.
|
|
177
|
+
Get a styled `Text` object describing the start and end index of the match.
|
|
179
178
|
|
|
180
179
|
Returns:
|
|
181
180
|
Text: Rich Text object like '(start idx: 348190, end idx: 348228)'.
|
|
@@ -196,11 +195,11 @@ class BytesMatch:
|
|
|
196
195
|
"""
|
|
197
196
|
Determine if the matched bytes should be decoded.
|
|
198
197
|
|
|
199
|
-
Whether the bytes are decodable depends on whether SUPPRESS_DECODES_TABLE is set
|
|
200
|
-
and whether the match length is between MIN
|
|
198
|
+
Whether the bytes are decodable depends on whether `SUPPRESS_DECODES_TABLE` is set
|
|
199
|
+
and whether the match length is between `MIN`/`MAX_DECODE_LENGTH`.
|
|
201
200
|
|
|
202
201
|
Returns:
|
|
203
|
-
bool: True if decodable, False otherwise.
|
|
202
|
+
bool: `True` if decodable, `False` otherwise.
|
|
204
203
|
"""
|
|
205
204
|
return self.match_length >= YaralyzerConfig.args.min_decode_length \
|
|
206
205
|
and self.match_length <= YaralyzerConfig.args.max_decode_length \
|
|
@@ -211,7 +210,7 @@ class BytesMatch:
|
|
|
211
210
|
Build a table of MD5/SHA hashes for the matched bytes.
|
|
212
211
|
|
|
213
212
|
Returns:
|
|
214
|
-
Table: Rich Table object with hashes.
|
|
213
|
+
Table: Rich `Table` object with hashes.
|
|
215
214
|
"""
|
|
216
215
|
return bytes_hashes_table(
|
|
217
216
|
self.bytes,
|
|
@@ -224,7 +223,7 @@ class BytesMatch:
|
|
|
224
223
|
Generate a message for when the match is too short or too long to decode.
|
|
225
224
|
|
|
226
225
|
Returns:
|
|
227
|
-
Text: Rich Text object with the suppression notice.
|
|
226
|
+
Text: Rich `Text` object with the suppression notice.
|
|
228
227
|
"""
|
|
229
228
|
txt = self.__rich__()
|
|
230
229
|
|
|
@@ -238,7 +237,7 @@ class BytesMatch:
|
|
|
238
237
|
|
|
239
238
|
def to_json(self) -> dict:
|
|
240
239
|
"""
|
|
241
|
-
Convert this BytesMatch to a JSON-serializable dictionary.
|
|
240
|
+
Convert this `BytesMatch` to a JSON-serializable dictionary.
|
|
242
241
|
|
|
243
242
|
Returns:
|
|
244
243
|
dict: Dictionary representation of the match, suitable for JSON serialization.
|
|
@@ -275,7 +274,7 @@ class BytesMatch:
|
|
|
275
274
|
self.surrounding_bytes: bytes = self.matched_against[self.surrounding_start_idx:self.surrounding_end_idx]
|
|
276
275
|
|
|
277
276
|
def __rich__(self) -> Text:
|
|
278
|
-
"""Get a rich Text representation of the match for display."""
|
|
277
|
+
"""Get a rich `Text` representation of the match for display."""
|
|
279
278
|
headline = prefix_with_style(str(self.match_length), style='number', root_style='decode.subheading')
|
|
280
279
|
headline.append(f" bytes matching ")
|
|
281
280
|
headline.append(f"{self.label} ", style=ALERT_STYLE if self.highlight_style == ALERT_STYLE else 'regex')
|
|
@@ -18,19 +18,20 @@ MEGABYTE = 1024 * KILOBYTE
|
|
|
18
18
|
|
|
19
19
|
def config_var_name(env_var: str) -> str:
|
|
20
20
|
"""
|
|
21
|
-
Get the name of env_var and strip off
|
|
21
|
+
Get the name of `env_var` and strip off `YARALYZER_` prefix.
|
|
22
22
|
|
|
23
23
|
Example:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
24
|
+
```
|
|
25
|
+
SURROUNDING_BYTES_ENV_VAR = 'YARALYZER_SURROUNDING_BYTES'
|
|
26
|
+
config_var_name(SURROUNDING_BYTES_ENV_VAR) => 'SURROUNDING_BYTES'
|
|
27
|
+
```
|
|
27
28
|
"""
|
|
28
29
|
env_var = env_var.removeprefix("YARALYZER_")
|
|
29
30
|
return f'{env_var=}'.partition('=')[0]
|
|
30
31
|
|
|
31
32
|
|
|
32
|
-
def is_env_var_set_and_not_false(var_name):
|
|
33
|
-
"""Return True if var_name is not empty and set to anything other than
|
|
33
|
+
def is_env_var_set_and_not_false(var_name: str) -> bool:
|
|
34
|
+
"""Return `True` if `var_name` is not empty and set to anything other than "false" (capitalization agnostic)."""
|
|
34
35
|
if var_name in environ:
|
|
35
36
|
var_value = environ[var_name]
|
|
36
37
|
return var_value is not None and len(var_value) > 0 and var_value.lower() != 'false'
|
|
@@ -38,8 +39,8 @@ def is_env_var_set_and_not_false(var_name):
|
|
|
38
39
|
return False
|
|
39
40
|
|
|
40
41
|
|
|
41
|
-
def is_invoked_by_pytest():
|
|
42
|
-
"""Return
|
|
42
|
+
def is_invoked_by_pytest() -> bool:
|
|
43
|
+
"""Return `True` if invoked in a `pytest` context."""
|
|
43
44
|
return is_env_var_set_and_not_false(PYTEST_FLAG)
|
|
44
45
|
|
|
45
46
|
|
|
@@ -84,13 +85,13 @@ class YaralyzerConfig:
|
|
|
84
85
|
|
|
85
86
|
@classmethod
|
|
86
87
|
def set_argument_parser(cls, parser: ArgumentParser) -> None:
|
|
87
|
-
"""Sets the _argument_parser instance variable that will be used to parse command line args."""
|
|
88
|
+
"""Sets the `_argument_parser` instance variable that will be used to parse command line args."""
|
|
88
89
|
cls._argument_parser: ArgumentParser = parser
|
|
89
90
|
cls._argparse_keys: List[str] = sorted([action.dest for action in parser._actions])
|
|
90
91
|
|
|
91
92
|
@classmethod
|
|
92
93
|
def set_args(cls, args: Namespace) -> None:
|
|
93
|
-
"""Set the args class instance variable and update args with any environment variable overrides."""
|
|
94
|
+
"""Set the `args` class instance variable and update args with any environment variable overrides."""
|
|
94
95
|
cls.args = args
|
|
95
96
|
|
|
96
97
|
for option in cls._argparse_keys:
|
|
@@ -115,11 +116,11 @@ class YaralyzerConfig:
|
|
|
115
116
|
|
|
116
117
|
@classmethod
|
|
117
118
|
def set_default_args(cls):
|
|
118
|
-
"""Set args to their defaults as if parsed from the command line."""
|
|
119
|
+
"""Set `self.args` to their defaults as if parsed from the command line."""
|
|
119
120
|
cls.set_args(cls._argument_parser.parse_args(['dummy']))
|
|
120
121
|
|
|
121
122
|
@classmethod
|
|
122
123
|
def get_default_arg(cls, arg: str) -> Any:
|
|
123
|
-
"""Return the default value for arg as defined by a DEFAULT_ style class variable."""
|
|
124
|
+
"""Return the default value for `arg` as defined by a `DEFAULT_` style class variable."""
|
|
124
125
|
default_var = f"DEFAULT_{arg.upper()}"
|
|
125
126
|
return vars(cls).get(default_var)
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
`BytesDecoder` class for attempting to decode bytes with various encodings.
|
|
3
|
+
"""
|
|
2
4
|
from collections import defaultdict
|
|
3
5
|
from copy import deepcopy
|
|
4
6
|
from operator import attrgetter
|
|
@@ -31,31 +33,32 @@ SCORE_SCALER = 100.0
|
|
|
31
33
|
|
|
32
34
|
class BytesDecoder:
|
|
33
35
|
"""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
Handles decoding a chunk of bytes into strings using various possible encodings, ranking and displaying results.
|
|
37
|
+
|
|
38
|
+
This class leverages the `chardet` library and custom logic to try multiple encodings, track decoding outcomes,
|
|
39
|
+
and present the results in a rich, user-friendly format. It is used to analyze and display the possible
|
|
40
|
+
interpretations of a byte sequence, especially in the context of YARA matches or binary analysis.
|
|
41
|
+
|
|
42
|
+
Attributes:
|
|
43
|
+
bytes_match (BytesMatch): The `BytesMatch` instance being decoded.
|
|
44
|
+
bytes (bytes): The bytes (including surrounding context) to decode.
|
|
45
|
+
label (str): Label for this decoding attempt.
|
|
46
|
+
was_match_decodable (dict): Tracks successful decodes per encoding.
|
|
47
|
+
was_match_force_decoded (dict): Tracks forced decodes per encoding.
|
|
48
|
+
was_match_undecodable (dict): Tracks failed decodes per encoding.
|
|
49
|
+
decoded_strings (dict): Maps encoding to decoded string.
|
|
50
|
+
undecoded_rows (list): Stores undecoded table rows.
|
|
51
|
+
decodings (list): List of DecodingAttempt objects for each encoding tried.
|
|
52
|
+
encoding_detector (EncodingDetector): Used to detect and assess possible encodings.
|
|
38
53
|
"""
|
|
39
54
|
|
|
40
55
|
def __init__(self, bytes_match: 'BytesMatch', label: Optional[str] = None) -> None:
|
|
41
56
|
"""
|
|
42
|
-
Initialize a BytesDecoder for attempting to decode a chunk of bytes using various encodings.
|
|
57
|
+
Initialize a `BytesDecoder` for attempting to decode a chunk of bytes using various encodings.
|
|
43
58
|
|
|
44
59
|
Args:
|
|
45
|
-
bytes_match (BytesMatch): The BytesMatch object containing the bytes to decode and match metadata.
|
|
60
|
+
bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
|
|
46
61
|
label (Optional[str], optional): Optional label for this decoding attempt. Defaults to the match label.
|
|
47
|
-
|
|
48
|
-
Attributes:
|
|
49
|
-
bytes_match (BytesMatch): The BytesMatch instance being decoded.
|
|
50
|
-
bytes (bytes): The bytes (including surrounding context) to decode.
|
|
51
|
-
label (str): Label for this decoding attempt.
|
|
52
|
-
was_match_decodable (dict): Tracks successful decodes per encoding.
|
|
53
|
-
was_match_force_decoded (dict): Tracks forced decodes per encoding.
|
|
54
|
-
was_match_undecodable (dict): Tracks failed decodes per encoding.
|
|
55
|
-
decoded_strings (dict): Maps encoding to decoded string.
|
|
56
|
-
undecoded_rows (list): Stores undecoded table rows.
|
|
57
|
-
decodings (list): List of DecodingAttempt objects for each encoding tried.
|
|
58
|
-
encoding_detector (EncodingDetector): Used to detect and assess possible encodings.
|
|
59
62
|
"""
|
|
60
63
|
self.bytes_match = bytes_match
|
|
61
64
|
self.bytes = bytes_match.surrounding_bytes
|
|
@@ -92,7 +95,12 @@ class BytesDecoder:
|
|
|
92
95
|
yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')
|
|
93
96
|
|
|
94
97
|
def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
|
|
95
|
-
"""
|
|
98
|
+
"""
|
|
99
|
+
First rows are the raw / hex views of the bytes, next rows are the attempted decodings.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
suppress_decodes (bool, optional): If `True` don't add decoding attempts to the table. Defaults to `False`.
|
|
103
|
+
"""
|
|
96
104
|
self.table = new_decoding_attempts_table(self.bytes_match)
|
|
97
105
|
|
|
98
106
|
# Add the encoding rows to the table if not suppressed
|
|
@@ -124,15 +132,15 @@ class BytesDecoder:
|
|
|
124
132
|
return self._undecoded_assessments(self.encoding_detector.force_display_assessments)
|
|
125
133
|
|
|
126
134
|
def _undecoded_assessments(self, assessments: List[EncodingAssessment]) -> List[EncodingAssessment]:
|
|
127
|
-
"""Filter out the already decoded assessments from a set of assessments"""
|
|
135
|
+
"""Filter out the already decoded assessments from a set of assessments."""
|
|
128
136
|
return [a for a in assessments if not self._was_decoded(a.encoding)]
|
|
129
137
|
|
|
130
138
|
def _was_decoded(self, encoding: str) -> bool:
|
|
131
|
-
"""Check whether a given encoding is in the table already"""
|
|
139
|
+
"""Check whether a given encoding is in the table already."""
|
|
132
140
|
return any(row.encoding == encoding for row in self.decodings)
|
|
133
141
|
|
|
134
142
|
def _decode_attempt_subheading(self) -> Panel:
|
|
135
|
-
"""Generate a rich.Panel for displaying decode attempts"""
|
|
143
|
+
"""Generate a rich.Panel for displaying decode attempts."""
|
|
136
144
|
headline = Text(f"Found ", style='decode.subheading') + self.bytes_match.__rich__()
|
|
137
145
|
return Panel(headline, style='decode.subheading', expand=False)
|
|
138
146
|
|
|
@@ -149,7 +157,7 @@ class BytesDecoder:
|
|
|
149
157
|
self.was_match_force_decoded[decoding.encoding] += 1
|
|
150
158
|
|
|
151
159
|
def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
|
|
152
|
-
"""Create a DecodingAttemptTable row from a DecodingAttempt
|
|
160
|
+
"""Create a `DecodingAttemptTable` row from a `DecodingAttempt`."""
|
|
153
161
|
assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)
|
|
154
162
|
|
|
155
163
|
# If the decoding can have a start offset add an appropriate extension to the encoding label
|
|
@@ -184,7 +192,7 @@ class BytesDecoder:
|
|
|
184
192
|
|
|
185
193
|
|
|
186
194
|
def _build_encodings_metric_dict():
|
|
187
|
-
"""One key for each key in ENCODINGS_TO_ATTEMPT
|
|
195
|
+
"""One key for each key in `ENCODINGS_TO_ATTEMPT`, values are all 0."""
|
|
188
196
|
metrics_dict = defaultdict(lambda: 0)
|
|
189
197
|
|
|
190
198
|
for encoding in ENCODINGS_TO_ATTEMPT.keys():
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
Class to manage attempting to decode a chunk of bytes into strings with a given encoding.
|
|
3
|
+
"""
|
|
2
4
|
from sys import byteorder
|
|
3
5
|
from typing import Optional
|
|
4
6
|
|
|
@@ -15,10 +17,33 @@ from yaralyzer.util.logging import log
|
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
class DecodingAttempt:
|
|
18
|
-
"""
|
|
20
|
+
"""
|
|
21
|
+
Manages the process of attempting to decode a chunk of bytes into a string using a specified encoding.
|
|
22
|
+
|
|
23
|
+
This class tries to decode the bytes using the provided encoding, handling both standard and custom decoding
|
|
24
|
+
strategies (including multi-byte encodings and forced decoding attempts). It tracks the outcome, highlights
|
|
25
|
+
the decoded output, and provides information about the decoding process.
|
|
26
|
+
|
|
27
|
+
Attributes:
|
|
28
|
+
bytes (bytes): The bytes (including context) to decode.
|
|
29
|
+
bytes_match (BytesMatch): The `BytesMatch` object containing match and context info.
|
|
30
|
+
encoding (str): The encoding to attempt.
|
|
31
|
+
encoding_label (str): Label for the encoding (may include offset info).
|
|
32
|
+
start_offset (int): Byte offset used for decoding (for multi-byte encodings).
|
|
33
|
+
start_offset_label (Optional[str]): String label for the offset, if used.
|
|
34
|
+
was_force_decoded (bool): True if a forced decode was attempted.
|
|
35
|
+
failed_to_decode (bool): True if decoding failed.
|
|
36
|
+
decoded_string (Text): The decoded string as a Rich `Text` object (with highlighting).
|
|
37
|
+
"""
|
|
19
38
|
|
|
20
39
|
def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
|
|
21
|
-
|
|
40
|
+
"""
|
|
41
|
+
Initialize a `DecodingAttempt` for a specific `encoding` on a given `BytesMatch`.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to decode and match metadata.
|
|
45
|
+
encoding (str): The encoding to attempt for decoding the bytes.
|
|
46
|
+
"""
|
|
22
47
|
self.bytes = bytes_match.surrounding_bytes
|
|
23
48
|
self.bytes_match = bytes_match
|
|
24
49
|
self.encoding = encoding
|
|
@@ -30,15 +55,11 @@ class DecodingAttempt:
|
|
|
30
55
|
self.failed_to_decode = False
|
|
31
56
|
self.decoded_string = self._decode_bytes()
|
|
32
57
|
|
|
33
|
-
def is_wide_utf_encoding(self) -> bool:
|
|
34
|
-
"""Returns True if the encoding is UTF-16 or UTF-32."""
|
|
35
|
-
return is_wide_utf(self.encoding)
|
|
36
|
-
|
|
37
58
|
def _decode_bytes(self) -> Text:
|
|
38
59
|
"""
|
|
39
|
-
Tries builtin decode, hands off to other methods for harsher
|
|
40
|
-
|
|
41
|
-
|
|
60
|
+
Tries builtin decode, hands off to other methods for harsher treatment (byte shifting for
|
|
61
|
+
UTF-16/32 and custom decode for the rest) if that fails. Has side effect of setting
|
|
62
|
+
`self.decoded_string` value.
|
|
42
63
|
"""
|
|
43
64
|
try:
|
|
44
65
|
decoded_string = self._to_rich_text(escape(self.bytes.decode(self.encoding)))
|
|
@@ -52,13 +73,15 @@ class DecodingAttempt:
|
|
|
52
73
|
|
|
53
74
|
self.was_force_decoded = True
|
|
54
75
|
|
|
55
|
-
if self.
|
|
76
|
+
if is_wide_utf(self.encoding):
|
|
56
77
|
return self._decode_utf_multibyte()
|
|
57
78
|
else:
|
|
58
|
-
return self.
|
|
79
|
+
return self._custom_utf_decode()
|
|
59
80
|
|
|
60
|
-
def
|
|
61
|
-
"""
|
|
81
|
+
def _custom_utf_decode(self) -> Text:
|
|
82
|
+
"""
|
|
83
|
+
Returns a `Text` obj representing an attempt to force a UTF-8 encoding onto an array of bytes.
|
|
84
|
+
"""
|
|
62
85
|
log.info(f"Custom decoding {self.bytes_match} with {self.encoding}...")
|
|
63
86
|
unprintable_char_map = ENCODINGS_TO_ATTEMPT.get(self.encoding)
|
|
64
87
|
output = Text('', style='bytes.decoded')
|
|
@@ -116,7 +139,13 @@ class DecodingAttempt:
|
|
|
116
139
|
return output
|
|
117
140
|
|
|
118
141
|
def _decode_utf_multibyte(self) -> Text:
|
|
119
|
-
"""
|
|
142
|
+
"""
|
|
143
|
+
UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte
|
|
144
|
+
so we try several offsets until we find one that at least kind of works.
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
Text: Rich `Text` object representing the decoded string with highlighting.
|
|
148
|
+
"""
|
|
120
149
|
char_width = encoding_width(self.encoding)
|
|
121
150
|
last_exception = None
|
|
122
151
|
decoded_str = None
|
|
@@ -146,7 +175,15 @@ class DecodingAttempt:
|
|
|
146
175
|
return self._failed_to_decode_msg_txt(last_exception)
|
|
147
176
|
|
|
148
177
|
def _to_rich_text(self, _string: str, bytes_offset: int = 0) -> Text:
|
|
149
|
-
"""
|
|
178
|
+
"""
|
|
179
|
+
Convert a decoded string to highlighted `Text` representation.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
_string (str): The decoded string to convert.
|
|
183
|
+
bytes_offset (int): The byte offset used during decoding (for multi-byte encodings).
|
|
184
|
+
Returns:
|
|
185
|
+
Text: The rich `Text` representation of the decoded string with appropriate highlighting.
|
|
186
|
+
"""
|
|
150
187
|
# Adjust where we start the highlighting given the multibyte nature of the encodings
|
|
151
188
|
log.debug(f"Stepping through {self.encoding} encoded string...")
|
|
152
189
|
txt = Text('', style=self.bytes_match.style_at_position(0))
|
|
@@ -160,7 +197,7 @@ class DecodingAttempt:
|
|
|
160
197
|
is_single_byte_encoding = False
|
|
161
198
|
unprintable_chars = {}
|
|
162
199
|
|
|
163
|
-
for
|
|
200
|
+
for _i, c in enumerate(_string):
|
|
164
201
|
char_bytes = bytes(c, self.encoding)
|
|
165
202
|
char_width = len(char_bytes)
|
|
166
203
|
style = self.bytes_match.style_at_position(current_byte_idx + bytes_offset)
|
|
@@ -180,6 +217,6 @@ class DecodingAttempt:
|
|
|
180
217
|
return txt
|
|
181
218
|
|
|
182
219
|
def _failed_to_decode_msg_txt(self, exception: Optional[Exception]) -> Text:
|
|
183
|
-
"""Set failed_to_decode flag and return a Text object with the error message."""
|
|
220
|
+
"""Set `self.failed_to_decode` flag and return a `Text` object with the error message."""
|
|
184
221
|
self.failed_to_decode = True
|
|
185
222
|
return prefix_with_style(f"(decode failed: {exception})", style='red dim italic')
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Constants related to character encodings.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
Helpful links:
|
|
5
|
+
|
|
6
|
+
* ISO-8859: [www.mit.edu/people/kenta/two/iso8859.html](https://www.mit.edu/people/kenta/two/iso8859.html)
|
|
7
|
+
|
|
8
|
+
* UTF-8: [www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec](https://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=dec) # noqa: E501
|
|
6
9
|
"""
|
|
7
10
|
|
|
8
11
|
# Bytes (TODO: why is this here?)
|
|
@@ -69,10 +72,10 @@ UNPRINTABLE_ASCII = {
|
|
|
69
72
|
}
|
|
70
73
|
|
|
71
74
|
|
|
72
|
-
def scrub_c1_control_chars(char_map):
|
|
75
|
+
def scrub_c1_control_chars(char_map: dict) -> None:
|
|
73
76
|
"""
|
|
74
|
-
Fill in a dict with integer keys/values corresponding to where a given char encoding has no chars
|
|
75
|
-
because this range is for C1 control chars (AKA the
|
|
77
|
+
Fill in a `dict` with integer keys/values corresponding to where a given char encoding has no chars
|
|
78
|
+
because this range is for C1 control chars (AKA the "undefined" part of most character maps).
|
|
76
79
|
"""
|
|
77
80
|
for i in range(128, 160):
|
|
78
81
|
char_map[i] = f"C1.CHAR{i}"
|
|
@@ -164,7 +167,7 @@ WIDE_UTF_ENCODINGS = {
|
|
|
164
167
|
|
|
165
168
|
|
|
166
169
|
def encoding_offsets(encoding: str) -> list:
|
|
167
|
-
"""Get possible offsets for a given encoding. If the encoding is not in WIDE_UTF_ENCODINGS
|
|
170
|
+
"""Get possible offsets for a given encoding. If the encoding is not in `WIDE_UTF_ENCODINGS`, return `[0]`."""
|
|
168
171
|
return WIDE_UTF_ENCODINGS.get(encoding, [0])
|
|
169
172
|
|
|
170
173
|
|