yaralyzer 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of yaralyzer might be problematic. Click here for more details.
- CHANGELOG.md +3 -0
- yaralyzer/__init__.py +1 -4
- yaralyzer/bytes_match.py +23 -24
- yaralyzer/config.py +13 -12
- yaralyzer/decoding/bytes_decoder.py +33 -25
- yaralyzer/decoding/decoding_attempt.py +55 -18
- yaralyzer/encoding_detection/character_encodings.py +9 -6
- yaralyzer/encoding_detection/encoding_assessment.py +26 -6
- yaralyzer/encoding_detection/encoding_detector.py +39 -10
- yaralyzer/helpers/bytes_helper.py +19 -18
- yaralyzer/helpers/file_helper.py +20 -13
- yaralyzer/helpers/rich_text_helper.py +10 -9
- yaralyzer/output/decoding_attempts_table.py +43 -9
- yaralyzer/output/file_export.py +23 -7
- yaralyzer/output/file_hashes_table.py +9 -8
- yaralyzer/output/regex_match_metrics.py +28 -6
- yaralyzer/output/rich_console.py +19 -17
- yaralyzer/util/argument_parser.py +11 -3
- yaralyzer/util/logging.py +31 -16
- yaralyzer/yara/yara_match.py +40 -17
- yaralyzer/yara/yara_rule_builder.py +55 -11
- yaralyzer/yaralyzer.py +90 -20
- {yaralyzer-1.0.8.dist-info → yaralyzer-1.0.9.dist-info}/METADATA +5 -6
- yaralyzer-1.0.9.dist-info/RECORD +32 -0
- yaralyzer-1.0.8.dist-info/RECORD +0 -32
- {yaralyzer-1.0.8.dist-info → yaralyzer-1.0.9.dist-info}/LICENSE +0 -0
- {yaralyzer-1.0.8.dist-info → yaralyzer-1.0.9.dist-info}/WHEEL +0 -0
- {yaralyzer-1.0.8.dist-info → yaralyzer-1.0.9.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
`EncodingDetector` class for managing chardet encoding detection.
|
|
3
|
+
"""
|
|
2
4
|
from operator import attrgetter
|
|
3
5
|
from typing import List
|
|
4
6
|
|
|
@@ -16,17 +18,36 @@ CONFIDENCE_SCORE_RANGE = range(0, 101)
|
|
|
16
18
|
|
|
17
19
|
class EncodingDetector:
|
|
18
20
|
"""
|
|
19
|
-
Manager class to ease dealing with the
|
|
20
|
-
|
|
21
|
+
Manager class to ease dealing with the encoding detection library `chardet`.
|
|
22
|
+
|
|
23
|
+
Each instance of this class manages a `chardet.detect_all()` scan on a single set of bytes.
|
|
24
|
+
|
|
25
|
+
Attributes:
|
|
26
|
+
bytes (bytes): The bytes to analyze.
|
|
27
|
+
bytes_len (int): The length of the bytes.
|
|
28
|
+
table (Table): A rich `Table` object summarizing the chardet results.
|
|
29
|
+
assessments (List[EncodingAssessment]): List of `EncodingAssessment` objects from `chardet` results.
|
|
30
|
+
unique_assessments (List[EncodingAssessment]): Unique assessments by encoding, highest confidence only.
|
|
31
|
+
raw_chardet_assessments (List[dict]): Raw list of dicts returned by `chardet.detect_all()`.
|
|
32
|
+
force_decode_assessments (List[EncodingAssessment]): Assessments above force decode threshold.
|
|
33
|
+
force_display_assessments (List[EncodingAssessment]): Assessments above force display threshold.
|
|
34
|
+
has_any_idea (Optional[bool]): `True` if `chardet` had any idea what the encoding might be,
|
|
35
|
+
`False` if not, `None` if `chardet` wasn't run yet.
|
|
36
|
+
force_display_threshold (float): `[class variable]` Default confidence threshold for forcing display
|
|
37
|
+
in decoded table.
|
|
38
|
+
force_decode_threshold (float): `[class variable]` Default confidence threshold for forcing a decode attempt.
|
|
21
39
|
"""
|
|
22
40
|
|
|
23
41
|
# Default value for encodings w/confidences below this will not be displayed in the decoded table
|
|
24
42
|
force_display_threshold = 20.0
|
|
25
|
-
|
|
26
43
|
# Default value for what chardet.detect() confidence % should we force a decode with an obscure encoding.
|
|
27
44
|
force_decode_threshold = 50.0
|
|
28
45
|
|
|
29
46
|
def __init__(self, _bytes: bytes) -> None:
|
|
47
|
+
"""
|
|
48
|
+
Args:
|
|
49
|
+
_bytes (bytes): The bytes to analyze with `chardet`.
|
|
50
|
+
"""
|
|
30
51
|
self.bytes = _bytes
|
|
31
52
|
self.bytes_len = len(_bytes)
|
|
32
53
|
self.table = _empty_chardet_results_table()
|
|
@@ -55,12 +76,20 @@ class EncodingDetector:
|
|
|
55
76
|
self.force_display_assessments = self.assessments_above_confidence(type(self).force_display_threshold)
|
|
56
77
|
|
|
57
78
|
def get_encoding_assessment(self, encoding: str) -> EncodingAssessment:
|
|
58
|
-
"""
|
|
79
|
+
"""
|
|
80
|
+
Get the `chardet` assessment for a specific encoding.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
encoding (str): The encoding to look for.
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
EncodingAssessment: Assessment for the given encoding if it exists, otherwise a dummy with 0 confidence.
|
|
87
|
+
"""
|
|
59
88
|
assessment = next((r for r in self.unique_assessments if r.encoding == encoding), None)
|
|
60
89
|
return assessment or EncodingAssessment.dummy_encoding_assessment(encoding)
|
|
61
90
|
|
|
62
91
|
def has_enough_bytes(self) -> bool:
|
|
63
|
-
"""Return
|
|
92
|
+
"""Return `True` if we have enough bytes to run `chardet.detect()`."""
|
|
64
93
|
return self.bytes_len >= YaralyzerConfig.args.min_chardet_bytes
|
|
65
94
|
|
|
66
95
|
def assessments_above_confidence(self, cutoff: float) -> List[EncodingAssessment]:
|
|
@@ -71,7 +100,7 @@ class EncodingDetector:
|
|
|
71
100
|
return Padding(self.table, (0, 0, 0, 0))
|
|
72
101
|
|
|
73
102
|
def _uniquify_results_and_build_table(self) -> None:
|
|
74
|
-
"""Keep the highest result per encoding, ignoring the language chardet has indicated."""
|
|
103
|
+
"""Keep the highest result per encoding, ignoring the language `chardet` has indicated."""
|
|
75
104
|
already_seen_encodings = {}
|
|
76
105
|
|
|
77
106
|
for i, result in enumerate(self.assessments):
|
|
@@ -91,7 +120,7 @@ class EncodingDetector:
|
|
|
91
120
|
self.unique_assessments.sort(key=attrgetter('confidence'), reverse=True)
|
|
92
121
|
|
|
93
122
|
def _set_empty_results(self) -> None:
|
|
94
|
-
"""Set empty results for when chardet can't help us."""
|
|
123
|
+
"""Set empty results for when `chardet` can't help us."""
|
|
95
124
|
self.assessments = []
|
|
96
125
|
self.unique_assessments = []
|
|
97
126
|
self.raw_chardet_assessments = []
|
|
@@ -99,8 +128,8 @@ class EncodingDetector:
|
|
|
99
128
|
self.force_display_assessments = []
|
|
100
129
|
|
|
101
130
|
|
|
102
|
-
def _empty_chardet_results_table():
|
|
103
|
-
"""Returns
|
|
131
|
+
def _empty_chardet_results_table() -> Table:
|
|
132
|
+
"""Returns an empty `Table` with appropriate columns for `chardet` results."""
|
|
104
133
|
table = Table(
|
|
105
134
|
'Rank', 'Encoding', 'Confidence',
|
|
106
135
|
title='chardet.detect results',
|
|
@@ -25,14 +25,15 @@ HEX_CHARS_PER_LINE = HEX_CHARS_PER_GROUP * HEX_GROUPS_PER_LINE
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def get_bytes_before_and_after_match(_bytes: bytes, match: re.Match, num_before=None, num_after=None) -> bytes:
|
|
28
|
-
|
|
28
|
+
"""
|
|
29
29
|
Get bytes before and after a regex match within a byte sequence.
|
|
30
30
|
|
|
31
31
|
Args:
|
|
32
32
|
_bytes (bytes): The full byte sequence.
|
|
33
|
-
match (re.Match): The regex
|
|
34
|
-
num_before (int, optional): Number of bytes before the match to include. Defaults to
|
|
35
|
-
num_after (int, optional): Number of bytes after the match to include. Defaults to either
|
|
33
|
+
match (re.Match): The regex `Match` object.
|
|
34
|
+
num_before (int, optional): Number of bytes before the match to include. Defaults to configured value.
|
|
35
|
+
num_after (int, optional): Number of bytes after the match to include. Defaults to either configured value
|
|
36
|
+
or the `num_before` arg value.
|
|
36
37
|
|
|
37
38
|
Returns:
|
|
38
39
|
bytes: The surrounding bytes including the match.
|
|
@@ -41,15 +42,15 @@ def get_bytes_before_and_after_match(_bytes: bytes, match: re.Match, num_before=
|
|
|
41
42
|
|
|
42
43
|
|
|
43
44
|
def get_bytes_surrounding_range(_bytes: bytes, start_idx: int, end_idx: int, num_before=None, num_after=None) -> bytes:
|
|
44
|
-
|
|
45
|
+
"""
|
|
45
46
|
Get bytes surrounding a specified range in a byte sequence.
|
|
46
47
|
|
|
47
48
|
Args:
|
|
48
49
|
_bytes (bytes): The full byte sequence.
|
|
49
50
|
start_idx (int): Start index of the range.
|
|
50
51
|
end_idx (int): End index of the range.
|
|
51
|
-
num_before (int, optional): Number of bytes before the range. Defaults to
|
|
52
|
-
num_after (int, optional): Number of bytes after the range. Defaults to
|
|
52
|
+
num_before (int, optional): Number of bytes before the range. Defaults to configured value.
|
|
53
|
+
num_after (int, optional): Number of bytes after the range. Defaults to configured value.
|
|
53
54
|
|
|
54
55
|
Returns:
|
|
55
56
|
bytes: The surrounding bytes including the range.
|
|
@@ -87,8 +88,8 @@ def clean_byte_string(bytes_array: bytes) -> str:
|
|
|
87
88
|
|
|
88
89
|
|
|
89
90
|
def rich_text_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
|
|
90
|
-
|
|
91
|
-
Return a rich Text object of raw bytes, highlighting the matched bytes.
|
|
91
|
+
"""
|
|
92
|
+
Return a rich `Text` object of raw bytes, highlighting the matched bytes.
|
|
92
93
|
|
|
93
94
|
Args:
|
|
94
95
|
_bytes (bytes): The full byte sequence.
|
|
@@ -110,7 +111,7 @@ def rich_text_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
|
|
|
110
111
|
|
|
111
112
|
|
|
112
113
|
def hex_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
|
|
113
|
-
|
|
114
|
+
"""
|
|
114
115
|
Return a hexadecimal view of raw bytes, highlighting the matched bytes.
|
|
115
116
|
|
|
116
117
|
Args:
|
|
@@ -129,7 +130,7 @@ def hex_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
|
|
|
129
130
|
|
|
130
131
|
|
|
131
132
|
def ascii_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
|
|
132
|
-
|
|
133
|
+
"""
|
|
133
134
|
Return an ASCII view of raw bytes, highlighting the matched bytes.
|
|
134
135
|
|
|
135
136
|
Args:
|
|
@@ -171,7 +172,7 @@ def ascii_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
|
|
|
171
172
|
|
|
172
173
|
|
|
173
174
|
def hex_text(_bytes: bytes) -> Text:
|
|
174
|
-
|
|
175
|
+
"""
|
|
175
176
|
Return a rich Text object of the hex string for the given bytes.
|
|
176
177
|
|
|
177
178
|
Args:
|
|
@@ -184,7 +185,7 @@ def hex_text(_bytes: bytes) -> Text:
|
|
|
184
185
|
|
|
185
186
|
|
|
186
187
|
def hex_string(_bytes: bytes) -> str:
|
|
187
|
-
|
|
188
|
+
"""
|
|
188
189
|
Return a hex string representation of the given bytes.
|
|
189
190
|
|
|
190
191
|
Args:
|
|
@@ -197,8 +198,8 @@ def hex_string(_bytes: bytes) -> str:
|
|
|
197
198
|
|
|
198
199
|
|
|
199
200
|
def print_bytes(bytes_array: bytes, style=None) -> None:
|
|
200
|
-
|
|
201
|
-
Print a string representation of bytes to the console.
|
|
201
|
+
"""
|
|
202
|
+
Print a string representation of some bytes to the console.
|
|
202
203
|
|
|
203
204
|
Args:
|
|
204
205
|
bytes_array (bytes): The bytes to print.
|
|
@@ -209,7 +210,7 @@ def print_bytes(bytes_array: bytes, style=None) -> None:
|
|
|
209
210
|
|
|
210
211
|
|
|
211
212
|
def truncate_for_encoding(_bytes: bytes, encoding: str) -> bytes:
|
|
212
|
-
|
|
213
|
+
"""
|
|
213
214
|
Truncate bytes to a multiple of the character width for the given encoding.
|
|
214
215
|
For example, for utf-16 this means truncating to a multiple of 2, for utf-32 to a multiple of 4.
|
|
215
216
|
|
|
@@ -246,8 +247,8 @@ def _find_str_rep_of_bytes(surrounding_bytes_str: str, highlighted_bytes_str: st
|
|
|
246
247
|
int: The index in the surrounding string where the highlighted bytes start, or -1 if not found.
|
|
247
248
|
"""
|
|
248
249
|
# Start a few chars in to avoid errors: sometimes we're searching for 1 or 2 bytes and there's a false positive
|
|
249
|
-
# in the extra bytes.
|
|
250
|
-
# check but this is almost certainly
|
|
250
|
+
# in the extra bytes. This isn't perfect - it's starting us at the first index into the *bytes* that's safe to
|
|
251
|
+
# check but this is almost certainly too soon given the large % of bytes that take 4 chars to print ('\x02' etc)
|
|
251
252
|
highlight_idx = surrounding_bytes_str.find(highlighted_bytes_str, highlighted_bytes.highlight_start_idx)
|
|
252
253
|
|
|
253
254
|
# TODO: Somehow \' and ' don't always come out the same :(
|
yaralyzer/helpers/file_helper.py
CHANGED
|
@@ -3,16 +3,21 @@ Helper methods to work with files.
|
|
|
3
3
|
"""
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from os import listdir, path
|
|
6
|
+
from pathlib import Path
|
|
6
7
|
from typing import List, Optional
|
|
7
8
|
|
|
8
9
|
|
|
9
|
-
def
|
|
10
|
-
"""
|
|
11
|
-
|
|
10
|
+
def files_in_dir(dir: Path | str, with_extname: Optional[str] = None) -> List[str]:
|
|
11
|
+
"""
|
|
12
|
+
Returns paths for all non dot files in `dir` (optionally filtered to only those ending in 'with_extname').
|
|
12
13
|
|
|
14
|
+
Args:
|
|
15
|
+
dir (str): Directory to list files from.
|
|
16
|
+
with_extname (Optional[str], optional): If set, only return files with this extension. Defaults to None.
|
|
13
17
|
|
|
14
|
-
|
|
15
|
-
|
|
18
|
+
Returns:
|
|
19
|
+
List[str]: List of file paths.
|
|
20
|
+
"""
|
|
16
21
|
files = [path.join(dir, path.basename(file)) for file in listdir(dir) if not file.startswith('.')]
|
|
17
22
|
files = [file for file in files if not path.isdir(file)]
|
|
18
23
|
|
|
@@ -23,20 +28,22 @@ def files_in_dir(dir: str, with_extname: Optional[str] = None) -> List[str]:
|
|
|
23
28
|
|
|
24
29
|
|
|
25
30
|
def files_with_extname(files: List[str], extname: str) -> List[str]:
|
|
31
|
+
"""Return only files from the list that end with the given `extname`."""
|
|
26
32
|
return [f for f in files if f.endswith(f".{extname}")]
|
|
27
33
|
|
|
28
34
|
|
|
29
|
-
def
|
|
30
|
-
"""
|
|
31
|
-
with open(file_path, 'r') as f:
|
|
32
|
-
return [line.rstrip().lstrip() for line in f.readlines()]
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def load_binary_data(file_path) -> bytes:
|
|
35
|
+
def load_binary_data(file_path: Path | str) -> bytes:
|
|
36
|
+
"""Load and return the raw `bytes` from a file."""
|
|
36
37
|
with open(file_path, 'rb') as f:
|
|
37
38
|
return f.read()
|
|
38
39
|
|
|
39
40
|
|
|
40
|
-
def load_file(file_path) -> str:
|
|
41
|
+
def load_file(file_path: Path | str) -> str:
|
|
42
|
+
"""Load and return the text contents of a file."""
|
|
41
43
|
with open(file_path, 'r') as f:
|
|
42
44
|
return f.read()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def timestamp_for_filename() -> str:
|
|
48
|
+
"""Returns a string showing current time in a file name friendly format."""
|
|
49
|
+
return datetime.now().strftime("%Y-%m-%dT%H.%M.%S")
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Methods to handle turning various objects into Rich text/table/etc representations
|
|
3
3
|
|
|
4
|
-
Rich
|
|
4
|
+
[Rich color names](https://rich.readthedocs.io/en/stable/appendix/colors.html)
|
|
5
5
|
TODO: interesting colors # row_styles[0] = 'reverse bold on color(144)' <-
|
|
6
6
|
"""
|
|
7
|
-
from typing import List, Union
|
|
7
|
+
from typing import List, Optional, Union
|
|
8
8
|
|
|
9
9
|
from rich.columns import Columns
|
|
10
10
|
from rich.panel import Panel
|
|
@@ -38,15 +38,16 @@ DECODING_ERRORS_MSG = Text('Yes', style='dark_red dim')
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def na_txt(style: Union[str, Style] = 'white'):
|
|
41
|
+
"""Standard N/A text for tables and such."""
|
|
41
42
|
return Text('N/A', style=style)
|
|
42
43
|
|
|
43
44
|
|
|
44
|
-
def prefix_with_style(_str: str, style: str, root_style=None) -> Text:
|
|
45
|
+
def prefix_with_style(_str: str, style: str, root_style: Optional[Union[Style, str]] = None) -> Text:
|
|
45
46
|
"""Sometimes you need a Text() object to start plain lest the underline or whatever last forever."""
|
|
46
47
|
return Text('', style=root_style or 'white') + Text(_str, style)
|
|
47
48
|
|
|
48
49
|
|
|
49
|
-
def meter_style(meter_pct):
|
|
50
|
+
def meter_style(meter_pct: float | int) -> str:
|
|
50
51
|
"""For coloring numbers between 0 and 100 (AKA pcts). Closer to 100 means greener, closer to 0.0 means bluer."""
|
|
51
52
|
if meter_pct > 100 or meter_pct < 0:
|
|
52
53
|
log.warning(f"Invalid meter_pct: {meter_pct}")
|
|
@@ -86,11 +87,6 @@ def reverse_color(style: Style) -> Style:
|
|
|
86
87
|
return Style(color=style.bgcolor, bgcolor=style.color, underline=style.underline, bold=style.bold)
|
|
87
88
|
|
|
88
89
|
|
|
89
|
-
def yaralyzer_show_color_theme() -> None:
|
|
90
|
-
"""Script method to show yaralyzer's color theme. Invocable with 'yaralyzer_show_colors'."""
|
|
91
|
-
show_color_theme(YARALYZER_THEME_DICT)
|
|
92
|
-
|
|
93
|
-
|
|
94
90
|
def show_color_theme(styles: dict) -> None:
|
|
95
91
|
"""Print all colors in 'styles' to screen in a grid"""
|
|
96
92
|
console.print(Panel('The Yaralyzer Color Theme', style='reverse'))
|
|
@@ -119,3 +115,8 @@ def size_in_bytes_text(num_bytes: int) -> Text:
|
|
|
119
115
|
def newline_join(texts: List[Text]) -> Text:
|
|
120
116
|
"""Join a list of Text objects with newlines between them."""
|
|
121
117
|
return Text("\n").join(texts)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def yaralyzer_show_color_theme() -> None:
|
|
121
|
+
"""Script method to show yaralyzer's color theme. Invocable with 'yaralyzer_show_colors'."""
|
|
122
|
+
show_color_theme(YARALYZER_THEME_DICT)
|
|
@@ -1,14 +1,20 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Methods to build the rich.table used to display decoding attempts of a given bytes array.
|
|
2
|
+
Methods to build the `rich.table` used to display decoding attempts of a given bytes array.
|
|
3
3
|
|
|
4
|
-
Final output should be rich.table of decoding attempts that are sorted like this:
|
|
4
|
+
Final output should be a `rich.table` of decoding attempts that are sorted like this:
|
|
5
5
|
|
|
6
6
|
1. String representation of undecoded bytes is always the first row
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
2. Encodings which `chardet.detect()` ranked as > 0% likelihood are sorted based on that confidence
|
|
9
|
+
|
|
8
10
|
3. Then the unchardetectable:
|
|
11
|
+
|
|
9
12
|
1. Decodings that were successful, unforced, and new
|
|
10
|
-
|
|
13
|
+
|
|
14
|
+
2. Decodings that were "successful" but forced
|
|
15
|
+
|
|
11
16
|
3. Decodings that were the same as other decodings
|
|
17
|
+
|
|
12
18
|
4. Failed decodings
|
|
13
19
|
"""
|
|
14
20
|
from collections import namedtuple
|
|
@@ -45,7 +51,7 @@ RAW_BYTES = Text('Raw', style=f"bytes")
|
|
|
45
51
|
|
|
46
52
|
|
|
47
53
|
def new_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
|
|
48
|
-
"""Build a new rich Table with two rows, the raw and hex views of the bytes_match data."""
|
|
54
|
+
"""Build a new rich `Table` with two rows, the raw and hex views of the `bytes_match` data."""
|
|
49
55
|
table = Table(show_lines=True, border_style='bytes', header_style='decode.table_header')
|
|
50
56
|
|
|
51
57
|
def add_col(title, **kwargs):
|
|
@@ -65,7 +71,18 @@ def new_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
|
|
|
65
71
|
|
|
66
72
|
|
|
67
73
|
def decoding_table_row(assessment: EncodingAssessment, is_forced: Text, txt: Text, score: float) -> DecodingTableRow:
|
|
68
|
-
"""
|
|
74
|
+
"""
|
|
75
|
+
Build a table row for a decoding attempt.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
assessment (EncodingAssessment): The `chardet` assessment for the encoding used.
|
|
79
|
+
is_forced (Text): Text indicating if the decode was forced.
|
|
80
|
+
txt (Text): The decoded string as a rich `Text` object (with highlighting).
|
|
81
|
+
score (float): The score to use for sorting this row in the table.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
DecodingTableRow: The constructed table row named tuple.
|
|
85
|
+
"""
|
|
69
86
|
return DecodingTableRow(
|
|
70
87
|
assessment.encoding_label,
|
|
71
88
|
assessment.confidence_text,
|
|
@@ -78,13 +95,30 @@ def decoding_table_row(assessment: EncodingAssessment, is_forced: Text, txt: Tex
|
|
|
78
95
|
)
|
|
79
96
|
|
|
80
97
|
|
|
81
|
-
def assessment_only_row(assessment: EncodingAssessment, score) -> DecodingTableRow:
|
|
82
|
-
"""
|
|
98
|
+
def assessment_only_row(assessment: EncodingAssessment, score: float) -> DecodingTableRow:
|
|
99
|
+
"""
|
|
100
|
+
Build a `DecodingTableRow` with just `chardet` assessment confidence data and no actual decoding attempt string.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
assessment (EncodingAssessment): The `chardet` assessment for the encoding used.
|
|
104
|
+
score (float): The score to use for sorting this row within the table.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
DecodingTableRow: The constructed table row named tuple with no decoding attempt string.
|
|
108
|
+
"""
|
|
83
109
|
return decoding_table_row(assessment, na_txt(), DECODE_NOT_ATTEMPTED_MSG, score)
|
|
84
110
|
|
|
85
111
|
|
|
86
112
|
def _hex_preview_subtable(bytes_match: BytesMatch) -> Table:
|
|
87
|
-
"""
|
|
113
|
+
"""
|
|
114
|
+
Build a sub `Table` for hex view row (hex on one side, ascii on the other side).
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
bytes_match (BytesMatch): The `BytesMatch` object containing the bytes to display.
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
Table: A `rich.table` with hex and ascii views of the bytes.
|
|
121
|
+
"""
|
|
88
122
|
hex_table = Table(
|
|
89
123
|
'hex',
|
|
90
124
|
'ascii',
|
yaralyzer/output/file_export.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
Functions to export Yaralyzer results to various file formats.
|
|
3
|
+
"""
|
|
2
4
|
import json
|
|
3
5
|
import time
|
|
4
6
|
from os import path
|
|
5
|
-
from typing import Optional
|
|
7
|
+
from typing import Callable, Optional
|
|
6
8
|
|
|
7
9
|
from rich.terminal_theme import TerminalTheme
|
|
8
10
|
|
|
@@ -52,7 +54,16 @@ _EXPORT_KWARGS = {
|
|
|
52
54
|
|
|
53
55
|
|
|
54
56
|
def export_json(yaralyzer: Yaralyzer, output_basepath: Optional[str]) -> str:
|
|
55
|
-
"""
|
|
57
|
+
"""
|
|
58
|
+
Export YARA scan results to JSON.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
yaralyzer (Yaralyzer): The `Yaralyzer` object containing the results to export.
|
|
62
|
+
output_basepath (Optional[str]): Base path to write output to. Should have no file extension.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
str: Path data was exported to.
|
|
66
|
+
"""
|
|
56
67
|
output_path = f"{output_basepath or 'yara_matches'}.json"
|
|
57
68
|
|
|
58
69
|
matches_data = [
|
|
@@ -67,11 +78,16 @@ def export_json(yaralyzer: Yaralyzer, output_basepath: Optional[str]) -> str:
|
|
|
67
78
|
return output_path
|
|
68
79
|
|
|
69
80
|
|
|
70
|
-
def invoke_rich_export(export_method, output_file_basepath) -> str:
|
|
81
|
+
def invoke_rich_export(export_method: Callable, output_file_basepath: str) -> str:
|
|
71
82
|
"""
|
|
72
|
-
Announce the export, perform the export, announce completion.
|
|
73
|
-
|
|
74
|
-
|
|
83
|
+
Announce the export, perform the export, and announce completion.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
export_method (Callable): Usually a `Rich.console.save_whatever()` method
|
|
87
|
+
output_file_basepath (str): Path to write output to. Should have no file extension.
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
str: Path data was exported to.
|
|
75
91
|
"""
|
|
76
92
|
method_name = export_method.__name__
|
|
77
93
|
extname = 'txt' if method_name == 'save_text' else method_name.split('_')[-1]
|
|
@@ -19,15 +19,16 @@ def bytes_hashes_table(
|
|
|
19
19
|
title_justify: str = LEFT
|
|
20
20
|
) -> Table:
|
|
21
21
|
"""
|
|
22
|
-
Build a Rich Table displaying the size, MD5, SHA1, and SHA256 hashes of a byte sequence.
|
|
22
|
+
Build a Rich `Table` displaying the size, MD5, SHA1, and SHA256 hashes of a byte sequence.
|
|
23
23
|
|
|
24
24
|
Args:
|
|
25
|
-
bytes_or_bytes_info (Union[bytes, BytesInfo]): The bytes to hash, or a BytesInfo
|
|
26
|
-
|
|
27
|
-
|
|
25
|
+
bytes_or_bytes_info (Union[bytes, BytesInfo]): The `bytes` to hash, or a `BytesInfo`
|
|
26
|
+
namedtuple with precomputed values.
|
|
27
|
+
title (Optional[str], optional): Optional title for the table. Defaults to `None`.
|
|
28
|
+
title_justify (str, optional): Justification for the table title. Defaults to `"LEFT"`.
|
|
28
29
|
|
|
29
30
|
Returns:
|
|
30
|
-
Table: A Rich Table object with the size and hash values.
|
|
31
|
+
Table: A Rich `Table` object with the size and hash values.
|
|
31
32
|
"""
|
|
32
33
|
if isinstance(bytes_or_bytes_info, bytes):
|
|
33
34
|
bytes_info = compute_file_hashes(bytes_or_bytes_info)
|
|
@@ -54,10 +55,10 @@ def compute_file_hashes(_bytes: bytes) -> BytesInfo:
|
|
|
54
55
|
Compute the size, MD5, SHA1, and SHA256 hashes for a given byte sequence.
|
|
55
56
|
|
|
56
57
|
Args:
|
|
57
|
-
_bytes (bytes): The bytes to hash.
|
|
58
|
+
_bytes (bytes): The `bytes` to hash.
|
|
58
59
|
|
|
59
60
|
Returns:
|
|
60
|
-
BytesInfo:
|
|
61
|
+
BytesInfo: `BytesInfo` namedtuple containing size, md5, sha1, and sha256 values.
|
|
61
62
|
"""
|
|
62
63
|
return BytesInfo(
|
|
63
64
|
size=len(_bytes),
|
|
@@ -75,7 +76,7 @@ def compute_file_hashes_for_file(file_path) -> BytesInfo:
|
|
|
75
76
|
file_path (str): Path to the file to hash.
|
|
76
77
|
|
|
77
78
|
Returns:
|
|
78
|
-
BytesInfo:
|
|
79
|
+
BytesInfo: `BytesInfo` namedtuple containing size, md5, sha1, and sha256 values for the file contents.
|
|
79
80
|
"""
|
|
80
81
|
with open(file_path, 'rb') as file:
|
|
81
82
|
return compute_file_hashes(file.read())
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
`RegexMatchMetrics` class.
|
|
3
|
+
"""
|
|
2
4
|
from collections import defaultdict
|
|
3
5
|
|
|
4
6
|
from yaralyzer.decoding.bytes_decoder import BytesDecoder
|
|
@@ -7,13 +9,25 @@ from yaralyzer.util.logging import log
|
|
|
7
9
|
|
|
8
10
|
class RegexMatchMetrics:
|
|
9
11
|
"""
|
|
10
|
-
Class to
|
|
11
|
-
|
|
12
|
-
(e.g. "bytes between quotes") against a relatively large pool of close to random encrypted binary data.
|
|
12
|
+
Class to measure what we enounter as we iterate over all matches of a relatively simple byte level regex.
|
|
13
13
|
|
|
14
14
|
Things like how much many of our matched bytes were we able to decode easily vs. by force vs. not at all,
|
|
15
|
-
were some encodings have a higher pct of success than others (indicating part of our mystery data might be
|
|
16
|
-
that way?
|
|
15
|
+
were some encodings have a higher pct of success than others (indicating part of our mystery data might be
|
|
16
|
+
encoded that way?
|
|
17
|
+
|
|
18
|
+
Example:
|
|
19
|
+
"Find bytes between quotes" against a relatively large pool of close to random encrypted binary data.
|
|
20
|
+
|
|
21
|
+
Attributes:
|
|
22
|
+
match_count (int): Total number of matches found.
|
|
23
|
+
bytes_matched (int): Total number of bytes matched across all matches.
|
|
24
|
+
matches_decoded (int): Number of matches where we were able to decode at least some of the matched bytes.
|
|
25
|
+
easy_decode_count (int): Number of matches where we were able to decode the matched bytes without forcing.
|
|
26
|
+
forced_decode_count (int): Number of matches where we were only able to decode the matched bytes by forcing.
|
|
27
|
+
undecodable_count (int): Number of matches where we were unable to decode any of the matched bytes.
|
|
28
|
+
skipped_matches_lengths (defaultdict): Dictionary mapping lengths of skipped matches to their counts.
|
|
29
|
+
bytes_match_objs (list): List of `BytesMatch` objects for all matches encountered.
|
|
30
|
+
per_encoding_stats (defaultdict): Dictionary mapping encoding names to their respective `RegexMatchMetrics`.
|
|
17
31
|
|
|
18
32
|
TODO: use @dataclass decorator https://realpython.com/python-data-classes/
|
|
19
33
|
"""
|
|
@@ -30,12 +44,20 @@ class RegexMatchMetrics:
|
|
|
30
44
|
self.per_encoding_stats = defaultdict(lambda: RegexMatchMetrics())
|
|
31
45
|
|
|
32
46
|
def num_matches_skipped_for_being_empty(self) -> int:
|
|
47
|
+
"""Number of matches skipped for being empty (0 length)."""
|
|
33
48
|
return self.skipped_matches_lengths[0]
|
|
34
49
|
|
|
35
50
|
def num_matches_skipped_for_being_too_big(self) -> int:
|
|
51
|
+
"""Number of matches skipped for being too big to decode."""
|
|
36
52
|
return sum({k: v for k, v in self.skipped_matches_lengths.items() if k > 0}.values())
|
|
37
53
|
|
|
38
54
|
def tally_match(self, decoder: BytesDecoder) -> None:
|
|
55
|
+
"""
|
|
56
|
+
Tally statistics from a `BytesDecoder` after it has processed a match.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
decoder (BytesDecoder): The `BytesDecoder` that processed a match.
|
|
60
|
+
"""
|
|
39
61
|
log.debug(f"Tallying {decoder.bytes_match} ({len(decoder.decodings)} decodings)")
|
|
40
62
|
self.match_count += 1
|
|
41
63
|
self.bytes_matched += decoder.bytes_match.match_length
|
yaralyzer/output/rich_console.py
CHANGED
|
@@ -86,11 +86,6 @@ def console_width_possibilities():
|
|
|
86
86
|
return [get_terminal_size().columns - 2, DEFAULT_CONSOLE_WIDTH]
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
def console_width() -> int:
|
|
90
|
-
"""Current width set in console obj."""
|
|
91
|
-
return console._width or 40
|
|
92
|
-
|
|
93
|
-
|
|
94
89
|
# Maximize output width if YARALYZER_MAXIMIZE_WIDTH is set (also can changed with --maximize-width option)
|
|
95
90
|
if is_invoked_by_pytest():
|
|
96
91
|
CONSOLE_WIDTH = DEFAULT_CONSOLE_WIDTH
|
|
@@ -104,8 +99,8 @@ CONSOLE_PRINT_BYTE_WIDTH = int(CONSOLE_WIDTH / 4.0)
|
|
|
104
99
|
console = Console(theme=YARALYZER_THEME, color_system='256', highlight=False, width=CONSOLE_WIDTH)
|
|
105
100
|
|
|
106
101
|
|
|
107
|
-
def console_print_with_fallback(_string, style=None) -> None:
|
|
108
|
-
"""
|
|
102
|
+
def console_print_with_fallback(_string: Text | str, style=None) -> None:
|
|
103
|
+
"""`rich.console.print()` with fallback to regular `print()` if there's a Rich Markup issue."""
|
|
109
104
|
try:
|
|
110
105
|
console.print(_string, style=style)
|
|
111
106
|
except MarkupError:
|
|
@@ -113,13 +108,18 @@ def console_print_with_fallback(_string, style=None) -> None:
|
|
|
113
108
|
print(_string.plain if isinstance(_string, Text) else _string)
|
|
114
109
|
|
|
115
110
|
|
|
116
|
-
def
|
|
117
|
-
"""
|
|
118
|
-
return
|
|
111
|
+
def console_width() -> int:
|
|
112
|
+
"""Current width set in `console` object."""
|
|
113
|
+
return console._width or 40
|
|
119
114
|
|
|
120
115
|
|
|
121
116
|
def print_fatal_error_and_exit(error_message: str) -> None:
|
|
122
|
-
"""
|
|
117
|
+
"""
|
|
118
|
+
Print a fatal error message in a `Panel` and exit.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
error_message (str): The error message to display.
|
|
122
|
+
"""
|
|
123
123
|
console.line(1)
|
|
124
124
|
print_header_panel(error_message, style='bold red reverse')
|
|
125
125
|
console.line(1)
|
|
@@ -128,15 +128,17 @@ def print_fatal_error_and_exit(error_message: str) -> None:
|
|
|
128
128
|
|
|
129
129
|
def print_header_panel(headline: str, style: str, expand: bool = True, padding: tuple = (0, 2)) -> None:
|
|
130
130
|
"""
|
|
131
|
-
Print a headline inside a styled Rich Panel to the console.
|
|
131
|
+
Print a headline inside a styled Rich `Panel` to the console.
|
|
132
132
|
|
|
133
133
|
Args:
|
|
134
134
|
headline (str): The text to display as the panel's headline.
|
|
135
135
|
style (str): The style to apply to the panel (e.g., color, bold, reverse).
|
|
136
|
-
expand (bool, optional): Whether the panel should expand to the full console width. Defaults to True
|
|
137
|
-
padding (tuple, optional): Padding around the panel content (top/bottom, left/right). Defaults to (0, 2)
|
|
138
|
-
|
|
139
|
-
Returns:
|
|
140
|
-
None
|
|
136
|
+
expand (bool, optional): Whether the panel should expand to the full console width. Defaults to `True`.
|
|
137
|
+
padding (tuple, optional): Padding around the panel content (top/bottom, left/right). Defaults to `(0, 2)`.
|
|
141
138
|
"""
|
|
142
139
|
console.print(Panel(headline, box=box.DOUBLE_EDGE, style=style, expand=expand, padding=padding))
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def theme_colors_with_prefix(prefix: str) -> List[Text]:
|
|
143
|
+
"""Return a list of (name, style) `Text` objects for all styles in the theme that start with `prefix`."""
|
|
144
|
+
return [Text(k, v) for k, v in YARALYZER_THEME.styles.items() if k.startswith(prefix)]
|