yaralyzer 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of yaralyzer might be problematic. Click here for more details.

.yaralyzer.example CHANGED
@@ -17,7 +17,7 @@
17
17
  # YARALYZER_STACK_SIZE=10485760
18
18
  # YARALYZER_MAX_MATCH_LENGTH=10737418240
19
19
 
20
- # Suppress all PDF binary regex matching/scanning/etc
20
+ # Suppress all attempts to decode bytes into various text encodings
21
21
  # YARALYZER_SUPPRESS_DECODES_TABLE=False
22
22
 
23
23
  # Suppress the display of the table showing the encoding assessments given by `chardet.detect()`
@@ -30,6 +30,8 @@
30
30
  # Configure how many bytes before and after any binary data should be included in scans and visualizations
31
31
  # YARALYZER_SURROUNDING_BYTES=64
32
32
 
33
+
34
+
33
35
  # Size thresholds (in bytes) under/over which yaralyzer will NOT make attempts to decode a match.
34
36
  # Longer byte sequences are for obvious reasons slower to decode by force.
35
37
  # It may feel counterintuitive but larger chunks of random binary are also harder to examine and
@@ -45,6 +47,8 @@
45
47
  # Minimum bytes to run chardet.detect() on a sequence of bytes
46
48
  # YARALYZER_MIN_BYTES_TO_DETECT_ENCODING
47
49
 
50
+
51
+
48
52
  # Directory to write application logs to. Must be an absolute path, not a relative one.
49
53
  # These logs are not normally written to a file and the default log level means that the standard behavior
50
54
  # is to more or less discard them. Be aware that if you configure this variable a few things will change:
CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.0.1
4
+ * Fix iteration of byte offsets during attempted decodes for UTF-16 and UTF-32 (was starting at second byte instead of first)
5
+ * Label the byte offset for forced UTF-16 and UTF-32 decodes
6
+ * Show helpful message if logs are being sent to files in `YaralyzerConfig.LOG_DIR` instead of being written to stderr/stdout
7
+ * Warn if `--debug` and `--log-level` args both provided
8
+
3
9
  # 1.0.0
4
10
  * Add `--export-json` option
5
11
 
yaralyzer/config.py CHANGED
@@ -3,6 +3,8 @@ from argparse import ArgumentParser, Namespace
3
3
  from os import environ
4
4
  from typing import Any, List
5
5
 
6
+ from rich.console import Console
7
+
6
8
  YARALYZE = 'yaralyze'
7
9
  YARALYZER = f"{YARALYZE}r".upper()
8
10
  PYTEST_FLAG = 'INVOKED_BY_PYTEST'
@@ -56,6 +58,9 @@ class YaralyzerConfig:
56
58
  LOG_LEVEL_ENV_VAR = f"{YARALYZER}_LOG_LEVEL"
57
59
  LOG_LEVEL = logging.getLevelName(environ.get(LOG_LEVEL_ENV_VAR, 'WARN'))
58
60
 
61
+ if LOG_DIR and not is_invoked_by_pytest():
62
+ Console(color_system='256').print(f"Writing logs to '{LOG_DIR}' instead of stderr/stdout...", style='dim')
63
+
59
64
  HIGHLIGHT_STYLE = 'orange1'
60
65
 
61
66
  ONLY_CLI_ARGS = [
@@ -5,6 +5,7 @@ in the results.
5
5
  """
6
6
 
7
7
  from collections import defaultdict
8
+ from copy import deepcopy
8
9
  from operator import attrgetter
9
10
  from typing import List, Optional
10
11
 
@@ -13,18 +14,18 @@ from rich.console import Console, ConsoleOptions, NewLine, RenderResult
13
14
  from rich.panel import Panel
14
15
  from rich.table import Table
15
16
  from rich.text import Text
16
- from yaralyzer import bytes_match
17
17
 
18
18
  #from yaralyzer.bytes_match import BytesMatch
19
19
  from yaralyzer.config import YaralyzerConfig
20
20
  from yaralyzer.decoding.decoding_attempt import DecodingAttempt
21
- from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT
21
+ from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT, encoding_offsets
22
22
  from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
23
23
  from yaralyzer.encoding_detection.encoding_detector import EncodingDetector
24
24
  from yaralyzer.helpers.dict_helper import get_dict_key_by_value
25
+ from yaralyzer.helpers.list_helper import flatten
25
26
  from yaralyzer.helpers.rich_text_helper import CENTER, DECODING_ERRORS_MSG, NO_DECODING_ERRORS_MSG
26
27
  from yaralyzer.output.decoding_attempts_table import (DecodingTableRow, assessment_only_row,
27
- build_decoding_attempts_table, decoding_table_row)
28
+ decoding_table_row, new_decoding_attempts_table)
28
29
  from yaralyzer.util.logging import log
29
30
 
30
31
  # A 2-tuple that can be indexed by booleans of messages used in the table to show true vs. false
@@ -63,51 +64,48 @@ class BytesDecoder:
63
64
 
64
65
  # In standalone mode we always print the hex/raw bytes
65
66
  if self.bytes_match.is_decodable():
66
- yield self._generate_decodings_table()
67
+ yield self._build_decodings_table()
67
68
  elif YaralyzerConfig.args.standalone_mode:
68
- # TODO: yield self.bytes_match.suppression_notice()
69
- yield self._generate_decodings_table(True)
69
+ # TODO: yield self.bytes_match.suppression_notice() (i guess to show some notice that things are suppressed?)
70
+ yield self._build_decodings_table(True)
70
71
 
71
72
  yield NewLine()
72
73
  yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')
73
74
 
74
- def _generate_decodings_table(self, suppress_decodes: bool = False) -> Table:
75
+ def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
75
76
  """First rows are the raw / hex views of the bytes, next rows are the attempted decodings"""
76
- self.table = build_decoding_attempts_table(self.bytes_match)
77
+ self.table = new_decoding_attempts_table(self.bytes_match)
77
78
 
78
- if YaralyzerConfig.args.suppress_decoding_attempts or suppress_decodes:
79
- return self.table
79
+ # Add the encoding rows to the table if not suppressed
80
+ if not (YaralyzerConfig.args.suppress_decoding_attempts or suppress_decodes):
81
+ self.decodings = [DecodingAttempt(self.bytes_match, encoding) for encoding in ENCODINGS_TO_ATTEMPT]
82
+ # Attempt decodings we don't usually attempt if chardet is insistent enough
83
+ forced_decodes = self._undecoded_assessments(self.encoding_detector.force_decode_assessments)
84
+ self.decodings += [DecodingAttempt(self.bytes_match, a.encoding) for a in forced_decodes]
80
85
 
81
- self.decodings = [
82
- DecodingAttempt(self.bytes_match, encoding)
83
- for encoding in ENCODINGS_TO_ATTEMPT.keys()
84
- ]
86
+ # If we still haven't decoded chardet's top choice, decode it
87
+ if len(self._forced_displays()) > 0 and not self._was_decoded(self._forced_displays()[0].encoding):
88
+ chardet_top_encoding = self._forced_displays()[0].encoding
89
+ log.info(f"Decoding {chardet_top_encoding} because it's chardet top choice...")
90
+ self.decodings.append(DecodingAttempt(self.bytes_match, chardet_top_encoding))
85
91
 
86
- # Attempt decodings we don't usually attempt if chardet is insistent enough
87
- forced_decodes = self._undecoded_assessments(self.encoding_detector.force_decode_assessments)
88
- self.decodings += [DecodingAttempt(self.bytes_match, a.encoding) for a in forced_decodes]
92
+ # Build the table rows from the decoding attempts
93
+ rows = [self._row_from_decoding_attempt(decoding) for decoding in self.decodings]
94
+ rows += [assessment_only_row(a, a.confidence * SCORE_SCALER) for a in self._forced_displays()]
95
+ self._track_decode_stats()
89
96
 
90
- # If we still haven't decoded chardets top choice, decode it
91
- if len(self._forced_displays()) > 0 and not self._was_decoded(self._forced_displays()[0].encoding):
92
- chardet_top_encoding = self._forced_displays()[0].encoding
93
- log.info(f"Decoding {chardet_top_encoding} because it's chardet top choice...")
94
- self.decodings.append(DecodingAttempt(self.bytes_match, chardet_top_encoding))
95
-
96
- rows = [self._row_from_decoding_attempt(decoding) for decoding in self.decodings]
97
- rows += [assessment_only_row(a, a.confidence * SCORE_SCALER) for a in self._forced_displays()]
98
- self._track_decode_stats()
99
-
100
- for row in sorted(rows, key=attrgetter('sort_score'), reverse=True):
101
- self.table.add_row(*row[0:4])
97
+ for row in sorted(rows, key=attrgetter('sort_score', 'encoding_label_plain'), reverse=True):
98
+ self.table.add_row(*row[0:4])
102
99
 
103
100
  return self.table
104
101
 
102
+ # TODO: rename this to something that makes more sense, maybe assessments_over_display_threshold()?
105
103
  def _forced_displays(self) -> List[EncodingAssessment]:
106
- """Returns assessments over the display threshold that are not yet decoded"""
104
+ """Returns assessments over the display threshold that are not yet decoded."""
107
105
  return self._undecoded_assessments(self.encoding_detector.force_display_assessments)
108
106
 
109
107
  def _undecoded_assessments(self, assessments: List[EncodingAssessment]) -> List[EncodingAssessment]:
110
- """Fiter out the already decoded assessments from a set of assessments"""
108
+ """Filter out the already decoded assessments from a set of assessments"""
111
109
  return [a for a in assessments if not self._was_decoded(a.encoding)]
112
110
 
113
111
  def _was_decoded(self, encoding: str) -> bool:
@@ -115,7 +113,7 @@ class BytesDecoder:
115
113
  return any(row.encoding == encoding for row in self.decodings)
116
114
 
117
115
  def _decode_attempt_subheading(self) -> Panel:
118
- """Generate a rich.Panel for decode attempts"""
116
+ """Generate a rich.Panel for displaying decode attempts"""
119
117
  headline = Text(f"Found ", style='decode.subheading') + self.bytes_match.__rich__()
120
118
  return Panel(headline, style='decode.subheading', expand=False)
121
119
 
@@ -132,26 +130,33 @@ class BytesDecoder:
132
130
  self.was_match_force_decoded[decoding.encoding] += 1
133
131
 
134
132
  def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
135
- """
136
- Create a DecodingAttemptTable row from a DecodingAttempt.
137
- If the decoding result is a duplicate of a previous decoding, replace the decoded text
138
- with "same output as X" where X is the previous encoding that gave the same result.
139
- """
133
+ """Create a DecodingAttemptTable row from a DecodingAttempt."""
140
134
  assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)
135
+
136
+ # If the decoding can have a start offset add an appropriate extension to the encoding label
137
+ if decoding.start_offset_label:
138
+ if assessment.language:
139
+ log.warning(f"{decoding.encoding} has offset {decoding.start_offset} and language '{assessment.language}'")
140
+ else:
141
+ assessment = deepcopy(assessment)
142
+ assessment.set_encoding_label(decoding.start_offset_label)
143
+
141
144
  plain_decoded_string = decoding.decoded_string.plain
142
145
  sort_score = assessment.confidence * SCORE_SCALER
143
146
 
147
+ # If the decoding result is a duplicate of a previous decoding, replace the decoded text
148
+ # with "same output as X" where X is the previous encoding that gave the same result.
144
149
  if plain_decoded_string in self.decoded_strings.values():
145
150
  encoding_with_same_output = get_dict_key_by_value(self.decoded_strings, plain_decoded_string)
146
151
  display_text = Text('same output as ', style='color(66) dim italic')
147
152
  display_text.append(encoding_with_same_output, style=ENCODING).append('...', style='white')
148
153
  else:
149
- self.decoded_strings[decoding.encoding] = plain_decoded_string
154
+ self.decoded_strings[decoding.encoding_label] = plain_decoded_string
150
155
  display_text = decoding.decoded_string
151
156
 
152
157
  # Set failures negative, shave off a little for forced decodes
153
158
  if decoding.failed_to_decode:
154
- sort_score = sort_score * -1 - 100
159
+ sort_score = (sort_score * -1) - 100
155
160
  elif decoding.was_force_decoded:
156
161
  sort_score -= 10
157
162
 
@@ -1,14 +1,15 @@
1
+ """
2
+ Class to manage attempting to decode a chunk of bytes into strings with a given encoding.
3
+ """
1
4
  from sys import byteorder
2
5
  from typing import Optional
3
6
 
4
7
  from rich.markup import escape
5
- from rich.panel import Panel
6
8
  from rich.text import Text
7
9
 
8
- #from yaralyzer.bytes_match import ALERT_STYLE, BytesMatch
9
10
  from yaralyzer.encoding_detection.character_encodings import (ENCODINGS_TO_ATTEMPT, SINGLE_BYTE_ENCODINGS,
10
- UTF_8, UTF_16, UTF_32)
11
- from yaralyzer.helpers.bytes_helper import clean_byte_string
11
+ UTF_8, encoding_width, is_wide_utf)
12
+ from yaralyzer.helpers.bytes_helper import clean_byte_string, truncate_for_encoding
12
13
  from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj, unprintable_byte_to_text
13
14
  from yaralyzer.output.rich_console import ALERT_STYLE, BYTES_BRIGHTER, BYTES_BRIGHTEST, BYTES_NO_DIM, GREY_ADDRESS
14
15
  from yaralyzer.util.logging import log
@@ -16,17 +17,27 @@ from yaralyzer.util.logging import log
16
17
 
17
18
  class DecodingAttempt:
18
19
  def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
20
+ # Args
21
+ self.bytes = bytes_match.surrounding_bytes
19
22
  self.bytes_match = bytes_match
20
23
  self.encoding = encoding
21
- self.bytes = bytes_match.surrounding_bytes
24
+ # Inferred / derived values
25
+ self.encoding_label = encoding
26
+ self.start_offset = 0 # Offset in bytes to start decoding from
27
+ self.start_offset_label = None # String to indicate what offset we were able to decode
22
28
  self.was_force_decoded = False
23
29
  self.failed_to_decode = False
24
30
  self.decoded_string = self._decode_bytes()
25
31
 
32
+ def is_wide_utf_encoding(self) -> bool:
33
+ """Returns True if the encoding is UTF-16 or UTF-32"""
34
+ return is_wide_utf(self.encoding)
35
+
26
36
  def _decode_bytes(self) -> Text:
27
37
  """
28
- Sets self.decoded_string. Tries builtin decode, hands off to other methods for harsher treatement
29
- (Byte shifting for UTF-16/32 and custom decode for the rest) if that fails
38
+ Tries builtin decode, hands off to other methods for harsher treatment
39
+ (byte shifting for UTF-16/32 and custom decode for the rest) if that fails.
40
+ Has side effect of setting 'self.decoded_string' value.
30
41
  """
31
42
  try:
32
43
  decoded_string = self._to_rich_text(escape(self.bytes.decode(self.encoding)))
@@ -36,12 +47,12 @@ class DecodingAttempt:
36
47
  log.info(f"{self.encoding} failed on 1st pass decoding {self.bytes_match} capture; custom decoding...")
37
48
  except LookupError as e:
38
49
  log.warning(f"Unknown encoding: {self.encoding}. {e}")
39
- return self._failed_to_decode(e)
50
+ return self._failed_to_decode_msg_txt(e)
40
51
 
41
52
  self.was_force_decoded = True
42
53
 
43
- if self.encoding in [UTF_16, UTF_32]:
44
- return self._decode_utf_multibyte_with_byte_offset()
54
+ if self.is_wide_utf_encoding():
55
+ return self._decode_utf_multibyte()
45
56
  else:
46
57
  return self._custom_decode()
47
58
 
@@ -103,34 +114,35 @@ class DecodingAttempt:
103
114
 
104
115
  return output
105
116
 
106
- def _decode_utf_multibyte_with_byte_offset(self) -> Text:
107
- """ UTF-16/32 are fixed width (and wide)"""
108
- char_width = 2 if self.encoding == UTF_16 else 4
109
- log.debug(f"Decoding {self.encoding}, char_width is {char_width}...")
117
+ def _decode_utf_multibyte(self) -> Text:
118
+ """UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte."""
119
+ char_width = encoding_width(self.encoding)
110
120
  last_exception = None
111
121
  decoded_str = None
112
- bytes_offset = 1
122
+ bytes_offset = 0
113
123
 
124
+ # Iterate through the possible byte offsets until we find a valid decoded string (or don't)
114
125
  while bytes_offset < char_width:
115
126
  try:
116
- decoded_str = self.bytes[bytes_offset:].decode(self.encoding)
127
+ decoded_str = truncate_for_encoding(self.bytes[bytes_offset:], self.encoding).decode(self.encoding)
117
128
  except UnicodeDecodeError as e:
118
129
  log.info(f"Exception decoding w/offset {bytes_offset} in {self.encoding}: {e}")
119
130
  last_exception = e
120
131
 
132
+ # Append the current bytes_offset to the encoding label if we found a valid decoded string
121
133
  if decoded_str is not None:
134
+ log.debug(f"Successfully decoded '{self.encoding}' w/offset {bytes_offset}")
135
+ self.start_offset = bytes_offset
136
+ self.start_offset_label = f"offset {self.start_offset} byte" + ('s' if self.start_offset > 1 else '')
137
+ self.encoding_label = f"{self.encoding} ({self.start_offset_label})"
122
138
  break
123
139
 
124
140
  bytes_offset += 1
125
141
 
126
- if decoded_str is None:
127
- return self._failed_to_decode(last_exception)
128
-
129
- return self._to_rich_text(decoded_str, bytes_offset)
130
-
131
- def _failed_to_decode(self, exception: Optional[Exception]) -> Text:
132
- self.failed_to_decode = True
133
- return prefix_with_plain_text_obj(f"(decode failed: {exception})", style='red dim italic')
142
+ if decoded_str is not None:
143
+ return self._to_rich_text(decoded_str, bytes_offset)
144
+ else:
145
+ return self._failed_to_decode_msg_txt(last_exception)
134
146
 
135
147
  def _to_rich_text(self, _string: str, bytes_offset: int=0) -> Text:
136
148
  """Convert a decoded string to highlighted Text representation"""
@@ -165,3 +177,8 @@ class DecodingAttempt:
165
177
  current_byte_idx += char_width
166
178
 
167
179
  return txt
180
+
181
+ def _failed_to_decode_msg_txt(self, exception: Optional[Exception]) -> Text:
182
+ """Set failed_to_decode flag and return a Text object with the error message."""
183
+ self.failed_to_decode = True
184
+ return prefix_with_plain_text_obj(f"(decode failed: {exception})", style='red dim italic')
@@ -136,22 +136,49 @@ UNPRINTABLE_ISO_8859_7.update({
136
136
  })
137
137
 
138
138
 
139
- # The encodings we will attempt to actually use
140
- # Values are the unprintable values in that encoding in a dict (keys in dict are ints)
139
+ # Keys are names of encodings we will attempt to decode with, values are dicts mapping the unprintable bytes
140
+ # in that encoding to appropriate string representations of those unprintable bytes.
141
+ # Order matters here, as we will attempt the decoding in the order of the keys.
141
142
  ENCODINGS_TO_ATTEMPT = {
142
143
  ASCII: UNPRINTABLE_ASCII,
143
- UTF_8: UNPRINTABLE_UTF_8,
144
- UTF_16: None,
145
- UTF_32: None, # UTF-16 and 32 are handled differently
146
- #'utf-7':
144
+ UTF_8: UNPRINTABLE_UTF_8,
145
+ UTF_16: None,
146
+ UTF_32: None, # UTF-16 and 32 are handled differently
147
147
  ISO_8859_1: UNPRINTABLE_ISO_8859_1,
148
- WINDOWS_1252: UNPRINTABLE_WIN_1252
148
+ WINDOWS_1252: UNPRINTABLE_WIN_1252,
149
+ #'utf-7':
150
+ }
151
+
152
+ SINGLE_BYTE_ENCODINGS = [
153
+ ASCII,
154
+ ISO_8859_1,
155
+ WINDOWS_1252,
156
+ ]
157
+
158
+ # Keys are encodings that use multiple bytes to represent a single character, values are the possible offsets
159
+ # to attempt to use as the starting point for decoding in a given set of bytes.
160
+ WIDE_UTF_ENCODINGS = {
161
+ UTF_16: [0, 1],
162
+ UTF_32: [0, 1, 2, 3],
149
163
  }
150
164
 
151
165
 
152
- SINGLE_BYTE_ENCODINGS = [ASCII, ISO_8859_1, WINDOWS_1252]
166
+ def encoding_offsets(encoding: str) -> list:
167
+ """Get possible offsets for a given encoding. If the encoding is not in WIDE_UTF_ENCODINGS, return [0]."""
168
+ return WIDE_UTF_ENCODINGS.get(encoding, [0])
169
+
170
+
171
+ def encoding_width(encoding: str) -> int:
172
+ """Get the width of a character in bytes for a given encoding, which is the number of possible offsets."""
173
+ return len(encoding_offsets(encoding))
174
+
175
+
176
+ def is_wide_utf(encoding: str) -> bool:
177
+ """Check if the encoding is a wide UTF encoding (UTF-16 or UTF-32)."""
178
+ return encoding in WIDE_UTF_ENCODINGS
179
+
153
180
 
154
- # Unused cruft (mostly Asian language encodings)
181
+ # TODO: this is unused cruft (mostly Asian language encodings)
155
182
  ENCODINGS = [
156
183
  'big5',
157
184
  'big5hkscs',
@@ -1,7 +1,7 @@
1
1
  """
2
2
  Class to smooth some of the rough edges around the dicts returned by chardet.detect_all()
3
3
  """
4
- from typing import Any
4
+ from typing import Any, Optional
5
5
 
6
6
  from rich.text import Text
7
7
 
@@ -17,30 +17,32 @@ class EncodingAssessment:
17
17
  def __init__(self, assessment: dict) -> None:
18
18
  self.assessment = assessment
19
19
  self.encoding = assessment[ENCODING].lower()
20
- self.encoding_text = Text(self.encoding, 'encoding.header')
21
- self.language = self._get_dict_empty_value_as_None(LANGUAGE)
22
- self.language_text = None if self.language is None else Text(self.language, 'encoding.language')
23
20
 
24
21
  # Shift confidence from 0-1.0 scale to 0-100.0 scale
25
- confidence = self._get_dict_empty_value_as_None(CONFIDENCE) or 0.0
26
- assert isinstance(confidence, float)
27
- self.confidence = 100.0 * confidence
22
+ self.confidence = 100.0 * (self._get_dict_empty_value_as_None(CONFIDENCE) or 0.0)
28
23
  self.confidence_text = prefix_with_plain_text_obj(f"{round(self.confidence, 1)}%", style=meter_style(self.confidence))
29
24
 
30
- # Pair the language name with the encoding name into one Text obj
31
- if self.language is not None:
32
- dim = 'dim' if confidence < DIM_COUNTRY_THRESHOLD else ''
33
- self.encoding_text.append(f" ({self.language.title()})", style=f"color(23) {dim}")
25
+ # Add detected language info and label if any language was detected
26
+ self.language = self._get_dict_empty_value_as_None(LANGUAGE)
27
+ self.set_encoding_label(self.language.title() if self.language else None)
34
28
 
35
29
  @classmethod
36
30
  def dummy_encoding_assessment(cls, encoding) -> 'EncodingAssessment':
37
- """Generate an empty EncodingAssessment to use as a dummy when chardet gives us nothing"""
38
- assessment = cls({ENCODING: encoding, 'confidence': 0.0})
31
+ """Generate an empty EncodingAssessment to use as a dummy when chardet gives us nothing."""
32
+ assessment = cls({ENCODING: encoding, CONFIDENCE: 0.0})
39
33
  assessment.confidence_text = Text('none', 'no_attempt')
40
34
  return assessment
41
35
 
36
+ def set_encoding_label(self, alt_text: Optional[str]) -> None:
37
+ """Alt text is displayed below the encoding in slightly dimmer font."""
38
+ self.encoding_label = Text(self.encoding, 'encoding.header')
39
+
40
+ if alt_text is not None:
41
+ dim = 'dim' if (self.confidence or 0.0) < DIM_COUNTRY_THRESHOLD else ''
42
+ self.encoding_label.append(f" ({alt_text})", style=f"color(23) {dim}")
43
+
42
44
  def __rich__(self) -> Text:
43
- return Text('<Chardet(', 'white') + self.encoding_text + Text(':') + self.confidence_text + Text('>')
45
+ return Text('<Chardet(', 'white') + self.encoding_label + Text(':') + self.confidence_text + Text('>')
44
46
 
45
47
  def __str__(self) -> str:
46
48
  return self.__rich__().plain
@@ -53,7 +53,7 @@ class EncodingDetector:
53
53
  self.force_decode_assessments = self.assessments_above_confidence(type(self).force_decode_threshold)
54
54
  self.force_display_assessments = self.assessments_above_confidence(type(self).force_display_threshold)
55
55
 
56
- def get_encoding_assessment(self, encoding) -> EncodingAssessment:
56
+ def get_encoding_assessment(self, encoding: str) -> EncodingAssessment:
57
57
  """If chardet produced one, return it, otherwise return a dummy node with confidence of 0"""
58
58
  assessment = next((r for r in self.unique_assessments if r.encoding == encoding), None)
59
59
  return assessment or EncodingAssessment.dummy_encoding_assessment(encoding)
@@ -75,7 +75,7 @@ class EncodingDetector:
75
75
  if result.confidence < YaralyzerConfig.args.min_chardet_table_confidence:
76
76
  continue
77
77
 
78
- self.table.add_row(f"{i + 1}", result.encoding_text, result.confidence_text)
78
+ self.table.add_row(f"{i + 1}", result.encoding_label, result.confidence_text)
79
79
 
80
80
  # self.unique_assessments retains one result per encoding possibility (the highest confidence one)
81
81
  # Some encodings are not language specific and for those we don't care about the language
@@ -11,7 +11,7 @@ from rich.text import Text
11
11
 
12
12
  from yaralyzer.bytes_match import BytesMatch
13
13
  from yaralyzer.config import YaralyzerConfig
14
- from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
14
+ from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE, encoding_width
15
15
  from yaralyzer.helpers.rich_text_helper import newline_join
16
16
  from yaralyzer.output.rich_console import (BYTES, BYTES_BRIGHTER, BYTES_BRIGHTEST,
17
17
  BYTES_HIGHLIGHT, GREY, console, console_width)
@@ -126,6 +126,21 @@ def print_bytes(bytes_array: bytes, style=None) -> None:
126
126
  console.print(escape(clean_byte_string(line)), style=style or 'bytes')
127
127
 
128
128
 
129
+ def truncate_for_encoding(_bytes: bytes, encoding: str) -> bytes:
130
+ """
131
+ Truncate bytes to a multiple of the char width of the given encoding.
132
+ For utf-16 this means truncate to a multiple of 2, for utf-32 to a multiple of 4.
133
+ """
134
+ char_width = encoding_width(encoding)
135
+ num_bytes = len(_bytes)
136
+ num_extra_bytes = num_bytes % char_width
137
+
138
+ if char_width <= 1 or num_bytes <= char_width or num_extra_bytes == 0:
139
+ return _bytes
140
+ else:
141
+ return _bytes[:-num_extra_bytes]
142
+
143
+
129
144
  def _find_str_rep_of_bytes(surrounding_bytes_str: str, highlighted_bytes_str: str, highlighted_bytes: BytesMatch):
130
145
  """
131
146
  Find the position of bytes_str in surrounding_byte_str. Both args are raw text dumps of binary data.
@@ -1,5 +1,5 @@
1
1
  """
2
- Help with dicts
2
+ Help with dicts.
3
3
  """
4
4
  from numbers import Number
5
5
 
@@ -0,0 +1,15 @@
1
+ """
2
+ Help with lists.
3
+ """
4
+
5
+ def flatten(a):
6
+ """From https://www.geeksforgeeks.org/python/python-flatten-list-to-individual-elements/"""
7
+ return_value = []
8
+
9
+ for x in a:
10
+ if isinstance(x, list):
11
+ return_value.extend(flatten(x)) # Recursively flatten nested lists
12
+ else:
13
+ return_value.append(x) # Append individual elements
14
+
15
+ return return_value
@@ -10,8 +10,7 @@ from rich.panel import Panel
10
10
  from rich.style import Style
11
11
  from rich.text import Text
12
12
 
13
- from yaralyzer.output.rich_console import (BYTES_BRIGHTEST, BYTES_HIGHLIGHT, YARALYZER_THEME_DICT,
14
- console)
13
+ from yaralyzer.output.rich_console import (BYTES_BRIGHTEST, BYTES_HIGHLIGHT, YARALYZER_THEME_DICT, console)
15
14
  from yaralyzer.util.logging import log
16
15
 
17
16
  # String constants
@@ -11,7 +11,6 @@ Final output should be rich.table of decoding attempts that are sorted like this
11
11
  3. Decodings that were the same as other decodings
12
12
  4. Failed decodings
13
13
  """
14
-
15
14
  from collections import namedtuple
16
15
 
17
16
  from rich import box
@@ -20,21 +19,23 @@ from rich.text import Text
20
19
 
21
20
  from yaralyzer.bytes_match import BytesMatch
22
21
  from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
23
- from yaralyzer.helpers.bytes_helper import (ascii_view_of_raw_bytes, hex_view_of_raw_bytes,
24
- rich_text_view_of_raw_bytes)
22
+ from yaralyzer.helpers.bytes_helper import ascii_view_of_raw_bytes, hex_view_of_raw_bytes, rich_text_view_of_raw_bytes
25
23
  from yaralyzer.helpers.rich_text_helper import CENTER, FOLD, MIDDLE, RIGHT, na_txt
26
24
 
27
- # The confidence and encoding will not be shown in the final display - instead their Text versions are shown
25
+ # The confidence and encoding will not be shown in the final display - instead their Text versions are shown.
26
+ # TODO: this should become a dataclass (requires Python 3.7+)
28
27
  DecodingTableRow = namedtuple(
29
28
  'DecodingTableRow',
30
29
  [
31
- 'encoding_text',
30
+ 'encoding_label',
32
31
  'confidence_text',
33
32
  'errors_while_decoded',
34
33
  'decoded_string',
34
+ # Properties below here are not displayed in the table but are used for sorting etc.
35
35
  'confidence',
36
36
  'encoding',
37
- 'sort_score'
37
+ 'sort_score',
38
+ 'encoding_label_plain', # For sorting purposes, if confidences match
38
39
  ]
39
40
  )
40
41
 
@@ -43,9 +44,9 @@ HEX = Text('HEX', style='bytes.title')
43
44
  RAW_BYTES = Text('Raw', style=f"bytes")
44
45
 
45
46
 
46
- def build_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
47
- """First rows are the raw / hex views of the bytes then 1 row per decoding attempt."""
48
- table = Table(show_lines=True, border_style='bytes', header_style='color(101) bold')
47
+ def new_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
48
+ """Build a new rich Table with two rows, the raw and hex views of the bytes_match data."""
49
+ table = Table(show_lines=True, border_style='bytes', header_style='decode.table_header')
49
50
 
50
51
  def add_col(title, **kwargs):
51
52
  kwargs['justify'] = kwargs.get('justify', CENTER)
@@ -64,19 +65,21 @@ def build_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
64
65
 
65
66
 
66
67
  def decoding_table_row(assessment: EncodingAssessment, is_forced: Text, txt: Text, score: float) -> DecodingTableRow:
67
- """Get a table row for a decoding attempt"""
68
+ """Build a table row for a decoding attempt."""
68
69
  return DecodingTableRow(
69
- assessment.encoding_text,
70
+ assessment.encoding_label,
70
71
  assessment.confidence_text,
71
72
  is_forced,
72
73
  txt,
73
74
  assessment.confidence,
74
75
  assessment.encoding,
75
- sort_score=score)
76
+ sort_score=score,
77
+ encoding_label_plain=assessment.encoding_label.plain
78
+ )
76
79
 
77
80
 
78
81
  def assessment_only_row(assessment: EncodingAssessment, score) -> DecodingTableRow:
79
- """Build a row with just chardet assessment data and no actual decoded string"""
82
+ """Build a row with just chardet assessment confidence data and no actual decoding attempt string."""
80
83
  return decoding_table_row(assessment, na_txt(), DECODE_NOT_ATTEMPTED_MSG, score)
81
84
 
82
85
 
@@ -85,8 +88,8 @@ def _hex_preview_subtable(bytes_match: BytesMatch) -> Table:
85
88
  hex_table = Table(
86
89
  'hex',
87
90
  'ascii',
88
- border_style='color(235) dim',
89
- header_style='color(101) bold',
91
+ border_style='grey.darkest',
92
+ header_style='decode.table_header',
90
93
  box=box.MINIMAL,
91
94
  show_lines=True,
92
95
  show_header=True,
@@ -53,6 +53,7 @@ YARALYZER_THEME_DICT = {
53
53
  'decode.section_header': 'color(100) reverse',
54
54
  'decode.subheading': PEACH,
55
55
  'decode.subheading_2': 'color(215) dim italic',
56
+ 'decode.table_header': 'color(101) bold',
56
57
  'headline': 'bold white underline',
57
58
  # bytes
58
59
  'ascii': 'color(58)',
@@ -234,6 +234,9 @@ def parse_arguments(args: Optional[Namespace] = None):
234
234
 
235
235
  if args.debug:
236
236
  log.setLevel(logging.DEBUG)
237
+
238
+ if args.log_level and args.log_level != 'DEBUG':
239
+ log.warning("Ignoring --log-level option as debug mode means log level is DEBUG")
237
240
  elif args.log_level:
238
241
  log.setLevel(args.log_level)
239
242
 
yaralyzer/util/logging.py CHANGED
@@ -54,7 +54,6 @@ def configure_logger(log_label: str) -> logging.Logger:
54
54
  rich_stream_handler = RichHandler(rich_tracebacks=True)
55
55
  rich_stream_handler.setLevel('WARN')
56
56
  logger.addHandler(rich_stream_handler)
57
- logger.info('File logging triggered by setting of YARALYZER_LOG_DIR')
58
57
  else:
59
58
  logger.addHandler(RichHandler(rich_tracebacks=True))
60
59
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: yaralyzer
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: Visualize and force decode YARA and regex matches found in a file or byte stream. With colors. Lots of colors.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/yaralyzer
6
6
  License: GPL-3.0-or-later
@@ -0,0 +1,31 @@
1
+ .yaralyzer.example,sha256=z3_mk41xxm0Pr_8MGM7AKQG0xEFRtGcyJLboMuelRp4,3504
2
+ CHANGELOG.md,sha256=5DIUFaaTQpYkCLKYotRBpBZUMILJgP4ECtqla9zKJRY,2539
3
+ yaralyzer/__init__.py,sha256=YItEM_QKbLUj-6QZg2ZINrTzPQZ1IHOjGgoxmRR2buA,2703
4
+ yaralyzer/bytes_match.py,sha256=ShAxI_jZYElG1w-FJ9wNF-5SReL2uv-iJTiQQS3VTM0,8213
5
+ yaralyzer/config.py,sha256=VU5RTQwbNV3Ai02p4mAjiJrbL30gjjf9xBGl4IOh0Qs,3927
6
+ yaralyzer/decoding/bytes_decoder.py,sha256=tJKFoWChIpmgW23XiCwlfZCHdTXUz5z277U8-CXsjsg,8752
7
+ yaralyzer/decoding/decoding_attempt.py,sha256=8o0A4gidE4olW187QXIAYrAAYdjUsqoGz2YRgPqbJ3Y,8391
8
+ yaralyzer/encoding_detection/character_encodings.py,sha256=KqN0sdGZsVMaJM9qjGfcZNyyjcUPlTCob0jBLh-DW7E,5383
9
+ yaralyzer/encoding_detection/encoding_assessment.py,sha256=-YMjkl4AuQYBdq2SFMw1LvA7A8auNxtVIM93az9Xwzc,2368
10
+ yaralyzer/encoding_detection/encoding_detector.py,sha256=tqTgTOv7WjQgfVhShGETXgJmZFw16HoQ2l6WhLlAY34,4738
11
+ yaralyzer/helpers/bytes_helper.py,sha256=8AEW3aPv0dROD-srfe8z9m12bVZLrdvHRq-RBNQ4Vso,7442
12
+ yaralyzer/helpers/dict_helper.py,sha256=THbCgnTLgtM2v8MjjxZk2V296cYBghzjss6xhRBqYPQ,212
13
+ yaralyzer/helpers/file_helper.py,sha256=uf8dTOhRrJng0V36o7Mwk5t-L5gc4_uOaGj9F0s5OBA,1254
14
+ yaralyzer/helpers/list_helper.py,sha256=r3YUMkkVvl5R5VBzc3rxCzyPW_Nxhj5CRinBCHs9dAY,393
15
+ yaralyzer/helpers/rich_text_helper.py,sha256=elkWt2LoV_FnajK-UadEs_gPWSBE7NSsyJHN1eQsmgw,4213
16
+ yaralyzer/helpers/string_helper.py,sha256=AT2_CAgpvtp8GiUSKLTiDoToDD3tBB9BbrlX-s2bL7o,932
17
+ yaralyzer/output/decoding_attempts_table.py,sha256=x6AViJqAj7ept92OXWl9-PVk8MyBSyYt62mUgJjsP7U,4040
18
+ yaralyzer/output/file_export.py,sha256=YfF5D8aHOUQHwV0akFaaSMafbhdhUakvipadpq6HZmk,2927
19
+ yaralyzer/output/file_hashes_table.py,sha256=SnS2ip8dSeHoycQ0Ng3Gtpv9rXJSkKnvD2krTuhNg7s,1632
20
+ yaralyzer/output/regex_match_metrics.py,sha256=deJPaVnhpy-AUX6PCE_jbPLIlmfIOtl-cEVWsiFp3KY,3003
21
+ yaralyzer/output/rich_console.py,sha256=NJi6LjvoOfFXm9Kq9TQbZ3P32C5nQtahccUMEY_Ykpw,4248
22
+ yaralyzer/util/argument_parser.py,sha256=PNmdmFULBq10lAXOt9McZImQ-H5VNnrNN2LeTRxd0P0,12928
23
+ yaralyzer/util/logging.py,sha256=6N-JrQfAbVdCMYvqJ3MUHMchSwFN9208-0giWvX4OYY,4248
24
+ yaralyzer/yara/yara_match.py,sha256=4_26eaJT9I0PULiCdxerQtX4TfAIwcT-B6GJociGM9A,5119
25
+ yaralyzer/yara/yara_rule_builder.py,sha256=kAa3RBojM5GEaXDJjKZODAyx6yj34AlkOnQhACAFfZM,3021
26
+ yaralyzer/yaralyzer.py,sha256=f1y8qST6GZHEWl7nDNEBWpQuYjnsJ8dm9nGPWqZ4Hkk,9417
27
+ yaralyzer-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
28
+ yaralyzer-1.0.1.dist-info/METADATA,sha256=oWGgWkTQelQydVVU4o9rDqaxHayeGOn6c7-EI2DrlpI,10795
29
+ yaralyzer-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
30
+ yaralyzer-1.0.1.dist-info/entry_points.txt,sha256=7LnLJrNTfql0vuctjRWwp_ZD-BYvtv9ENVipdjuT7XI,136
31
+ yaralyzer-1.0.1.dist-info/RECORD,,
@@ -1,30 +0,0 @@
1
- .yaralyzer.example,sha256=4QKFDDNvnAlT3NVS0eiM_Qed9Hxy4ZPQFJkye8lKYAk,3486
2
- CHANGELOG.md,sha256=XN0_isuQMJy03PATrBkBIPWyIXzUV1g5YG1FMThCves,2158
3
- yaralyzer/__init__.py,sha256=YItEM_QKbLUj-6QZg2ZINrTzPQZ1IHOjGgoxmRR2buA,2703
4
- yaralyzer/bytes_match.py,sha256=ShAxI_jZYElG1w-FJ9wNF-5SReL2uv-iJTiQQS3VTM0,8213
5
- yaralyzer/config.py,sha256=eRJ88wBFs1rfjOv4htI1Ye0LFCFfk4kGDiFHuqZfkX0,3730
6
- yaralyzer/decoding/bytes_decoder.py,sha256=lulfZZhYmo9ky2KpqBW-c9hs5_uhlaz0gatdtT_NYSY,7951
7
- yaralyzer/decoding/decoding_attempt.py,sha256=GAxMNOX7I_FsuzGWIelTWAECytLUJD-wpmUAuVe2bn0,7241
8
- yaralyzer/encoding_detection/character_encodings.py,sha256=zrOUgNXwrcXkeYSgdo09vsFPmNYsTkaHvq7YzzpbMsk,4395
9
- yaralyzer/encoding_detection/encoding_assessment.py,sha256=yMb1QSHS7JpNyL6jnZwt9Vq0Y6ueVStYZjMKyP6-f5A,2307
10
- yaralyzer/encoding_detection/encoding_detector.py,sha256=e_UtZi1Nh3ZRBOESEFtjYz0vJ_1cZjIJ5uWRuzM91oc,4732
11
- yaralyzer/helpers/bytes_helper.py,sha256=XemBmf9tXgjoN-X7AULHL1wKS1lkQR6XXGt_D2lMBY0,6915
12
- yaralyzer/helpers/dict_helper.py,sha256=h8Sg01qCJRKfZ0bmTYhLP6X5OVxMg-7CZryJIjcbw8E,211
13
- yaralyzer/helpers/file_helper.py,sha256=uf8dTOhRrJng0V36o7Mwk5t-L5gc4_uOaGj9F0s5OBA,1254
14
- yaralyzer/helpers/rich_text_helper.py,sha256=9Wc6WM625iKxAXRvxBkVzvszfcxb8YtqoQ6d7d8EqoQ,4218
15
- yaralyzer/helpers/string_helper.py,sha256=AT2_CAgpvtp8GiUSKLTiDoToDD3tBB9BbrlX-s2bL7o,932
16
- yaralyzer/output/decoding_attempts_table.py,sha256=cMY9eCXZHj0FfGxJ9uoM5cpdhQve-EtTRHv3fTHKJAo,3712
17
- yaralyzer/output/file_export.py,sha256=YfF5D8aHOUQHwV0akFaaSMafbhdhUakvipadpq6HZmk,2927
18
- yaralyzer/output/file_hashes_table.py,sha256=SnS2ip8dSeHoycQ0Ng3Gtpv9rXJSkKnvD2krTuhNg7s,1632
19
- yaralyzer/output/regex_match_metrics.py,sha256=deJPaVnhpy-AUX6PCE_jbPLIlmfIOtl-cEVWsiFp3KY,3003
20
- yaralyzer/output/rich_console.py,sha256=Botb8aec4_aRiPyaEkwrnhwERHE8a5-lk5KfgzXVlBE,4202
21
- yaralyzer/util/argument_parser.py,sha256=cVUe3lQCb6iFnP5leE-C0OeXkBPomw55Xi1MiD1Gl50,12776
22
- yaralyzer/util/logging.py,sha256=3qtLnCFbN8L1nTSwIQvxfcM5jfhIRWTFZj9XGQk74kc,4326
23
- yaralyzer/yara/yara_match.py,sha256=4_26eaJT9I0PULiCdxerQtX4TfAIwcT-B6GJociGM9A,5119
24
- yaralyzer/yara/yara_rule_builder.py,sha256=kAa3RBojM5GEaXDJjKZODAyx6yj34AlkOnQhACAFfZM,3021
25
- yaralyzer/yaralyzer.py,sha256=f1y8qST6GZHEWl7nDNEBWpQuYjnsJ8dm9nGPWqZ4Hkk,9417
26
- yaralyzer-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
27
- yaralyzer-1.0.0.dist-info/METADATA,sha256=HfE-vyCyOUs_QyM7saQAU9b4hiS9poBxuwLuFf8jsNM,10795
28
- yaralyzer-1.0.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
29
- yaralyzer-1.0.0.dist-info/entry_points.txt,sha256=7LnLJrNTfql0vuctjRWwp_ZD-BYvtv9ENVipdjuT7XI,136
30
- yaralyzer-1.0.0.dist-info/RECORD,,