yaralyzer 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of yaralyzer might be problematic. Click here for more details.

.yaralyzer.example CHANGED
@@ -17,7 +17,7 @@
17
17
  # YARALYZER_STACK_SIZE=10485760
18
18
  # YARALYZER_MAX_MATCH_LENGTH=10737418240
19
19
 
20
- # Suppress all PDF binary regex matching/scanning/etc
20
+ # Suppress all attempts to decode bytes into various text encodings
21
21
  # YARALYZER_SUPPRESS_DECODES_TABLE=False
22
22
 
23
23
  # Suppress the display of the table showing the encoding assessments given by `chardet.detect()`
@@ -30,6 +30,8 @@
30
30
  # Configure how many bytes before and after any binary data should be included in scans and visualizations
31
31
  # YARALYZER_SURROUNDING_BYTES=64
32
32
 
33
+
34
+
33
35
  # Size thresholds (in bytes) under/over which yaralyzer will NOT make attempts to decode a match.
34
36
  # Longer byte sequences are for obvious reasons slower to decode by force.
35
37
  # It may feel counterintuitive but larger chunks of random binary are also harder to examine and
@@ -45,6 +47,8 @@
45
47
  # Minimum bytes to run chardet.detect() on a sequence of bytes
46
48
  # YARALYZER_MIN_BYTES_TO_DETECT_ENCODING
47
49
 
50
+
51
+
48
52
  # Directory to write application logs to. Must be an absolute path, not a relative one.
49
53
  # These logs are not normally written to a file and the default log level means that the standard behavior
50
54
  # is to more or less discard them. Be aware that if you configure this variable a few things will change:
CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.0.1
4
+ * Fix iteration of byte offsets during attempted decodes for UTF-16 and UTF-32 (was starting at second byte instead of first)
5
+ * Label the byte offset for forced UTF-16 and UTF-32 decodes
6
+ * Show helpful message if logs are being sent to files in `YaralyzerConfig.LOG_DIR` instead of being written to stderr/stdout
7
+ * Warn if `--debug` and `--log-level` args both provided
8
+
3
9
  # 1.0.0
4
10
  * Add `--export-json` option
5
11
 
yaralyzer/config.py CHANGED
@@ -3,6 +3,8 @@ from argparse import ArgumentParser, Namespace
3
3
  from os import environ
4
4
  from typing import Any, List
5
5
 
6
+ from rich.console import Console
7
+
6
8
  YARALYZE = 'yaralyze'
7
9
  YARALYZER = f"{YARALYZE}r".upper()
8
10
  PYTEST_FLAG = 'INVOKED_BY_PYTEST'
@@ -56,6 +58,9 @@ class YaralyzerConfig:
56
58
  LOG_LEVEL_ENV_VAR = f"{YARALYZER}_LOG_LEVEL"
57
59
  LOG_LEVEL = logging.getLevelName(environ.get(LOG_LEVEL_ENV_VAR, 'WARN'))
58
60
 
61
+ if LOG_DIR and not is_invoked_by_pytest():
62
+ Console(color_system='256').print(f"Writing logs to '{LOG_DIR}' instead of stderr/stdout...", style='dim')
63
+
59
64
  HIGHLIGHT_STYLE = 'orange1'
60
65
 
61
66
  ONLY_CLI_ARGS = [
@@ -5,6 +5,7 @@ in the results.
5
5
  """
6
6
 
7
7
  from collections import defaultdict
8
+ from copy import deepcopy
8
9
  from operator import attrgetter
9
10
  from typing import List, Optional
10
11
 
@@ -13,18 +14,18 @@ from rich.console import Console, ConsoleOptions, NewLine, RenderResult
13
14
  from rich.panel import Panel
14
15
  from rich.table import Table
15
16
  from rich.text import Text
16
- from yaralyzer import bytes_match
17
17
 
18
18
  #from yaralyzer.bytes_match import BytesMatch
19
19
  from yaralyzer.config import YaralyzerConfig
20
20
  from yaralyzer.decoding.decoding_attempt import DecodingAttempt
21
- from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT
21
+ from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT, encoding_offsets
22
22
  from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
23
23
  from yaralyzer.encoding_detection.encoding_detector import EncodingDetector
24
24
  from yaralyzer.helpers.dict_helper import get_dict_key_by_value
25
+ from yaralyzer.helpers.list_helper import flatten
25
26
  from yaralyzer.helpers.rich_text_helper import CENTER, DECODING_ERRORS_MSG, NO_DECODING_ERRORS_MSG
26
27
  from yaralyzer.output.decoding_attempts_table import (DecodingTableRow, assessment_only_row,
27
- build_decoding_attempts_table, decoding_table_row)
28
+ decoding_table_row, new_decoding_attempts_table)
28
29
  from yaralyzer.util.logging import log
29
30
 
30
31
  # A 2-tuple that can be indexed by booleans of messages used in the table to show true vs. false
@@ -63,51 +64,48 @@ class BytesDecoder:
63
64
 
64
65
  # In standalone mode we always print the hex/raw bytes
65
66
  if self.bytes_match.is_decodable():
66
- yield self._generate_decodings_table()
67
+ yield self._build_decodings_table()
67
68
  elif YaralyzerConfig.args.standalone_mode:
68
- # TODO: yield self.bytes_match.suppression_notice()
69
- yield self._generate_decodings_table(True)
69
+ # TODO: yield self.bytes_match.suppression_notice() (i guess to show some notice that things are suppressed?)
70
+ yield self._build_decodings_table(True)
70
71
 
71
72
  yield NewLine()
72
73
  yield Align(self.bytes_match.bytes_hashes_table(), CENTER, style='dim')
73
74
 
74
- def _generate_decodings_table(self, suppress_decodes: bool = False) -> Table:
75
+ def _build_decodings_table(self, suppress_decodes: bool = False) -> Table:
75
76
  """First rows are the raw / hex views of the bytes, next rows are the attempted decodings"""
76
- self.table = build_decoding_attempts_table(self.bytes_match)
77
+ self.table = new_decoding_attempts_table(self.bytes_match)
77
78
 
78
- if YaralyzerConfig.args.suppress_decoding_attempts or suppress_decodes:
79
- return self.table
79
+ # Add the encoding rows to the table if not suppressed
80
+ if not (YaralyzerConfig.args.suppress_decoding_attempts or suppress_decodes):
81
+ self.decodings = [DecodingAttempt(self.bytes_match, encoding) for encoding in ENCODINGS_TO_ATTEMPT]
82
+ # Attempt decodings we don't usually attempt if chardet is insistent enough
83
+ forced_decodes = self._undecoded_assessments(self.encoding_detector.force_decode_assessments)
84
+ self.decodings += [DecodingAttempt(self.bytes_match, a.encoding) for a in forced_decodes]
80
85
 
81
- self.decodings = [
82
- DecodingAttempt(self.bytes_match, encoding)
83
- for encoding in ENCODINGS_TO_ATTEMPT.keys()
84
- ]
86
+ # If we still haven't decoded chardet's top choice, decode it
87
+ if len(self._forced_displays()) > 0 and not self._was_decoded(self._forced_displays()[0].encoding):
88
+ chardet_top_encoding = self._forced_displays()[0].encoding
89
+ log.info(f"Decoding {chardet_top_encoding} because it's chardet top choice...")
90
+ self.decodings.append(DecodingAttempt(self.bytes_match, chardet_top_encoding))
85
91
 
86
- # Attempt decodings we don't usually attempt if chardet is insistent enough
87
- forced_decodes = self._undecoded_assessments(self.encoding_detector.force_decode_assessments)
88
- self.decodings += [DecodingAttempt(self.bytes_match, a.encoding) for a in forced_decodes]
92
+ # Build the table rows from the decoding attempts
93
+ rows = [self._row_from_decoding_attempt(decoding) for decoding in self.decodings]
94
+ rows += [assessment_only_row(a, a.confidence * SCORE_SCALER) for a in self._forced_displays()]
95
+ self._track_decode_stats()
89
96
 
90
- # If we still haven't decoded chardets top choice, decode it
91
- if len(self._forced_displays()) > 0 and not self._was_decoded(self._forced_displays()[0].encoding):
92
- chardet_top_encoding = self._forced_displays()[0].encoding
93
- log.info(f"Decoding {chardet_top_encoding} because it's chardet top choice...")
94
- self.decodings.append(DecodingAttempt(self.bytes_match, chardet_top_encoding))
95
-
96
- rows = [self._row_from_decoding_attempt(decoding) for decoding in self.decodings]
97
- rows += [assessment_only_row(a, a.confidence * SCORE_SCALER) for a in self._forced_displays()]
98
- self._track_decode_stats()
99
-
100
- for row in sorted(rows, key=attrgetter('sort_score'), reverse=True):
101
- self.table.add_row(*row[0:4])
97
+ for row in sorted(rows, key=attrgetter('sort_score', 'encoding_label_plain'), reverse=True):
98
+ self.table.add_row(*row[0:4])
102
99
 
103
100
  return self.table
104
101
 
102
+ # TODO: rename this to something that makes more sense, maybe assessments_over_display_threshold()?
105
103
  def _forced_displays(self) -> List[EncodingAssessment]:
106
- """Returns assessments over the display threshold that are not yet decoded"""
104
+ """Returns assessments over the display threshold that are not yet decoded."""
107
105
  return self._undecoded_assessments(self.encoding_detector.force_display_assessments)
108
106
 
109
107
  def _undecoded_assessments(self, assessments: List[EncodingAssessment]) -> List[EncodingAssessment]:
110
- """Fiter out the already decoded assessments from a set of assessments"""
108
+ """Filter out the already decoded assessments from a set of assessments"""
111
109
  return [a for a in assessments if not self._was_decoded(a.encoding)]
112
110
 
113
111
  def _was_decoded(self, encoding: str) -> bool:
@@ -115,7 +113,7 @@ class BytesDecoder:
115
113
  return any(row.encoding == encoding for row in self.decodings)
116
114
 
117
115
  def _decode_attempt_subheading(self) -> Panel:
118
- """Generate a rich.Panel for decode attempts"""
116
+ """Generate a rich.Panel for displaying decode attempts"""
119
117
  headline = Text(f"Found ", style='decode.subheading') + self.bytes_match.__rich__()
120
118
  return Panel(headline, style='decode.subheading', expand=False)
121
119
 
@@ -132,26 +130,33 @@ class BytesDecoder:
132
130
  self.was_match_force_decoded[decoding.encoding] += 1
133
131
 
134
132
  def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
135
- """
136
- Create a DecodingAttemptTable row from a DecodingAttempt.
137
- If the decoding result is a duplicate of a previous decoding, replace the decoded text
138
- with "same output as X" where X is the previous encoding that gave the same result.
139
- """
133
+ """Create a DecodingAttemptTable row from a DecodingAttempt."""
140
134
  assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)
135
+
136
+ # If the decoding can have a start offset add an appropriate extension to the encoding label
137
+ if decoding.start_offset_label:
138
+ if assessment.language:
139
+ log.warning(f"{decoding.encoding} has offset {decoding.start_offset} and language '{assessment.language}'")
140
+ else:
141
+ assessment = deepcopy(assessment)
142
+ assessment.set_encoding_label(decoding.start_offset_label)
143
+
141
144
  plain_decoded_string = decoding.decoded_string.plain
142
145
  sort_score = assessment.confidence * SCORE_SCALER
143
146
 
147
+ # If the decoding result is a duplicate of a previous decoding, replace the decoded text
148
+ # with "same output as X" where X is the previous encoding that gave the same result.
144
149
  if plain_decoded_string in self.decoded_strings.values():
145
150
  encoding_with_same_output = get_dict_key_by_value(self.decoded_strings, plain_decoded_string)
146
151
  display_text = Text('same output as ', style='color(66) dim italic')
147
152
  display_text.append(encoding_with_same_output, style=ENCODING).append('...', style='white')
148
153
  else:
149
- self.decoded_strings[decoding.encoding] = plain_decoded_string
154
+ self.decoded_strings[decoding.encoding_label] = plain_decoded_string
150
155
  display_text = decoding.decoded_string
151
156
 
152
157
  # Set failures negative, shave off a little for forced decodes
153
158
  if decoding.failed_to_decode:
154
- sort_score = sort_score * -1 - 100
159
+ sort_score = (sort_score * -1) - 100
155
160
  elif decoding.was_force_decoded:
156
161
  sort_score -= 10
157
162
 
@@ -1,14 +1,15 @@
1
+ """
2
+ Class to manage attempting to decode a chunk of bytes into strings with a given encoding.
3
+ """
1
4
  from sys import byteorder
2
5
  from typing import Optional
3
6
 
4
7
  from rich.markup import escape
5
- from rich.panel import Panel
6
8
  from rich.text import Text
7
9
 
8
- #from yaralyzer.bytes_match import ALERT_STYLE, BytesMatch
9
10
  from yaralyzer.encoding_detection.character_encodings import (ENCODINGS_TO_ATTEMPT, SINGLE_BYTE_ENCODINGS,
10
- UTF_8, UTF_16, UTF_32)
11
- from yaralyzer.helpers.bytes_helper import clean_byte_string
11
+ UTF_8, encoding_width, is_wide_utf)
12
+ from yaralyzer.helpers.bytes_helper import clean_byte_string, truncate_for_encoding
12
13
  from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj, unprintable_byte_to_text
13
14
  from yaralyzer.output.rich_console import ALERT_STYLE, BYTES_BRIGHTER, BYTES_BRIGHTEST, BYTES_NO_DIM, GREY_ADDRESS
14
15
  from yaralyzer.util.logging import log
@@ -16,17 +17,27 @@ from yaralyzer.util.logging import log
16
17
 
17
18
  class DecodingAttempt:
18
19
  def __init__(self, bytes_match: 'BytesMatch', encoding: str) -> None:
20
+ # Args
21
+ self.bytes = bytes_match.surrounding_bytes
19
22
  self.bytes_match = bytes_match
20
23
  self.encoding = encoding
21
- self.bytes = bytes_match.surrounding_bytes
24
+ # Inferred / derived values
25
+ self.encoding_label = encoding
26
+ self.start_offset = 0 # Offset in bytes to start decoding from
27
+ self.start_offset_label = None # String to indicate what offset we were able to decode
22
28
  self.was_force_decoded = False
23
29
  self.failed_to_decode = False
24
30
  self.decoded_string = self._decode_bytes()
25
31
 
32
+ def is_wide_utf_encoding(self) -> bool:
33
+ """Returns True if the encoding is UTF-16 or UTF-32"""
34
+ return is_wide_utf(self.encoding)
35
+
26
36
  def _decode_bytes(self) -> Text:
27
37
  """
28
- Sets self.decoded_string. Tries builtin decode, hands off to other methods for harsher treatement
29
- (Byte shifting for UTF-16/32 and custom decode for the rest) if that fails
38
+ Tries builtin decode, hands off to other methods for harsher treatment
39
+ (byte shifting for UTF-16/32 and custom decode for the rest) if that fails.
40
+ Has side effect of setting 'self.decoded_string' value.
30
41
  """
31
42
  try:
32
43
  decoded_string = self._to_rich_text(escape(self.bytes.decode(self.encoding)))
@@ -36,12 +47,12 @@ class DecodingAttempt:
36
47
  log.info(f"{self.encoding} failed on 1st pass decoding {self.bytes_match} capture; custom decoding...")
37
48
  except LookupError as e:
38
49
  log.warning(f"Unknown encoding: {self.encoding}. {e}")
39
- return self._failed_to_decode(e)
50
+ return self._failed_to_decode_msg_txt(e)
40
51
 
41
52
  self.was_force_decoded = True
42
53
 
43
- if self.encoding in [UTF_16, UTF_32]:
44
- return self._decode_utf_multibyte_with_byte_offset()
54
+ if self.is_wide_utf_encoding():
55
+ return self._decode_utf_multibyte()
45
56
  else:
46
57
  return self._custom_decode()
47
58
 
@@ -103,34 +114,35 @@ class DecodingAttempt:
103
114
 
104
115
  return output
105
116
 
106
- def _decode_utf_multibyte_with_byte_offset(self) -> Text:
107
- """ UTF-16/32 are fixed width (and wide)"""
108
- char_width = 2 if self.encoding == UTF_16 else 4
109
- log.debug(f"Decoding {self.encoding}, char_width is {char_width}...")
117
+ def _decode_utf_multibyte(self) -> Text:
118
+ """UTF-16/32 are fixed width and multibyte and therefore depend on the position of the starting byte."""
119
+ char_width = encoding_width(self.encoding)
110
120
  last_exception = None
111
121
  decoded_str = None
112
- bytes_offset = 1
122
+ bytes_offset = 0
113
123
 
124
+ # Iterate through the possible byte offsets until we find a valid decoded string (or don't)
114
125
  while bytes_offset < char_width:
115
126
  try:
116
- decoded_str = self.bytes[bytes_offset:].decode(self.encoding)
127
+ decoded_str = truncate_for_encoding(self.bytes[bytes_offset:], self.encoding).decode(self.encoding)
117
128
  except UnicodeDecodeError as e:
118
129
  log.info(f"Exception decoding w/offset {bytes_offset} in {self.encoding}: {e}")
119
130
  last_exception = e
120
131
 
132
+ # Append the current bytes_offset to the encoding label if we found a valid decoded string
121
133
  if decoded_str is not None:
134
+ log.debug(f"Successfully decoded '{self.encoding}' w/offset {bytes_offset}")
135
+ self.start_offset = bytes_offset
136
+ self.start_offset_label = f"offset {self.start_offset} byte" + ('s' if self.start_offset > 1 else '')
137
+ self.encoding_label = f"{self.encoding} ({self.start_offset_label})"
122
138
  break
123
139
 
124
140
  bytes_offset += 1
125
141
 
126
- if decoded_str is None:
127
- return self._failed_to_decode(last_exception)
128
-
129
- return self._to_rich_text(decoded_str, bytes_offset)
130
-
131
- def _failed_to_decode(self, exception: Optional[Exception]) -> Text:
132
- self.failed_to_decode = True
133
- return prefix_with_plain_text_obj(f"(decode failed: {exception})", style='red dim italic')
142
+ if decoded_str is not None:
143
+ return self._to_rich_text(decoded_str, bytes_offset)
144
+ else:
145
+ return self._failed_to_decode_msg_txt(last_exception)
134
146
 
135
147
  def _to_rich_text(self, _string: str, bytes_offset: int=0) -> Text:
136
148
  """Convert a decoded string to highlighted Text representation"""
@@ -165,3 +177,8 @@ class DecodingAttempt:
165
177
  current_byte_idx += char_width
166
178
 
167
179
  return txt
180
+
181
+ def _failed_to_decode_msg_txt(self, exception: Optional[Exception]) -> Text:
182
+ """Set failed_to_decode flag and return a Text object with the error message."""
183
+ self.failed_to_decode = True
184
+ return prefix_with_plain_text_obj(f"(decode failed: {exception})", style='red dim italic')
@@ -136,22 +136,49 @@ UNPRINTABLE_ISO_8859_7.update({
136
136
  })
137
137
 
138
138
 
139
- # The encodings we will attempt to actually use
140
- # Values are the unprintable values in that encoding in a dict (keys in dict are ints)
139
+ # Keys are names of encodings we will attempt to decode with, values are dicts mapping the unprintable bytes
140
+ # in that encoding to appropriate string representations of those unprintable bytes.
141
+ # Order matters here, as we will attempt the decoding in the order of the keys.
141
142
  ENCODINGS_TO_ATTEMPT = {
142
143
  ASCII: UNPRINTABLE_ASCII,
143
- UTF_8: UNPRINTABLE_UTF_8,
144
- UTF_16: None,
145
- UTF_32: None, # UTF-16 and 32 are handled differently
146
- #'utf-7':
144
+ UTF_8: UNPRINTABLE_UTF_8,
145
+ UTF_16: None,
146
+ UTF_32: None, # UTF-16 and 32 are handled differently
147
147
  ISO_8859_1: UNPRINTABLE_ISO_8859_1,
148
- WINDOWS_1252: UNPRINTABLE_WIN_1252
148
+ WINDOWS_1252: UNPRINTABLE_WIN_1252,
149
+ #'utf-7':
150
+ }
151
+
152
+ SINGLE_BYTE_ENCODINGS = [
153
+ ASCII,
154
+ ISO_8859_1,
155
+ WINDOWS_1252,
156
+ ]
157
+
158
+ # Keys are encodings that use multiple bytes to represent a single character, values are the possible offsets
159
+ # to attempt to use as the starting point for decoding in a given set of bytes.
160
+ WIDE_UTF_ENCODINGS = {
161
+ UTF_16: [0, 1],
162
+ UTF_32: [0, 1, 2, 3],
149
163
  }
150
164
 
151
165
 
152
- SINGLE_BYTE_ENCODINGS = [ASCII, ISO_8859_1, WINDOWS_1252]
166
+ def encoding_offsets(encoding: str) -> list:
167
+ """Get possible offsets for a given encoding. If the encoding is not in WIDE_UTF_ENCODINGS, return [0]."""
168
+ return WIDE_UTF_ENCODINGS.get(encoding, [0])
169
+
170
+
171
+ def encoding_width(encoding: str) -> int:
172
+ """Get the width of a character in bytes for a given encoding, which is the number of possible offsets."""
173
+ return len(encoding_offsets(encoding))
174
+
175
+
176
+ def is_wide_utf(encoding: str) -> bool:
177
+ """Check if the encoding is a wide UTF encoding (UTF-16 or UTF-32)."""
178
+ return encoding in WIDE_UTF_ENCODINGS
179
+
153
180
 
154
- # Unused cruft (mostly Asian language encodings)
181
+ # TODO: this is unused cruft (mostly Asian language encodings)
155
182
  ENCODINGS = [
156
183
  'big5',
157
184
  'big5hkscs',
@@ -1,7 +1,7 @@
1
1
  """
2
2
  Class to smooth some of the rough edges around the dicts returned by chardet.detect_all()
3
3
  """
4
- from typing import Any
4
+ from typing import Any, Optional
5
5
 
6
6
  from rich.text import Text
7
7
 
@@ -17,30 +17,32 @@ class EncodingAssessment:
17
17
  def __init__(self, assessment: dict) -> None:
18
18
  self.assessment = assessment
19
19
  self.encoding = assessment[ENCODING].lower()
20
- self.encoding_text = Text(self.encoding, 'encoding.header')
21
- self.language = self._get_dict_empty_value_as_None(LANGUAGE)
22
- self.language_text = None if self.language is None else Text(self.language, 'encoding.language')
23
20
 
24
21
  # Shift confidence from 0-1.0 scale to 0-100.0 scale
25
- confidence = self._get_dict_empty_value_as_None(CONFIDENCE) or 0.0
26
- assert isinstance(confidence, float)
27
- self.confidence = 100.0 * confidence
22
+ self.confidence = 100.0 * (self._get_dict_empty_value_as_None(CONFIDENCE) or 0.0)
28
23
  self.confidence_text = prefix_with_plain_text_obj(f"{round(self.confidence, 1)}%", style=meter_style(self.confidence))
29
24
 
30
- # Pair the language name with the encoding name into one Text obj
31
- if self.language is not None:
32
- dim = 'dim' if confidence < DIM_COUNTRY_THRESHOLD else ''
33
- self.encoding_text.append(f" ({self.language.title()})", style=f"color(23) {dim}")
25
+ # Add detected language info and label if any language was detected
26
+ self.language = self._get_dict_empty_value_as_None(LANGUAGE)
27
+ self.set_encoding_label(self.language.title() if self.language else None)
34
28
 
35
29
  @classmethod
36
30
  def dummy_encoding_assessment(cls, encoding) -> 'EncodingAssessment':
37
- """Generate an empty EncodingAssessment to use as a dummy when chardet gives us nothing"""
38
- assessment = cls({ENCODING: encoding, 'confidence': 0.0})
31
+ """Generate an empty EncodingAssessment to use as a dummy when chardet gives us nothing."""
32
+ assessment = cls({ENCODING: encoding, CONFIDENCE: 0.0})
39
33
  assessment.confidence_text = Text('none', 'no_attempt')
40
34
  return assessment
41
35
 
36
+ def set_encoding_label(self, alt_text: Optional[str]) -> None:
37
+ """Alt text is displayed below the encoding in slightly dimmer font."""
38
+ self.encoding_label = Text(self.encoding, 'encoding.header')
39
+
40
+ if alt_text is not None:
41
+ dim = 'dim' if (self.confidence or 0.0) < DIM_COUNTRY_THRESHOLD else ''
42
+ self.encoding_label.append(f" ({alt_text})", style=f"color(23) {dim}")
43
+
42
44
  def __rich__(self) -> Text:
43
- return Text('<Chardet(', 'white') + self.encoding_text + Text(':') + self.confidence_text + Text('>')
45
+ return Text('<Chardet(', 'white') + self.encoding_label + Text(':') + self.confidence_text + Text('>')
44
46
 
45
47
  def __str__(self) -> str:
46
48
  return self.__rich__().plain
@@ -53,7 +53,7 @@ class EncodingDetector:
53
53
  self.force_decode_assessments = self.assessments_above_confidence(type(self).force_decode_threshold)
54
54
  self.force_display_assessments = self.assessments_above_confidence(type(self).force_display_threshold)
55
55
 
56
- def get_encoding_assessment(self, encoding) -> EncodingAssessment:
56
+ def get_encoding_assessment(self, encoding: str) -> EncodingAssessment:
57
57
  """If chardet produced one, return it, otherwise return a dummy node with confidence of 0"""
58
58
  assessment = next((r for r in self.unique_assessments if r.encoding == encoding), None)
59
59
  return assessment or EncodingAssessment.dummy_encoding_assessment(encoding)
@@ -75,7 +75,7 @@ class EncodingDetector:
75
75
  if result.confidence < YaralyzerConfig.args.min_chardet_table_confidence:
76
76
  continue
77
77
 
78
- self.table.add_row(f"{i + 1}", result.encoding_text, result.confidence_text)
78
+ self.table.add_row(f"{i + 1}", result.encoding_label, result.confidence_text)
79
79
 
80
80
  # self.unique_assessments retains one result per encoding possibility (the highest confidence one)
81
81
  # Some encodings are not language specific and for those we don't care about the language
@@ -11,7 +11,7 @@ from rich.text import Text
11
11
 
12
12
  from yaralyzer.bytes_match import BytesMatch
13
13
  from yaralyzer.config import YaralyzerConfig
14
- from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
14
+ from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE, encoding_width
15
15
  from yaralyzer.helpers.rich_text_helper import newline_join
16
16
  from yaralyzer.output.rich_console import (BYTES, BYTES_BRIGHTER, BYTES_BRIGHTEST,
17
17
  BYTES_HIGHLIGHT, GREY, console, console_width)
@@ -126,6 +126,21 @@ def print_bytes(bytes_array: bytes, style=None) -> None:
126
126
  console.print(escape(clean_byte_string(line)), style=style or 'bytes')
127
127
 
128
128
 
129
+ def truncate_for_encoding(_bytes: bytes, encoding: str) -> bytes:
130
+ """
131
+ Truncate bytes to a multiple of the char width of the given encoding.
132
+ For utf-16 this means truncate to a multiple of 2, for utf-32 to a multiple of 4.
133
+ """
134
+ char_width = encoding_width(encoding)
135
+ num_bytes = len(_bytes)
136
+ num_extra_bytes = num_bytes % char_width
137
+
138
+ if char_width <= 1 or num_bytes <= char_width or num_extra_bytes == 0:
139
+ return _bytes
140
+ else:
141
+ return _bytes[:-num_extra_bytes]
142
+
143
+
129
144
  def _find_str_rep_of_bytes(surrounding_bytes_str: str, highlighted_bytes_str: str, highlighted_bytes: BytesMatch):
130
145
  """
131
146
  Find the position of bytes_str in surrounding_byte_str. Both args are raw text dumps of binary data.
@@ -1,5 +1,5 @@
1
1
  """
2
- Help with dicts
2
+ Help with dicts.
3
3
  """
4
4
  from numbers import Number
5
5
 
@@ -0,0 +1,15 @@
1
+ """
2
+ Help with lists.
3
+ """
4
+
5
+ def flatten(a):
6
+ """From https://www.geeksforgeeks.org/python/python-flatten-list-to-individual-elements/"""
7
+ return_value = []
8
+
9
+ for x in a:
10
+ if isinstance(x, list):
11
+ return_value.extend(flatten(x)) # Recursively flatten nested lists
12
+ else:
13
+ return_value.append(x) # Append individual elements
14
+
15
+ return return_value
@@ -10,8 +10,7 @@ from rich.panel import Panel
10
10
  from rich.style import Style
11
11
  from rich.text import Text
12
12
 
13
- from yaralyzer.output.rich_console import (BYTES_BRIGHTEST, BYTES_HIGHLIGHT, YARALYZER_THEME_DICT,
14
- console)
13
+ from yaralyzer.output.rich_console import (BYTES_BRIGHTEST, BYTES_HIGHLIGHT, YARALYZER_THEME_DICT, console)
15
14
  from yaralyzer.util.logging import log
16
15
 
17
16
  # String constants
@@ -11,7 +11,6 @@ Final output should be rich.table of decoding attempts that are sorted like this
11
11
  3. Decodings that were the same as other decodings
12
12
  4. Failed decodings
13
13
  """
14
-
15
14
  from collections import namedtuple
16
15
 
17
16
  from rich import box
@@ -20,21 +19,23 @@ from rich.text import Text
20
19
 
21
20
  from yaralyzer.bytes_match import BytesMatch
22
21
  from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
23
- from yaralyzer.helpers.bytes_helper import (ascii_view_of_raw_bytes, hex_view_of_raw_bytes,
24
- rich_text_view_of_raw_bytes)
22
+ from yaralyzer.helpers.bytes_helper import ascii_view_of_raw_bytes, hex_view_of_raw_bytes, rich_text_view_of_raw_bytes
25
23
  from yaralyzer.helpers.rich_text_helper import CENTER, FOLD, MIDDLE, RIGHT, na_txt
26
24
 
27
- # The confidence and encoding will not be shown in the final display - instead their Text versions are shown
25
+ # The confidence and encoding will not be shown in the final display - instead their Text versions are shown.
26
+ # TODO: this should become a dataclass (requires Python 3.7+)
28
27
  DecodingTableRow = namedtuple(
29
28
  'DecodingTableRow',
30
29
  [
31
- 'encoding_text',
30
+ 'encoding_label',
32
31
  'confidence_text',
33
32
  'errors_while_decoded',
34
33
  'decoded_string',
34
+ # Properties below here are not displayed in the table but are used for sorting etc.
35
35
  'confidence',
36
36
  'encoding',
37
- 'sort_score'
37
+ 'sort_score',
38
+ 'encoding_label_plain', # For sorting purposes, if confidences match
38
39
  ]
39
40
  )
40
41
 
@@ -43,9 +44,9 @@ HEX = Text('HEX', style='bytes.title')
43
44
  RAW_BYTES = Text('Raw', style=f"bytes")
44
45
 
45
46
 
46
- def build_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
47
- """First rows are the raw / hex views of the bytes then 1 row per decoding attempt."""
48
- table = Table(show_lines=True, border_style='bytes', header_style='color(101) bold')
47
+ def new_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
48
+ """Build a new rich Table with two rows, the raw and hex views of the bytes_match data."""
49
+ table = Table(show_lines=True, border_style='bytes', header_style='decode.table_header')
49
50
 
50
51
  def add_col(title, **kwargs):
51
52
  kwargs['justify'] = kwargs.get('justify', CENTER)
@@ -64,19 +65,21 @@ def build_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
64
65
 
65
66
 
66
67
  def decoding_table_row(assessment: EncodingAssessment, is_forced: Text, txt: Text, score: float) -> DecodingTableRow:
67
- """Get a table row for a decoding attempt"""
68
+ """Build a table row for a decoding attempt."""
68
69
  return DecodingTableRow(
69
- assessment.encoding_text,
70
+ assessment.encoding_label,
70
71
  assessment.confidence_text,
71
72
  is_forced,
72
73
  txt,
73
74
  assessment.confidence,
74
75
  assessment.encoding,
75
- sort_score=score)
76
+ sort_score=score,
77
+ encoding_label_plain=assessment.encoding_label.plain
78
+ )
76
79
 
77
80
 
78
81
  def assessment_only_row(assessment: EncodingAssessment, score) -> DecodingTableRow:
79
- """Build a row with just chardet assessment data and no actual decoded string"""
82
+ """Build a row with just chardet assessment confidence data and no actual decoding attempt string."""
80
83
  return decoding_table_row(assessment, na_txt(), DECODE_NOT_ATTEMPTED_MSG, score)
81
84
 
82
85
 
@@ -85,8 +88,8 @@ def _hex_preview_subtable(bytes_match: BytesMatch) -> Table:
85
88
  hex_table = Table(
86
89
  'hex',
87
90
  'ascii',
88
- border_style='color(235) dim',
89
- header_style='color(101) bold',
91
+ border_style='grey.darkest',
92
+ header_style='decode.table_header',
90
93
  box=box.MINIMAL,
91
94
  show_lines=True,
92
95
  show_header=True,
@@ -53,6 +53,7 @@ YARALYZER_THEME_DICT = {
53
53
  'decode.section_header': 'color(100) reverse',
54
54
  'decode.subheading': PEACH,
55
55
  'decode.subheading_2': 'color(215) dim italic',
56
+ 'decode.table_header': 'color(101) bold',
56
57
  'headline': 'bold white underline',
57
58
  # bytes
58
59
  'ascii': 'color(58)',
@@ -234,6 +234,9 @@ def parse_arguments(args: Optional[Namespace] = None):
234
234
 
235
235
  if args.debug:
236
236
  log.setLevel(logging.DEBUG)
237
+
238
+ if args.log_level and args.log_level != 'DEBUG':
239
+ log.warning("Ignoring --log-level option as debug mode means log level is DEBUG")
237
240
  elif args.log_level:
238
241
  log.setLevel(args.log_level)
239
242
 
yaralyzer/util/logging.py CHANGED
@@ -54,7 +54,6 @@ def configure_logger(log_label: str) -> logging.Logger:
54
54
  rich_stream_handler = RichHandler(rich_tracebacks=True)
55
55
  rich_stream_handler.setLevel('WARN')
56
56
  logger.addHandler(rich_stream_handler)
57
- logger.info('File logging triggered by setting of YARALYZER_LOG_DIR')
58
57
  else:
59
58
  logger.addHandler(RichHandler(rich_tracebacks=True))
60
59
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: yaralyzer
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: Visualize and force decode YARA and regex matches found in a file or byte stream. With colors. Lots of colors.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/yaralyzer
6
6
  License: GPL-3.0-or-later
@@ -0,0 +1,31 @@
1
+ .yaralyzer.example,sha256=z3_mk41xxm0Pr_8MGM7AKQG0xEFRtGcyJLboMuelRp4,3504
2
+ CHANGELOG.md,sha256=5DIUFaaTQpYkCLKYotRBpBZUMILJgP4ECtqla9zKJRY,2539
3
+ yaralyzer/__init__.py,sha256=YItEM_QKbLUj-6QZg2ZINrTzPQZ1IHOjGgoxmRR2buA,2703
4
+ yaralyzer/bytes_match.py,sha256=ShAxI_jZYElG1w-FJ9wNF-5SReL2uv-iJTiQQS3VTM0,8213
5
+ yaralyzer/config.py,sha256=VU5RTQwbNV3Ai02p4mAjiJrbL30gjjf9xBGl4IOh0Qs,3927
6
+ yaralyzer/decoding/bytes_decoder.py,sha256=tJKFoWChIpmgW23XiCwlfZCHdTXUz5z277U8-CXsjsg,8752
7
+ yaralyzer/decoding/decoding_attempt.py,sha256=8o0A4gidE4olW187QXIAYrAAYdjUsqoGz2YRgPqbJ3Y,8391
8
+ yaralyzer/encoding_detection/character_encodings.py,sha256=KqN0sdGZsVMaJM9qjGfcZNyyjcUPlTCob0jBLh-DW7E,5383
9
+ yaralyzer/encoding_detection/encoding_assessment.py,sha256=-YMjkl4AuQYBdq2SFMw1LvA7A8auNxtVIM93az9Xwzc,2368
10
+ yaralyzer/encoding_detection/encoding_detector.py,sha256=tqTgTOv7WjQgfVhShGETXgJmZFw16HoQ2l6WhLlAY34,4738
11
+ yaralyzer/helpers/bytes_helper.py,sha256=8AEW3aPv0dROD-srfe8z9m12bVZLrdvHRq-RBNQ4Vso,7442
12
+ yaralyzer/helpers/dict_helper.py,sha256=THbCgnTLgtM2v8MjjxZk2V296cYBghzjss6xhRBqYPQ,212
13
+ yaralyzer/helpers/file_helper.py,sha256=uf8dTOhRrJng0V36o7Mwk5t-L5gc4_uOaGj9F0s5OBA,1254
14
+ yaralyzer/helpers/list_helper.py,sha256=r3YUMkkVvl5R5VBzc3rxCzyPW_Nxhj5CRinBCHs9dAY,393
15
+ yaralyzer/helpers/rich_text_helper.py,sha256=elkWt2LoV_FnajK-UadEs_gPWSBE7NSsyJHN1eQsmgw,4213
16
+ yaralyzer/helpers/string_helper.py,sha256=AT2_CAgpvtp8GiUSKLTiDoToDD3tBB9BbrlX-s2bL7o,932
17
+ yaralyzer/output/decoding_attempts_table.py,sha256=x6AViJqAj7ept92OXWl9-PVk8MyBSyYt62mUgJjsP7U,4040
18
+ yaralyzer/output/file_export.py,sha256=YfF5D8aHOUQHwV0akFaaSMafbhdhUakvipadpq6HZmk,2927
19
+ yaralyzer/output/file_hashes_table.py,sha256=SnS2ip8dSeHoycQ0Ng3Gtpv9rXJSkKnvD2krTuhNg7s,1632
20
+ yaralyzer/output/regex_match_metrics.py,sha256=deJPaVnhpy-AUX6PCE_jbPLIlmfIOtl-cEVWsiFp3KY,3003
21
+ yaralyzer/output/rich_console.py,sha256=NJi6LjvoOfFXm9Kq9TQbZ3P32C5nQtahccUMEY_Ykpw,4248
22
+ yaralyzer/util/argument_parser.py,sha256=PNmdmFULBq10lAXOt9McZImQ-H5VNnrNN2LeTRxd0P0,12928
23
+ yaralyzer/util/logging.py,sha256=6N-JrQfAbVdCMYvqJ3MUHMchSwFN9208-0giWvX4OYY,4248
24
+ yaralyzer/yara/yara_match.py,sha256=4_26eaJT9I0PULiCdxerQtX4TfAIwcT-B6GJociGM9A,5119
25
+ yaralyzer/yara/yara_rule_builder.py,sha256=kAa3RBojM5GEaXDJjKZODAyx6yj34AlkOnQhACAFfZM,3021
26
+ yaralyzer/yaralyzer.py,sha256=f1y8qST6GZHEWl7nDNEBWpQuYjnsJ8dm9nGPWqZ4Hkk,9417
27
+ yaralyzer-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
28
+ yaralyzer-1.0.1.dist-info/METADATA,sha256=oWGgWkTQelQydVVU4o9rDqaxHayeGOn6c7-EI2DrlpI,10795
29
+ yaralyzer-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
30
+ yaralyzer-1.0.1.dist-info/entry_points.txt,sha256=7LnLJrNTfql0vuctjRWwp_ZD-BYvtv9ENVipdjuT7XI,136
31
+ yaralyzer-1.0.1.dist-info/RECORD,,
@@ -1,30 +0,0 @@
1
- .yaralyzer.example,sha256=4QKFDDNvnAlT3NVS0eiM_Qed9Hxy4ZPQFJkye8lKYAk,3486
2
- CHANGELOG.md,sha256=XN0_isuQMJy03PATrBkBIPWyIXzUV1g5YG1FMThCves,2158
3
- yaralyzer/__init__.py,sha256=YItEM_QKbLUj-6QZg2ZINrTzPQZ1IHOjGgoxmRR2buA,2703
4
- yaralyzer/bytes_match.py,sha256=ShAxI_jZYElG1w-FJ9wNF-5SReL2uv-iJTiQQS3VTM0,8213
5
- yaralyzer/config.py,sha256=eRJ88wBFs1rfjOv4htI1Ye0LFCFfk4kGDiFHuqZfkX0,3730
6
- yaralyzer/decoding/bytes_decoder.py,sha256=lulfZZhYmo9ky2KpqBW-c9hs5_uhlaz0gatdtT_NYSY,7951
7
- yaralyzer/decoding/decoding_attempt.py,sha256=GAxMNOX7I_FsuzGWIelTWAECytLUJD-wpmUAuVe2bn0,7241
8
- yaralyzer/encoding_detection/character_encodings.py,sha256=zrOUgNXwrcXkeYSgdo09vsFPmNYsTkaHvq7YzzpbMsk,4395
9
- yaralyzer/encoding_detection/encoding_assessment.py,sha256=yMb1QSHS7JpNyL6jnZwt9Vq0Y6ueVStYZjMKyP6-f5A,2307
10
- yaralyzer/encoding_detection/encoding_detector.py,sha256=e_UtZi1Nh3ZRBOESEFtjYz0vJ_1cZjIJ5uWRuzM91oc,4732
11
- yaralyzer/helpers/bytes_helper.py,sha256=XemBmf9tXgjoN-X7AULHL1wKS1lkQR6XXGt_D2lMBY0,6915
12
- yaralyzer/helpers/dict_helper.py,sha256=h8Sg01qCJRKfZ0bmTYhLP6X5OVxMg-7CZryJIjcbw8E,211
13
- yaralyzer/helpers/file_helper.py,sha256=uf8dTOhRrJng0V36o7Mwk5t-L5gc4_uOaGj9F0s5OBA,1254
14
- yaralyzer/helpers/rich_text_helper.py,sha256=9Wc6WM625iKxAXRvxBkVzvszfcxb8YtqoQ6d7d8EqoQ,4218
15
- yaralyzer/helpers/string_helper.py,sha256=AT2_CAgpvtp8GiUSKLTiDoToDD3tBB9BbrlX-s2bL7o,932
16
- yaralyzer/output/decoding_attempts_table.py,sha256=cMY9eCXZHj0FfGxJ9uoM5cpdhQve-EtTRHv3fTHKJAo,3712
17
- yaralyzer/output/file_export.py,sha256=YfF5D8aHOUQHwV0akFaaSMafbhdhUakvipadpq6HZmk,2927
18
- yaralyzer/output/file_hashes_table.py,sha256=SnS2ip8dSeHoycQ0Ng3Gtpv9rXJSkKnvD2krTuhNg7s,1632
19
- yaralyzer/output/regex_match_metrics.py,sha256=deJPaVnhpy-AUX6PCE_jbPLIlmfIOtl-cEVWsiFp3KY,3003
20
- yaralyzer/output/rich_console.py,sha256=Botb8aec4_aRiPyaEkwrnhwERHE8a5-lk5KfgzXVlBE,4202
21
- yaralyzer/util/argument_parser.py,sha256=cVUe3lQCb6iFnP5leE-C0OeXkBPomw55Xi1MiD1Gl50,12776
22
- yaralyzer/util/logging.py,sha256=3qtLnCFbN8L1nTSwIQvxfcM5jfhIRWTFZj9XGQk74kc,4326
23
- yaralyzer/yara/yara_match.py,sha256=4_26eaJT9I0PULiCdxerQtX4TfAIwcT-B6GJociGM9A,5119
24
- yaralyzer/yara/yara_rule_builder.py,sha256=kAa3RBojM5GEaXDJjKZODAyx6yj34AlkOnQhACAFfZM,3021
25
- yaralyzer/yaralyzer.py,sha256=f1y8qST6GZHEWl7nDNEBWpQuYjnsJ8dm9nGPWqZ4Hkk,9417
26
- yaralyzer-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
27
- yaralyzer-1.0.0.dist-info/METADATA,sha256=HfE-vyCyOUs_QyM7saQAU9b4hiS9poBxuwLuFf8jsNM,10795
28
- yaralyzer-1.0.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
29
- yaralyzer-1.0.0.dist-info/entry_points.txt,sha256=7LnLJrNTfql0vuctjRWwp_ZD-BYvtv9ENVipdjuT7XI,136
30
- yaralyzer-1.0.0.dist-info/RECORD,,