fileshunter 0.1.0.20130725

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,218 @@
1
module FilesHunter

  # Decoders that are based on begin patterns (such as Magic Numbers) inherit from this class.
  # They then have to implement the following methods:
  # * *get_begin_pattern*: To give the begin pattern and eventual options
  # * *decode*: To decode data starting a given offset that matches the begin pattern
  # * *check_begin_pattern*: Provide a quick check of the begin pattern when found [optional]
  # They can then use the following DSL in the decode method:
  # * *found_relevant_data*: Indicate that we are certain the beginning of data of the given extension has been found
  # * *invalid_data*: Indicate the data read is invalid for our Decoder
  # * *truncated_data*: Indicate the data should have continued if it were to be complete. This can happen even in the middle of a stream, if the data has been corrupted.
  # * *progress*: Indicate the progression of the scan: everything before the progression is considered valid for the given extension (if found_relevant_data was called previously)
  # * *metadata*: Set metadata properties
  class BeginPatternDecoder < Decoder

    # Raised by decoders when data ends before the structure being decoded is complete.
    # Carries the offset at which the truncation was detected.
    class TruncatedDataError < RuntimeError

      attr_reader :cursor_truncated

      # Constructor
      #
      # Parameters::
      # * *message* (_String_): The error message
      # * *cursor_truncated* (_Fixnum_): The exceeding offset
      def initialize(message, cursor_truncated)
        super(message)
        @cursor_truncated = cursor_truncated
      end

    end

    # Raised by decoders when the data does not match the decoder's format at all.
    class InvalidDataError < RuntimeError
    end

    # Find segments from a given data
    # Entry point called by the framework: sets up the begin pattern and its
    # options, then delegates to foreach_begin_pattern, calling decode on each
    # candidate match.
    def find_segments
      @begin_pattern, options = get_begin_pattern
      log_debug "Pattern to find: #{@begin_pattern.inspect}"
      # Cache whether the subclass implements the optional quick check
      @has_to_check_begin_pattern = self.respond_to?(:check_begin_pattern)
      # Parse options (defaults below, overridden by get_begin_pattern's options)
      @max_regexp_size = 32
      @offset_inc = 1
      @begin_pattern_offset_in_segment = 0
      if (options != nil)
        @max_regexp_size = options[:max_regexp_size] if (options[:max_regexp_size] != nil)
        @offset_inc = options[:offset_inc] if (options[:offset_inc] != nil)
        @begin_pattern_offset_in_segment = options[:begin_pattern_offset_in_segment] if (options[:begin_pattern_offset_in_segment] != nil)
      end
      @metadata = {}
      @missing_previous_data = false
      foreach_begin_pattern do |begin_pattern_offset|
        next decode(begin_pattern_offset)
      end
    end

    protected

    # Mark the current decoding as being valid.
    # This is called when the decoder knows that it has valid data matching its specification.
    # Before calling this method, decoded data might still be junk.
    #
    # Parameters::
    # * *extension* (_Symbol_ or <em>list<Symbol></em>): Extension(s) this data belongs to
    def found_relevant_data(extension)
      @extension = extension
    end

    # Indicate that the data is invalid.
    # This will stop the decoding by raising an exception.
    #
    # Parameters::
    # * *message* (_String_): Message to give with the exception [default = '']
    def invalid_data(message = '')
      raise InvalidDataError.new(message)
    end

    # Indicate that the data is truncated.
    # This will stop the decoding by raising an exception.
    #
    # Parameters::
    # * *message* (_String_): Message to give with the exception [default = '']
    # * *cursor_truncated* (_Fixnum_): Cursor where data has been truncated [default = nil]
    def truncated_data(message = '', cursor_truncated = nil)
      # Fall back to the last progress offset, then to the end offset, when no explicit cursor is given
      raise TruncatedDataError.new(message, ((cursor_truncated == nil) ? ((@last_offset_to_be_decoded == nil) ? @end_offset : @last_offset_to_be_decoded) : cursor_truncated))
    end

    # Indicate that the data is missing previous data.
    def missing_previous_data
      @missing_previous_data = true
    end

    # Indicate progression in the decoding
    #
    # Parameters::
    # * *offset_to_be_decoded* (_Fixnum_): Next to be decoded
    def progress(offset_to_be_decoded)
      @last_offset_to_be_decoded = offset_to_be_decoded
      raise TruncatedDataError.new("Progression @#{offset_to_be_decoded} is over limit (#{@end_offset})", @end_offset) if (@last_offset_to_be_decoded > @end_offset)
      # keep_alive (from Decoder) allows the analyzer to cancel long parsings
      keep_alive
    end

    # Set metadata properties
    #
    # Parameters::
    # * *properties* (<em>map<Symbol,Object></em>): The properties to be set
    def metadata(properties)
      #log_debug "Add metadata: #{properties.inspect}"
      @metadata.merge!(properties)
    end

    private

    # Find a starting pattern and call a client block when it matches.
    # Client block decodes data, and calls the following methods to give progress on its decoding:
    # * *found_relevant_data*: Indicate that there is valid data to be decoded. If a TruncatedDataError occurs before this method is called, the data is ignored ; otherwise it will be marked as decoded but truncated to the end of the current segment.
    # * *progress*: Indicate progression
    # * *truncated_data*: Indicate that the data is truncated
    # * *invalid_data*: Indicate that the data is invalid
    #
    # Parameters::
    # * _Block_: Client code called when such a pattern matches. Its goal is to decode correctly at the given offset.
    #   * Parameters::
    #     * *begin_pattern_offset* (_Fixnum_): The offset of the pattern
    #     * *pattern_index* (_Fixnum_): The pattern index that matched the search. Always nil if begin_pattern is not a list.
    #   * Result::
    #     * *end_offset* (_Fixnum_): The ending offset (nil if could not be decoded). If the ending offset returned is greater than end_offset, segment will be considered as truncated.
    def foreach_begin_pattern
      # Loop to the end
      current_offset = @begin_offset
      while (current_offset < @end_offset)
        # Find the begin pattern
        log_debug "Find begin_pattern starting #{current_offset}..."
        # NOTE(review): @data.index presumably returns [offset, pattern_index] for
        # IOBlockReader — confirm against IOBlockReader's API.
        begin_pattern_offset, pattern_index = @data.index(@begin_pattern, current_offset, @max_regexp_size)
        if ((begin_pattern_offset == nil) or
            (begin_pattern_offset >= @end_offset))
          # No match
          current_offset = @end_offset
          log_debug "No more pattern."
        else
          if (begin_pattern_offset >= @begin_offset + @begin_pattern_offset_in_segment)
            # The pattern can sit some bytes inside the real segment start:
            # rewind to the actual segment beginning.
            begin_pattern_offset -= @begin_pattern_offset_in_segment
            log_debug "Found begin_pattern at #{begin_pattern_offset}."
            # We have a candidate
            # Try to decode it
            decoded_end_offset = nil
            truncated = false
            @missing_previous_data = false
            @extension = nil
            @last_offset_to_be_decoded = nil
            begin
              # If the decoder can perform additional tests, call them
              begin_pattern_valid = (@has_to_check_begin_pattern) ? check_begin_pattern(begin_pattern_offset, pattern_index) : true
              if begin_pattern_valid
                # Call the Decoder
                decoded_end_offset = yield(begin_pattern_offset, pattern_index)
              else
                log_debug 'Invalid pattern returned by the check.'
              end
            rescue InvalidDataError
              # If data was already validated, it means that the segment is truncated.
              log_debug "Got an invalid data exception while decoding data: #{$!}"
              #log_debug $!.backtrace.join("\n")
              # If not, drop everything.
              if ((@extension != nil) and
                  (@last_offset_to_be_decoded != nil))
                truncated = true
                # Use the last decoded offset as the truncated limit.
                decoded_end_offset = @last_offset_to_be_decoded
              else
                decoded_end_offset = nil
              end
            rescue TruncatedDataError, AccessAfterDataError
              # Data is truncated
              log_debug "Got a truncation exception while decoding data: #{$!}"
              #log_debug $!.backtrace.join("\n")
              # If we already got relevant data, mark it as truncated
              if (@extension != nil)
                truncated = true
                if ($!.is_a?(AccessAfterDataError))
                  decoded_end_offset = $!.exceeding_offset
                else
                  decoded_end_offset = $!.cursor_truncated
                end
              else
                decoded_end_offset = nil
              end
            rescue
              #log_err "Error while decoding data: #{$!}\n#{$!.backtrace.join("\n")}"
              #decoded_end_offset = nil
              # Unexpected errors are not swallowed: let them propagate
              raise
            end
            if ((decoded_end_offset == nil) or
                (@extension == nil))
              log_debug 'Invalid segment.'
              # Try searching from further: maybe another BEGIN_PATTERN might be found
              current_offset = begin_pattern_offset + @begin_pattern_offset_in_segment + @offset_inc
            else
              log_debug "Decoded segment in offsets [ #{begin_pattern_offset} - #{decoded_end_offset} ]"
              if (decoded_end_offset > @end_offset)
                log_debug "Decoded segment ends at #{decoded_end_offset} which is greater than #{@end_offset} => truncated"
                decoded_end_offset = @end_offset
                truncated = true
              end
              # Extract the segment and go on to the next
              found_segment(begin_pattern_offset, decoded_end_offset, @extension, truncated, @missing_previous_data, @metadata)
              current_offset = decoded_end_offset
            end
          else
            # Match too close to the data start to rewind by
            # @begin_pattern_offset_in_segment:
            # Try searching from further: maybe another BEGIN_PATTERN might be found
            current_offset = begin_pattern_offset + @offset_inc
          end
        end
      end
    end

  end

end
@@ -0,0 +1,66 @@
1
module FilesHunter

  # Generic Decoder base class.
  # All Decoders inherit from this class and have to implement the find_segments method, using @data, @begin_offset and @end_offset instance variables to parse data.
  # Here is the DSL Decoders can use in their find_segments method:
  # * *@data* (_IOBlockReader_): The data to be accessed
  # * *@begin_offset* (_Fixnum_): The begin offset
  # * *@end_offset* (_Fixnum_): The end offset
  # * *found_segment*: Method used to indicate a Segment was successfully parsed
  # * *keep_alive*: Method used to indicate progression
  class Decoder

    # Prepare for new search
    #
    # Parameters::
    # * *segments_analyzer* (_SegmentsAnalyzer_): The segments analyzer for which this Decoder is working
    # * *data* (_IOBlockReader_): Data being analyzed
    # * *begin_offset* (_Fixnum_): The begin offset
    # * *end_offset* (_Fixnum_): The end offset
    def setup(segments_analyzer, data, begin_offset, end_offset)
      # Reset the segment list first, then record the new search context
      @segments = []
      @segments_analyzer = segments_analyzer
      @data = data
      @begin_offset = begin_offset
      @end_offset = end_offset
    end

    # Return found segments since last setup
    #
    # Result::
    # * <em>list<Segment></em>: The list of segments
    def segments_found
      @segments
    end

    protected

    # Callback called by decoders to notify a Segment has been found successfully.
    # Clamps the segment to the analyzed window (marking it truncated if needed)
    # and reports decoding progress to the analyzer.
    #
    # Parameters::
    # * *segment_begin_offset* (_Fixnum_): The begin offset
    # * *segment_end_offset* (_Fixnum_): The end offset
    # * *extension* (_Symbol_ or <em>list<Symbol></em>): The extension (can be a list of possible extensions)
    # * *truncated* (_Boolean_): Is the data truncated in this segment?
    # * *missing_previous_data* (_Boolean_): Is some data missing before?
    # * *metadata* (<em>map<Symbol,Object></em>): Metadata associated to this segment (Decoder dependent)
    def found_segment(segment_begin_offset, segment_end_offset, extension, truncated, missing_previous_data, metadata)
      if segment_begin_offset < @begin_offset
        raise "Segment begin offset (#{segment_begin_offset}) is lower than data begin offset (#{@begin_offset})"
      end
      if segment_end_offset > @end_offset
        log_debug "Segment end offset (#{segment_end_offset}) is greater than data end offset (#{@end_offset}). Mark Segment as truncated."
        segment_end_offset = @end_offset
        truncated = true
      end
      new_segment = Segment.new(segment_begin_offset, segment_end_offset, extension, truncated, missing_previous_data, metadata)
      @segments << new_segment
      @segments_analyzer.add_bytes_decoded(segment_end_offset - segment_begin_offset)
    end

    # Indicate progression in the decoding.
    # This is used to eventually cancel the parsing.
    def keep_alive
      raise CancelParsingError.new('Parsing cancelled while decoding') if @segments_analyzer.parsing_cancelled
    end

  end

end
@@ -0,0 +1,50 @@
1
module FilesHunter

  module Decoders

    # Decoder for ASF containers (Advanced Systems Format: .asf/.wmv/.wma).
    # Matches the ASF Header Object GUID, then walks the mandatory Data object
    # and any trailing Index objects to compute the segment's ending offset.
    class ASF < BeginPatternDecoder

      # GUID of the ASF Header Object (magic number at the start of the file)
      BEGIN_PATTERN_ASF = "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C".force_encoding(Encoding::ASCII_8BIT)
      # GUID of the ASF Data Object, expected right after the Header Object
      ASF_DATA_GUID = "\x36\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C".force_encoding(Encoding::ASCII_8BIT)
      # GUIDs of the optional top-level Index objects that may follow the Data object
      ACCEPTABLE_INDEX_GUID = [
        "\x90\x08\x00\x33\xB1\xE5\xCF\x11\x89\xF4\x00\xA0\xC9\x03\x49\xCB".force_encoding(Encoding::ASCII_8BIT),
        "\xD3\x29\xE2\xD6\xDA\x35\xD1\x11\x90\x34\x00\xA0\xC9\x03\x49\xBE".force_encoding(Encoding::ASCII_8BIT),
        "\xF8\x03\xB1\xFE\xAD\x12\x64\x4C\x84\x0F\x2A\x1D\x2F\x7A\xD4\x8C".force_encoding(Encoding::ASCII_8BIT),
        "\xD0\x3F\xB7\x3C\x4A\x0C\x03\x48\x95\x3D\xED\xF7\xB6\x22\x8F\x0C".force_encoding(Encoding::ASCII_8BIT)
      ]

      # Give the begin pattern and its options
      #
      # Result::
      # * _String_: The begin pattern
      # * <em>map<Symbol,Object></em>: The pattern options
      def get_begin_pattern
        return BEGIN_PATTERN_ASF, { :offset_inc => 16 }
      end

      # Decode ASF data starting at the given offset
      #
      # Parameters::
      # * *offset* (_Fixnum_): Offset of the matched begin pattern
      # Result::
      # * _Fixnum_: The ending offset of the ASF segment
      def decode(offset)
        ending_offset = nil

        # Each ASF object is: 16-byte GUID followed by a 64-bit little-endian
        # size of the whole object. Skip over the Header Object.
        # Bug fix: the header size must be read at offset+16..offset+23; the
        # original code read @data[cursor+16..cursor+23] while the local
        # variable cursor was still nil (Ruby initializes a local being
        # assigned to nil within its own assignment expression), which raised
        # a NoMethodError on every decode.
        cursor = offset + BinData::Uint64le.read(@data[offset+16..offset+23])
        progress(cursor)
        # Should be on the DATA object
        invalid_data("@#{cursor} - Missing Data object in ASF. GUID does not match.") if (@data[cursor..cursor+15] != ASF_DATA_GUID)
        found_relevant_data(:asf)
        # Skip over the Data object using its own size field
        cursor += BinData::Uint64le.read(@data[cursor+16..cursor+23])
        progress(cursor)
        # Now cycle through optional Index objects
        while (ending_offset == nil)
          if (ACCEPTABLE_INDEX_GUID.include?(@data[cursor..cursor+15]))
            # There is an index object: skip it
            cursor += BinData::Uint64le.read(@data[cursor+16..cursor+23])
            progress(cursor)
            ending_offset = cursor if (cursor == @end_offset)
          else
            # Finished: no more index objects
            ending_offset = cursor
          end
        end

        return ending_offset
      end

    end

  end

end
@@ -0,0 +1,118 @@
1
module FilesHunter

  module Decoders

    # Decoder for BMP bitmap images.
    # Parses the BITMAPFILEHEADER + info header (v2/v3/v4 variants), skips the
    # color palette and pixel data, and returns the ending offset of the image.
    class BMP < BeginPatternDecoder

      # "BM" signature followed by 4 arbitrary bytes (file size) and the 4
      # reserved zero bytes of the BITMAPFILEHEADER ('n' forces binary encoding)
      BEGIN_PATTERN_BMP = Regexp.new("BM....\x00\x00\x00\x00", nil, 'n')

      PADDING_CHAR = "\x00".force_encoding(Encoding::ASCII_8BIT)

      # Give the begin pattern and its options
      def get_begin_pattern
        return BEGIN_PATTERN_BMP, { :offset_inc => 2, :max_regexp_size => 10 }
      end

      # Decode BMP data starting at the given offset
      #
      # Parameters::
      # * *offset* (_Fixnum_): Offset of the matched begin pattern
      # Result::
      # * _Fixnum_: The ending offset of the BMP segment
      def decode(offset)
        ending_offset = nil

        # Skip the 14-byte BITMAPFILEHEADER; the info header starts with its own size
        cursor = offset + 14
        header_size = BinData::Uint32le.read(@data[cursor..cursor+3])
        width = nil
        height = nil
        bpp = nil
        header_version = nil
        bitmap_size = nil
        compression = 0
        if (header_size == 12)
          # BMP v2 header (BITMAPCOREHEADER): 16-bit dimensions
          header_version = 2
          width = BinData::Sint16le.read(@data[cursor+4..cursor+5])
          height = BinData::Sint16le.read(@data[cursor+6..cursor+7])
          nbr_planes = BinData::Uint16le.read(@data[cursor+8..cursor+9])
          invalid_data("@#{cursor} - Number of planes (#{nbr_planes}) should always be 1") if (nbr_planes != 1)
          bpp = BinData::Uint16le.read(@data[cursor+10..cursor+11])
          invalid_data("@#{cursor} - Invalid BPP: #{bpp}") if (![1,4,8,16,24,32].include?(bpp))
          cursor += header_size
          # Color palette: 3 bytes (RGB) per palette entry
          cursor += 3*(1 << bpp) if (bpp != 24)
        else
          # BMP v3+ header (BITMAPINFOHEADER and later): 32-bit dimensions
          header_version = 3
          width = BinData::Uint32le.read(@data[cursor+4..cursor+7])
          height = BinData::Uint32le.read(@data[cursor+8..cursor+11])
          nbr_planes = BinData::Uint16le.read(@data[cursor+12..cursor+13])
          invalid_data("@#{cursor} - Number of planes (#{nbr_planes}) should always be 1") if (nbr_planes != 1)
          bpp = BinData::Uint16le.read(@data[cursor+14..cursor+15])
          invalid_data("@#{cursor} - Invalid BPP: #{bpp}") if (![1,4,8,16,24,32].include?(bpp))
          compression = BinData::Uint32le.read(@data[cursor+16..cursor+19])
          invalid_data("@#{cursor} - Invalid compression method: #{compression}") if (compression > 3)
          invalid_data("@#{cursor} - Invalid compression method: #{compression} for given bpp (#{bpp})") if ((compression != 3) and (bpp == 16))
          bitmap_size = BinData::Uint32le.read(@data[cursor+20..cursor+23])
          invalid_data("@#{cursor} - Empty bitmap size for compression method: #{compression}") if ((bitmap_size == 0) and ((compression == 1) or (compression == 2)))
          #ppm_horizontal = BinData::Uint32le.read(@data[cursor+24..cursor+27])
          #ppm_vertical = BinData::Uint32le.read(@data[cursor+28..cursor+31])
          nbr_colors_used = BinData::Uint32le.read(@data[cursor+32..cursor+35])
          invalid_data("@#{cursor} - Number of colors used specified (#{nbr_colors_used} whereas bpp is >= 16 (#{bpp})") if ((bpp >= 16) and (nbr_colors_used > 0))
          #nbr_colors_important = BinData::Uint32le.read(@data[cursor+36..cursor+39])
          if (header_size == 56)
            # BMP v? header (56-byte variant, sometimes called BITMAPV3INFOHEADER)
            header_version = 56
          elsif (header_size == 108)
            # BMP v4 header
            header_version = 4
            cstype = BinData::Uint32le.read(@data[cursor+56..cursor+59])
            invalid_data("@#{cursor} - Invalid cstype: #{cstype}") if (cstype > 2)
          end
          cursor += header_size
          # Color palette: 4 bytes (RGBA) per palette entry
          cursor += 4*(1 << bpp) if (bpp < 16)
          # BI_BITFIELDS masks after a v3 header (3 x 4 bytes)
          cursor += 12 if (((bpp == 16) or (bpp == 32)) and (compression == 3) and (header_version == 3))
        end
        progress(cursor)
        found_relevant_data(:bmp)
        metadata(
          :width => width,
          :height => height,
          :bpp => bpp,
          :header_version => header_version,
          :bitmap_size => bitmap_size,
          :compression => compression
        )
        log_debug "@#{cursor} - Decoding bitmap data: header_version=#{header_version} width=#{width} height=#{height} bpp=#{bpp} compression=#{compression} bitmap_size=#{bitmap_size}"
        if ((compression == 0) or
            (compression == 3))
          # Uncompressed (or bitfields) data: compute the size from the dimensions.
          # Compute the scanline size
          scanline_size = nil
          case bpp.to_i
          when 1, 4, 8
            # Several pixels per byte: round up to a whole byte
            scanline_size, extra = width.divmod(8/bpp)
            scanline_size += 1 if (extra > 0)
          when 16, 24, 32
            scanline_size = width * (bpp/8)
            # NOTE(review): doubling 16bpp scanlines for v4 headers looks
            # intentional here — confirm against the original format notes.
            scanline_size *= 2 if ((bpp == 16) and (header_version == 4))
          end
          # Scanlines are padded to 4-byte boundaries
          rest = scanline_size % 4
          scanline_size += 4 - rest if (rest > 0)
          computed_bitmap_size = scanline_size * height
          cursor += computed_bitmap_size
        else
          # RLE-compressed data: trust the bitmap_size field
          cursor += bitmap_size
        end
        progress(cursor)
        # Eventually pad to the next 32 bits with \x00
        rest = (cursor - offset) % 4
        if (rest > 0)
          # Check if we have padding
          possible_padding_size = 4 - rest
          cursor += possible_padding_size if ((cursor + possible_padding_size <= @end_offset) and (@data[cursor..cursor + possible_padding_size - 1] == PADDING_CHAR * possible_padding_size))
        end
        ending_offset = cursor

        return ending_offset
      end

    end

  end

end
@@ -0,0 +1,140 @@
1
module FilesHunter

  module Decoders

    # Decoder for Microsoft Cabinet (CAB) archives.
    # Walks the CFHEADER, CFFOLDER, CFFILE and CFDATA structures, and also
    # consumes an optional trailing Authenticode signature.
    class CAB < BeginPatternDecoder

      # "MSCF" signature followed by the 4 reserved zero bytes of CFHEADER
      BEGIN_PATTERN_CAB = "MSCF\x00\x00\x00\x00".force_encoding(Encoding::ASCII_8BIT)

      # NUL terminator of the C strings embedded in the cabinet
      END_STRING_TERMINATOR = "\x00".force_encoding(Encoding::ASCII_8BIT)

      # First 2 bytes of a DER-encoded PKCS#7 SEQUENCE (Authenticode blob)
      AUTHENTICODE_ID = "\x30\x82".force_encoding(Encoding::ASCII_8BIT)

      # Give the begin pattern and its options
      def get_begin_pattern
        return BEGIN_PATTERN_CAB, { :offset_inc => 4 }
      end

      # Decode CAB data starting at the given offset
      #
      # Parameters::
      # * *offset* (_Fixnum_): Offset of the matched begin pattern
      # Result::
      # * _Fixnum_: The ending offset of the CAB segment
      def decode(offset)

        # CFHEADER
        cabinet_size = BinData::Uint32le.read(@data[offset+8..offset+11])
        # Reserved fields at +12 and +20 must be zero
        invalid_data("@#{offset} - Invalid CAB header.") if (BinData::Uint32le.read(@data[offset+12..offset+15]) != 0)
        #cf_file_offset = BinData::Uint32le.read(@data[offset+16..offset+19])
        invalid_data("@#{offset} - Invalid CAB header.") if (BinData::Uint32le.read(@data[offset+20..offset+23]) != 0)
        minor_version = @data[offset+24].ord
        major_version = @data[offset+25].ord
        nbr_cf_folders = BinData::Uint16le.read(@data[offset+26..offset+27])
        nbr_cf_files = BinData::Uint16le.read(@data[offset+28..offset+29])
        flags = BinData::Uint16le.read(@data[offset+30..offset+31])
        flag_prev_cabinet = ((flags & 0b00000000_00000001) != 0)
        flag_next_cabinet = ((flags & 0b00000000_00000010) != 0)
        flag_reserve_present = ((flags & 0b00000000_00000100) != 0)
        set_id = BinData::Uint16le.read(@data[offset+32..offset+33])
        idx_cabinet = BinData::Uint16le.read(@data[offset+34..offset+35])
        cursor = offset + 36
        reserve_field_size_in_folder = 0
        reserve_field_size_in_data = 0
        if flag_reserve_present
          # Optional per-header/per-folder/per-datablock reserved areas
          reserve_field_size_in_header = BinData::Uint16le.read(@data[offset+36..offset+37])
          invalid_data("@#{offset} - Invalid reserve_field_size_in_header (#{reserve_field_size_in_header})") if (reserve_field_size_in_header > 60000)
          reserve_field_size_in_folder = @data[offset+38].ord
          reserve_field_size_in_data = @data[offset+39].ord
          cursor += 4 + reserve_field_size_in_header
        end
        if flag_prev_cabinet
          # Skip the NUL-terminated previous cabinet and disk names
          idx_terminator = @data.index(END_STRING_TERMINATOR, cursor)
          invalid_data("@#{cursor} - Unable to read previous cabinet name") if (idx_terminator == nil)
          cursor = idx_terminator + 1
          idx_terminator = @data.index(END_STRING_TERMINATOR, cursor)
          invalid_data("@#{cursor} - Unable to read previous disk name") if (idx_terminator == nil)
          cursor = idx_terminator + 1
        end
        if flag_next_cabinet
          # Skip the NUL-terminated next cabinet and disk names
          idx_terminator = @data.index(END_STRING_TERMINATOR, cursor)
          invalid_data("@#{cursor} - Unable to read next cabinet name") if (idx_terminator == nil)
          cursor = idx_terminator + 1
          idx_terminator = @data.index(END_STRING_TERMINATOR, cursor)
          invalid_data("@#{cursor} - Unable to read next disk name") if (idx_terminator == nil)
          cursor = idx_terminator + 1
        end
        progress(cursor)
        found_relevant_data([:cab, :msu, :mzz])
        metadata(
          :cabinet_size => cabinet_size,
          :minor_version => minor_version,
          :major_version => major_version,
          :nbr_cf_folders => nbr_cf_folders,
          :nbr_cf_files => nbr_cf_files,
          :set_id => set_id,
          :idx_cabinet => idx_cabinet,
          :flag_prev_cabinet => flag_prev_cabinet,
          :flag_next_cabinet => flag_next_cabinet,
          :flag_reserve_present => flag_reserve_present
        )

        # CFFOLDER
        # Remember each folder's first data block offset and block count for the CFDATA pass
        data_blocks = []
        log_debug "@#{cursor} - Beginning of #{nbr_cf_folders} CFFOLDER structures"
        nbr_cf_folders.times do |idx_cf_folder|
          first_data_offset = BinData::Uint32le.read(@data[cursor..cursor+3])
          nbr_data_blocks = BinData::Uint16le.read(@data[cursor+4..cursor+5])
          data_blocks << [ first_data_offset, nbr_data_blocks ]
          # compression_type = BinData::Uint16le.read(@data[cursor+6..cursor+7])
          cursor += 8 + reserve_field_size_in_folder
          progress(cursor)
        end

        # CFFILE
        log_debug "@#{cursor} - Beginning of #{nbr_cf_files} CFFILE structures"
        nbr_cf_files.times do |idx_cf_file|
          # file_size = BinData::Uint32le.read(@data[cursor..cursor+3])
          # file_offset = BinData::Uint32le.read(@data[cursor+4..cursor+7])
          # idx_file_in_folder = BinData::Uint16le.read(@data[cursor+8..cursor+9])
          # file_date = BinData::Uint16le.read(@data[cursor+10..cursor+11])
          # file_time = BinData::Uint16le.read(@data[cursor+12..cursor+13])
          # file_attrs = BinData::Uint16le.read(@data[cursor+14..cursor+15])
          cursor += 16
          # Skip the NUL-terminated file name
          idx_terminator = @data.index(END_STRING_TERMINATOR, cursor)
          invalid_data("@#{cursor} - Unable to read file name") if (idx_terminator == nil)
          cursor = idx_terminator + 1
          progress(cursor)
        end

        # CFDATA
        log_debug "@#{cursor} - Beginning of CFDATA"
        while (!data_blocks.empty?)
          # We should be on the first data block
          first_datablock_offset, nbr_datablocks = data_blocks.shift
          invalid_data("@#{cursor} - We should be on the next data block offset (#{offset+first_datablock_offset})") if (cursor-offset != first_datablock_offset)
          nbr_datablocks.times do |idx_datablock|
            # data_crc = BinData::Uint32le.read(@data[cursor..cursor+3])
            nbr_compressed_bytes = BinData::Uint16le.read(@data[cursor+4..cursor+5])
            # nbr_uncompressed_bytes = BinData::Uint16le.read(@data[cursor+6..cursor+7])
            cursor += 8 + reserve_field_size_in_data + nbr_compressed_bytes
            progress(cursor)
          end
        end
        invalid_data("@#{cursor} - We should be on at the end of the CAB file (#{offset+cabinet_size})") if (cursor-offset != cabinet_size)

        # Check if it is signed digitally using Authenticode
        if ((cursor+4 < @end_offset) and
            (@data[cursor..cursor+1] == AUTHENTICODE_ID))
          # Read the size (big-endian DER length following \x30\x82)
          authenticode_size = BinData::Uint16be.read(@data[cursor+2..cursor+3])
          log_debug "@#{cursor} - Found authenticode data of size #{authenticode_size}"
          cursor += 4 + authenticode_size
          # Eat eventually up to 4 "\x00" bytes
          while ((cursor < @end_offset) and
                 (@data[cursor] == "\x00"))
            cursor += 1
          end
        end

        return cursor
      end

    end

  end

end
@@ -0,0 +1,92 @@
1
module FilesHunter

  module Decoders

    # Decoder for CFBF (Compound File Binary Format / OLE2) documents:
    # .doc, .xls, .pps, .msi, etc.
    # Computes the file size from the MSAT/SAT sector tables, then scans
    # sector contents for tokens identifying the concrete file extension.
    class CFBF < BeginPatternDecoder

      # CFBF magic number followed by the 16 zero bytes of the header's CLSID
      BEGIN_PATTERN_CFBF = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00".force_encoding(Encoding::ASCII_8BIT)

      # Byte tokens searched in sector data to recognize the concrete extension
      # (some are UTF-16LE encoded, hence the interleaved \x00 bytes)
      KNOWN_EXTENSIONS = {
        'MSWordDoc'.force_encoding(Encoding::ASCII_8BIT) => :doc,
        "P\x00o\x00w\x00e\x00r\x00P\x00o\x00i\x00n\x00t\x00".force_encoding(Encoding::ASCII_8BIT) => :pps,
        'Microsoft Excel'.force_encoding(Encoding::ASCII_8BIT) => :xls,
        "C\x00a\x00t\x00a\x00l\x00o\x00g\x00".force_encoding(Encoding::ASCII_8BIT) => :db,
        'Install,MSI,Framework'.force_encoding(Encoding::ASCII_8BIT) => :msi
      }

      # Give the begin pattern and its options
      def get_begin_pattern
        return BEGIN_PATTERN_CFBF, { :offset_inc => 24 }
      end

      # Decode CFBF data starting at the given offset
      #
      # Parameters::
      # * *offset* (_Fixnum_): Offset of the matched begin pattern
      # Result::
      # * _Fixnum_: The ending offset of the CFBF segment
      def decode(offset)
        # Know if we are little or big-endian (byte-order mark at header offset 28)
        big_endian = (@data[offset+28..offset+29] == "\xFF\xFE")
        bindata32 = big_endian ? BinData::Uint32be : BinData::Uint32le
        bindata16 = big_endian ? BinData::Uint16be : BinData::Uint16le
        # Read sector size (stored as a power of 2)
        vector_size = 1 << bindata16.read(@data[offset+30..offset+31])

        # Count the number of sectors
        # Read the MSAT (first 109 entries, stored in the 512-byte header)
        msat = @data[offset+76..offset+511]
        found_relevant_data(:doc) # Default
        first_sector_offset = offset + 512
        # Check if there are additional MSAT sectors
        # (sector IDs >= 4294967292 (0xFFFFFFFC) are special end/free markers)
        next_msat_sector_id = bindata32.read(@data[offset+68..offset+71])
        while (next_msat_sector_id < 4294967292)
          # Read the MSAT (all of the sector except its last 4 bytes)
          msat.concat(@data[first_sector_offset+next_msat_sector_id*vector_size..first_sector_offset+(next_msat_sector_id+1)*vector_size-5])
          # The last sector ID is the next MSAT sector one
          next_msat_sector_id = bindata32.read(@data[first_sector_offset+(next_msat_sector_id+1)*vector_size-4..first_sector_offset+(next_msat_sector_id+1)*vector_size-1])
        end
        # Decode the MSAT and read each SAT sector
        sat_sector_ids = []
        log_debug "=== Size of MSAT: #{msat.size}"
        (msat.size / 4).times do |idx|
          sector_id = bindata32.read(msat[idx*4..idx*4+3])
          sat_sector_ids << sector_id if (sector_id < 4294967292)
        end
        # Read each SAT sector and get the maximum sector ID
        max_sector_id = -1
        sat_sector_ids.each do |container_sector_id|
          sector_offset = first_sector_offset + container_sector_id*vector_size
          (vector_size / 4).times do |idx|
            sector_id = bindata32.read(@data[sector_offset+idx*4..sector_offset+idx*4+3])
            if ((sector_id < 4294967292) and
                (sector_id > max_sector_id))
              max_sector_id = sector_id
            end
          end
        end
        # We got the number of sectors
        nbr_sectors = max_sector_id + 1
        log_debug "=== Number of sectors: #{nbr_sectors}"
        metadata(
          :msat_size => msat.size,
          :nbr_sectors => nbr_sectors
        )

        # Now find some info about the file extension
        # (scan each sector for one of the known tokens; stop at first match)
        found_extension = false
        nbr_sectors.times do |idx_sector|
          log_debug "=== Find extension @ sector #{idx_sector}"
          KNOWN_EXTENSIONS.each do |token, extension|
            if (@data[first_sector_offset+idx_sector*vector_size..first_sector_offset+(idx_sector+1)*vector_size-1].index(token) != nil)
              log_debug "=== Found extension #{extension}"
              found_relevant_data(extension)
              found_extension = true
              break
            end
          end
          break if found_extension
        end
        log_debug "@#{offset} - Unable to get extension from CFBF document." if (!found_extension)

        # File = 512-byte header + all sectors
        return first_sector_offset + nbr_sectors*vector_size
      end

    end

  end

end