RubyGems - format_parser - Versions diffs - 0.4.0 → 0.5.0 - Mend

format_parser 0.4.0 → 0.5.0

Files changed (19) hide show

checksums.yaml +5 -5
data/.gitignore +1 -1
data/README.md +15 -2
data/lib/archive.rb +3 -5
data/lib/format_parser.rb +4 -0
data/lib/format_parser/version.rb +1 -1
data/lib/io_utils.rb +3 -0
data/lib/parsers/flac_parser.rb +75 -0
data/lib/parsers/moov_parser.rb +17 -8
data/lib/parsers/moov_parser/decoder.rb +10 -1
data/lib/parsers/mp3_parser.rb +9 -0
data/lib/parsers/zip_parser.rb +11 -1
data/lib/parsers/zip_parser/file_reader.rb +24 -29
data/lib/parsers/zip_parser/office_formats.rb +51 -0
data/spec/esoteric_formats_spec.rb +10 -0
data/spec/parsers/flac_parser_spec.rb +63 -0
data/spec/parsers/moov_parser_spec.rb +39 -4
data/spec/parsers/zip_parser_spec.rb +24 -0
metadata +7 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
-  data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
+SHA256:
+  metadata.gz: 7ff294de8e6759d2705cabe93f548ffb6733a121410e1b3c9dc929de52356745
+  data.tar.gz: 663240675efd9e8e5425f27333098e8661ef5e8dbbbd38bab91fff1e605bbefc
 SHA512:
-  metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
-  data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
+  metadata.gz: ed7ea153adf28d2efb9352a880d9f4a146ebdad53e282f75d36c2dee2887740d727c25373d33c83dc8ab863e680e12175bb19109cbecf2347950a6fc0233c385
+  data.tar.gz: cf6d750264fdfe5a9520c3bb9a8a02b856ba825b709435661b3fcd614f118fc3709d3ba88a25bff0a142f1d03d4fc2389f63740606ff5df4e2db837977a66488

data/.gitignore CHANGED Viewed

@@ -10,4 +10,4 @@
 *.gem
 # rspec failure tracking
-.rspec_status
+.rspec_status

data/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
 ## Currently supported filetypes:
-`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, DPX, AIFF, WAV, FDX, MOV, MP4`
+`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, PDF, DPX, AIFF, WAV, FDX, MOV, MP4, M4A, FLAC, DOCX, PPTX, XLSX`
 ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
@@ -101,7 +101,7 @@ class MyParser
   def call(io)
     # ... do some parsing with `io`
     magic_bytes = io.read(4)
-    return unless magic_bytes != 'XBMP'
+    return unless magic_bytes == 'XBMP'
     # ... more parsing code
     # ...and return the FileInformation::Image object with the metadata.
     FormatParser::Image.new(
@@ -171,3 +171,16 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
 ### CR2
 - CR2 examples are downloaded from http://www.rawsamples.ch/ and are Creative Common Licensed.
+### FLAC
+- atc_fixture_vbr.flac is a converted version of the MP3 with the same name
+- c_11k16bitpcm.flac is a converted version of the WAV with the same name
+### M4A
+- fixture.m4a was created by one of the project maintainers and is MIT licensed
+### ZIP
+- The .zip fixture files have been created by the project maintainers
+### .docx
+- The .docx files were generated by the project maintainers

data/lib/archive.rb CHANGED Viewed

@@ -10,7 +10,9 @@ module FormatParser
       end
     end
-    NATURE = :archive
+    # Lots of Office and LibreOffice documents are in fact packaged into
+    # ZIPs, as are .epub files. We make `nature` customisable for this occasion
+    attr_accessor :nature
     # What filetype was recognized? Will contain a non-ambiguous symbol
     # referring to the file format. The symbol can be used as a filename
@@ -28,9 +30,5 @@ module FormatParser
     def initialize(**attributes)
       attributes.map { |(k, v)| public_send("#{k}=", v) }
     end
-    def nature
-      NATURE
-    end
   end
 end

data/lib/format_parser.rb CHANGED Viewed

@@ -93,6 +93,10 @@ module FormatParser
       rescue IOUtils::InvalidRead
         # There was not enough data for this parser to work on,
         # and it triggered an error
+      rescue IOUtils::MalformedFile
+        # Unexpected input was encountered during the parsing of
+        # a file. This might indicate either a malicious or a
+        # corrupted file.
       rescue ReadLimiter::BudgetExceeded
         # The parser tried to read too much - most likely the file structure
         # caused the parser to go off-track. Strictly speaking we should log this

data/lib/format_parser/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module FormatParser
-  VERSION = '0.4.0'
+  VERSION = '0.5.0'
 end

data/lib/io_utils.rb CHANGED Viewed

@@ -2,6 +2,9 @@ module FormatParser::IOUtils
   class InvalidRead < ArgumentError
   end
+  class MalformedFile < ArgumentError
+  end
   def safe_read(io, n)
     raise ArgumentError, 'Unbounded reads are not supported' if n.nil?
     buf = io.read(n)

data/lib/parsers/flac_parser.rb ADDED Viewed

@@ -0,0 +1,75 @@
+class FormatParser::FLACParser
+  include FormatParser::IOUtils
+  MAGIC_BYTES = 4
+  MAGIC_BYTE_STRING = 'fLaC'
+  BLOCK_HEADER_BYTES = 4
+  def bytestring_to_int(s)
+    s.unpack('B*')[0].to_i(2)
+  end
+  def call(io)
+    magic_bytes = safe_read(io, MAGIC_BYTES)
+    return unless magic_bytes == MAGIC_BYTE_STRING
+    # Skip info we don't need
+    safe_skip(io, BLOCK_HEADER_BYTES)
+    minimum_block_size = bytestring_to_int(safe_read(io, 2))
+    if minimum_block_size < 16
+      raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
+    end
+    maximum_block_size = bytestring_to_int(safe_read(io, 2))
+    if maximum_block_size < minimum_block_size
+      raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
+    end
+    minimum_frame_size = bytestring_to_int(safe_read(io, 3))
+    maximum_frame_size = bytestring_to_int(safe_read(io, 3))
+    # Audio info comes in irregularly sized (i.e. not 8-bit) chunks,
+    # so read total as bitstring and parse separately
+    audio_info = safe_read(io, 8).unpack('B*')[0]
+    # sample rate is 20 bits
+    sample_rate = audio_info.slice!(0..19).to_i(2)
+    raise MalformedFile, 'FLAC file sample rate must be larger than 0' unless sample_rate > 0
+    # Number of channels is 3 bits
+    # Header contains number of channels minus one, so add one
+    num_channels = audio_info.slice!(0..2).to_i(2) + 1
+    # Bits per sample is 5 bits
+    # Header contains number of bits per sample minus one, so add one
+    bits_per_sample = audio_info.slice!(0..4).to_i(2) + 1
+    # Total samples is 36 bits
+    total_samples = audio_info.slice!(0..35).to_i(2)
+    # Division is safe due to check above
+    duration = total_samples.to_f / sample_rate
+    FormatParser::Audio.new(
+      format: :flac,
+      num_audio_channels: num_channels,
+      audio_sample_rate_hz: sample_rate,
+      media_duration_seconds: duration,
+      media_duration_frames: total_samples,
+      intrinsics: {
+        bits_per_sample: bits_per_sample,
+        minimum_frame_size: minimum_frame_size,
+        maximum_frame_size: maximum_frame_size,
+        minimum_block_size: minimum_block_size,
+        maximum_block_size: maximum_block_size
+      }
+    )
+  end
+  FormatParser.register_parser self, natures: :audio, formats: :flac
+end

data/lib/parsers/moov_parser.rb CHANGED Viewed

@@ -52,19 +52,28 @@ class FormatParser::MOOVParser
       media_duration_s = duration / timescale.to_f
     end
-    FormatParser::Video.new(
-      format: format_from_moov_type(file_type),
-      width_px: width,
-      height_px: height,
-      media_duration_seconds: media_duration_s,
-      intrinsics: atom_tree,
-    )
+    # M4A only contains audio, while MP4 and friends can contain video.
+    if format_from_moov_type(file_type) == :m4a
+      FormatParser::Audio.new(
+        format: format_from_moov_type(file_type),
+        media_duration_seconds: media_duration_s,
+        intrinsics: atom_tree,
+      )
+    else
+      FormatParser::Video.new(
+        format: format_from_moov_type(file_type),
+        width_px: width,
+        height_px: height,
+        media_duration_seconds: media_duration_s,
+        intrinsics: atom_tree,
+      )
+    end
   end
   private
   def format_from_moov_type(file_type)
-    FTYP_MAP.fetch(file_type, :mov)
+    FTYP_MAP.fetch(file_type.downcase, :mov)
   end
   # An MPEG4/MOV/M4A will start with the "ftyp" atom. The atom must have a length

data/lib/parsers/moov_parser/decoder.rb CHANGED Viewed

@@ -181,6 +181,10 @@ class FormatParser::MOOVParser::Decoder
     }
   end
+  def parse_meta_atom(io, atom_size)
+    parse_hdlr_atom(io, atom_size)
+  end
   def parse_atom_fields_per_type(io, atom_size, atom_type)
     if respond_to?("parse_#{atom_type}_atom", true)
       send("parse_#{atom_type}_atom", io, atom_size)
@@ -189,6 +193,11 @@ class FormatParser::MOOVParser::Decoder
     end
   end
+  def parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
+    parse_atom_fields_per_type(io, atom_size_sans_header, atom_type)
+    extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type])
+  end
   # Recursive descent parser - will drill down to atoms which
   # we know are permitted to have leaf/branch atoms within itself,
   # and will attempt to recover the data fields for leaf atoms
@@ -215,7 +224,7 @@ class FormatParser::MOOVParser::Decoder
       atom_size_sans_header = atom_size - size_of_atom_type_and_size
       children, fields = if KNOWN_BRANCH_AND_LEAF_ATOM_TYPES.include?(atom_type)
-        parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type)
+        parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
       elsif KNOWN_BRANCH_ATOM_TYPES.include?(atom_type)
         [extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type]), nil]
       else # Assume leaf atom

data/lib/parsers/mp3_parser.rb CHANGED Viewed

@@ -23,7 +23,16 @@ class FormatParser::MP3Parser
   # Default frame size for mp3
   SAMPLES_PER_FRAME = 1152
+  # For some edge cases
+  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
   def call(io)
+    # Special case: some ZIPs (Office documents) were being detected as MP3s.
+    # To prevent that, we check for the PKZIP local entry header
+    # signature at the very start of the file
+    return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
+    io.seek(0)
     # Read the last 128 bytes which might contain ID3v1
     id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
     # Read the header bytes that might contain ID3v1

data/lib/parsers/zip_parser.rb CHANGED Viewed

@@ -1,17 +1,27 @@
 class FormatParser::ZIPParser
   require_relative 'zip_parser/file_reader'
+  require_relative 'zip_parser/office_formats'
+  include OfficeFormats
   def call(io)
     reader = FileReader.new
     entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
+    filenames_set = Set.new
     entries_archive = entries.map do |ze|
       ft = directory?(ze) ? :directory : :file
       decoded_filename = decode_filename(ze)
+      filenames_set << decoded_filename
       FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
     end
-    FormatParser::Archive.new(format: :zip, entries: entries_archive)
+    if office_document?(filenames_set)
+      office_format = office_file_format_from_entry_set(filenames_set)
+      FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
+    else
+      FormatParser::Archive.new(nature: :archive,  format: :zip, entries: entries_archive)
+    end
   rescue FileReader::Error
     # This is not a ZIP, or a broken ZIP.
     return

data/lib/parsers/zip_parser/file_reader.rb CHANGED Viewed

@@ -159,7 +159,6 @@ class FormatParser::ZIPParser::FileReader
   def read_zip_structure(io:)
     zip_file_size = io.size
     eocd_offset = get_eocd_offset(io, zip_file_size)
     zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
     num_files, cdir_location, cdir_size =
       if zip64_end_of_cdir_location
@@ -345,39 +344,35 @@ class FormatParser::ZIPParser::FileReader
     eocd_offset
   end
-  # This is tricky. Essentially, we have to scan the maximum possible number
-  # of bytes (that the EOCD can theoretically occupy including the comment),
-  # and we have to find a combination of:
-  #   [EOCD signature, <some ZIP medatata>, comment byte size, the comment of
-  # that size, eof].
-  # The only way I could find to do this was with a sliding window, but
-  # there probably is a better way.
+  def all_indices_of_substr_in_str(of_substring, in_string)
+    last_i = 0
+    found_at_indices = []
+    while last_i = in_string.index(of_substring, last_i)
+      found_at_indices << last_i
+      last_i += of_substring.bytesize
+    end
+    found_at_indices
+  end
   def locate_eocd_signature(in_str)
-    # We have to scan from the _very_ tail. We read the very minimum size
-    # the EOCD record can have (up to and including the comment size), using
-    # a sliding window. Once our end offset matches the comment size we found our
-    # EOCD marker.
+    eocd_signature = [0x06054b50].pack('V')
     unpack_pattern = 'VvvvvVVv'
     minimum_record_size = 22
-    end_location = minimum_record_size * -1
-    loop do
-      # If the window is nil, we have rolled off the start of the string, nothing to do here.
-      # We use negative values because if we used positive slice indices
-      # we would have to detect the rollover ourselves
-      break unless window = in_str[end_location, minimum_record_size]
-      window_location = in_str.bytesize + end_location
-      unpacked = window.unpack(unpack_pattern)
-      # If we found the signarue, pick up the comment size, and check if the size of the window
-      # plus that comment size is where we are in the string. If we are - bingo.
-      if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
-        assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
-        # if the comment size is where we should be at - we found our EOCD
-        return assumed_eocd_location if assumed_eocd_location == window_location
+    str_size = in_str.bytesize
+    indices = all_indices_of_substr_in_str(eocd_signature, in_str)
+    indices.each do |check_at|
+      maybe_record = in_str[check_at..str_size]
+      # If the record is smaller than the minimum - we will never recover anything
+      break if maybe_record.bytesize < minimum_record_size
+      signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
+      # Check the only condition for the match
+      if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
+        return check_at # Found the EOCD marker location
       end
-      end_location -= 1 # Shift the window back, by one byte, and try again.
     end
+    # If we haven't caught anything, return nil deliberately instead of returning the last statement
+    nil
   end
   # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the

data/lib/parsers/zip_parser/office_formats.rb ADDED Viewed

@@ -0,0 +1,51 @@
+# Based on an unscientific sample of 63 documents I could find on my hard drive,
+# all docx/pptx/xlsx files contain, at the minimum, the following files:
+#
+#   [Content_Types].xml
+#   _rels/.rels
+#   docProps/core.xml
+#   docProps/app.xml
+#
+# Additionally, per file type, they contain the following:
+#
+#   word/document.xml
+#   xl/workbook.xml
+#   ppt/presentation.xml
+#
+# These are sufficient to say with certainty that a ZIP is in fact an Office document.
+# Also that unscientific sample revealed that I came to dislike MS Office so much as to
+# only have 63 documents on my entire workstation.
+#
+# We do not perform the actual _decoding_ of the Office documents here, because to read
+# their contents we need to:
+#
+# * inflate the compressed part files (potential for deflate bombs)
+# * parse the document XML (potential for XML parser exploitation)
+#
+# which are real threats and require adequate mitigation. For our purposes the
+# token detection of specific filenames should be enough to say with certainty
+# that a document _is_ an Office document, and not just a ZIP.
+module FormatParser::ZIPParser::OfficeFormats
+  OFFICE_MARKER_FILES = Set.new([
+    '[Content_Types].xml',
+    '_rels/.rels',
+    'docProps/core.xml',
+    'docProps/app.xml',
+  ])
+  def office_document?(filenames_set)
+    OFFICE_MARKER_FILES.subset?(filenames_set)
+  end
+  def office_file_format_from_entry_set(filenames_set)
+    if filenames_set.include?('word/document.xml')
+      :docx
+    elsif filenames_set.include?('xl/workbook.xml')
+      :xlsx
+    elsif filenames_set.include?('ppt/presentation.xml')
+      :pptx
+    else
+      :unknown
+    end
+  end
+end

data/spec/esoteric_formats_spec.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require 'spec_helper'
+describe 'Parsing esoteric files and files causing ambiguous detection' do
+  it 'correctly parses the test .docx files as Office docs' do
+    docx_path = fixtures_dir + '/ZIP/10.docx'
+    result = FormatParser.parse(File.open(docx_path, 'rb'))
+    expect(result).not_to be_nil
+    expect(result.nature).to eq(:document)
+  end
+end

data/spec/parsers/flac_parser_spec.rb ADDED Viewed

@@ -0,0 +1,63 @@
+require 'spec_helper'
+describe FormatParser::FLACParser do
+  it 'decodes and estimates duration for the atc_fixture_vbr FLAC File' do
+    fpath = fixtures_dir + 'FLAC/atc_fixture_vbr.flac'
+    parsed = subject.call(File.open(fpath, 'rb'))
+    expect(parsed).not_to be_nil
+    expect(parsed.nature).to eq(:audio)
+    expect(parsed.format).to eq(:flac)
+    expect(parsed.num_audio_channels).to eq(2)
+    expect(parsed.audio_sample_rate_hz).to eq(44100)
+    expect(parsed.intrinsics).not_to be_nil
+    expect(parsed.media_duration_frames).to eq(33810)
+    expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
+  end
+  it 'decodes and estimates duration for the 16bit FLAC File' do
+    fpath = fixtures_dir + 'FLAC/c_11k16bitpcm.flac'
+    parsed = subject.call(File.open(fpath, 'rb'))
+    expect(parsed).not_to be_nil
+    expect(parsed.nature).to eq(:audio)
+    expect(parsed.format).to eq(:flac)
+    expect(parsed.intrinsics[:bits_per_sample]).to eq(16)
+    expect(parsed.num_audio_channels).to eq(1)
+    expect(parsed.audio_sample_rate_hz).to eq(11025)
+    expect(parsed.media_duration_frames).to eq(152267)
+    expect(parsed.media_duration_seconds).to be_within(0.01).of(13.81)
+  end
+  it 'raises error on parsing an invalid file' do
+    fpath = fixtures_dir + 'FLAC/invalid.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::InvalidRead)
+  end
+  it 'raises error on parsing a file with an invalid block size' do
+    fpath = fixtures_dir + 'FLAC/invalid_minimum_block_size.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::MalformedFile)
+    fpath = fixtures_dir + 'FLAC/invalid_maximum_block_size.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::MalformedFile)
+  end
+  it 'raises an error when sample rate is 0' do
+    fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::MalformedFile)
+  end
+end

data/spec/parsers/moov_parser_spec.rb CHANGED Viewed

@@ -29,9 +29,37 @@ describe FormatParser::MOOVParser do
     end
   end
-  Dir.glob(fixtures_dir + '/MOOV/**/*.*').sort.each do |moov_path|
-    it "is able to parse #{File.basename(moov_path)}" do
-      result = subject.call(File.open(moov_path, 'rb'))
+  Dir.glob(fixtures_dir + '/MOOV/**/*.m4a').sort.each do |m4a_path|
+    it "is able to parse #{File.basename(m4a_path)}" do
+      result = subject.call(File.open(m4a_path, 'rb'))
+      expect(result).not_to be_nil
+      expect(result.nature).to eq(:audio)
+      expect(result.media_duration_seconds).to be_kind_of(Float)
+      expect(result.media_duration_seconds).to be > 0
+      expect(result.intrinsics).not_to be_nil
+    end
+  end
+  Dir.glob(fixtures_dir + '/MOOV/**/*.mov').sort.each do |mov_path|
+    it "is able to parse #{File.basename(mov_path)}" do
+      result = subject.call(File.open(mov_path, 'rb'))
+      expect(result).not_to be_nil
+      expect(result.nature).to eq(:video)
+      expect(result.width_px).to be > 0
+      expect(result.height_px).to be > 0
+      expect(result.media_duration_seconds).to be_kind_of(Float)
+      expect(result.media_duration_seconds).to be > 0
+      expect(result.intrinsics).not_to be_nil
+    end
+  end
+  Dir.glob(fixtures_dir + '/MOOV/**/*.mp4').sort.each do |mp4_path|
+    it "is able to parse #{File.basename(mp4_path)}" do
+      result = subject.call(File.open(mp4_path, 'rb'))
       expect(result).not_to be_nil
       expect(result.nature).to eq(:video)
@@ -44,7 +72,14 @@ describe FormatParser::MOOVParser do
     end
   end
-  it 'parses an M4A file and provides the necessary metadata'
+  it 'parses an M4A file and provides the necessary metadata' do
+    m4a_path = fixtures_dir + '/MOOV/M4A/fixture.m4a'
+    result = subject.call(File.open(m4a_path, 'rb'))
+    expect(result).not_to be_nil
+    expect(result.nature).to eq(:audio)
+    expect(result.format).to eq(:m4a)
+  end
   it 'parses a MOV file and provides the necessary metadata' do
     mov_path = fixtures_dir + '/MOOV/MOV/Test_Circular_ProRes422.mov'

data/spec/parsers/zip_parser_spec.rb CHANGED Viewed

@@ -46,6 +46,30 @@ describe FormatParser::ZIPParser do
     expect(dir_entry.type).to eq(:directory)
   end
+  it 'correctly identifies Word documents' do
+    fixture_path = fixtures_dir + '/ZIP/10.docx'
+    fi_io = File.open(fixture_path, 'rb')
+    result = subject.call(fi_io)
+    expect(result.nature).to eq(:document)
+    expect(result.format).to eq(:docx)
+    fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
+    fi_io = File.open(fixture_path, 'rb')
+    result = subject.call(fi_io)
+    expect(result.nature).to eq(:document)
+    expect(result.format).to eq(:docx)
+  end
+  it 'is able to handle specific fuzzed input' do
+    r = Random.new(354)
+    1024.times do
+      random_blob = StringIO.new(r.bytes(512 * 1024))
+      subject.call(random_blob) # If there is an error in one of the parsers the example will raise too
+    end
+  end
   it 'returns a result that has a usable JSON representation' do
     fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
     fi_io = File.open(fixture_path, 'rb')

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: format_parser
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Noah Berman
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-03-30 00:00:00.000000000 Z
+date: 2018-04-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ks
@@ -174,6 +174,7 @@ files:
 - lib/parsers/dpx_parser.rb
 - lib/parsers/exif_parser.rb
 - lib/parsers/fdx_parser.rb
+- lib/parsers/flac_parser.rb
 - lib/parsers/gif_parser.rb
 - lib/parsers/jpeg_parser.rb
 - lib/parsers/moov_parser.rb
@@ -188,11 +189,13 @@ files:
 - lib/parsers/wav_parser.rb
 - lib/parsers/zip_parser.rb
 - lib/parsers/zip_parser/file_reader.rb
+- lib/parsers/zip_parser/office_formats.rb
 - lib/read_limiter.rb
 - lib/remote_io.rb
 - lib/video.rb
 - spec/attributes_json_spec.rb
 - spec/care_spec.rb
+- spec/esoteric_formats_spec.rb
 - spec/file_information_spec.rb
 - spec/format_parser_spec.rb
 - spec/io_utils_spec.rb
@@ -201,6 +204,7 @@ files:
 - spec/parsers/dpx_parser_spec.rb
 - spec/parsers/exif_parser_spec.rb
 - spec/parsers/fdx_parser_spec.rb
+- spec/parsers/flac_parser_spec.rb
 - spec/parsers/gif_parser_spec.rb
 - spec/parsers/jpeg_parser_spec.rb
 - spec/parsers/moov_parser_spec.rb
@@ -236,7 +240,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.2
+rubygems_version: 2.7.3
 signing_key:
 specification_version: 4
 summary: A library for efficient parsing of file metadata