RubyGems - format_parser - Versions diffs - 0.4.0 → 0.5.0 - Mend

format_parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +5 -5
data/.gitignore +1 -1
data/README.md +15 -2
data/lib/archive.rb +3 -5
data/lib/format_parser.rb +4 -0
data/lib/format_parser/version.rb +1 -1
data/lib/io_utils.rb +3 -0
data/lib/parsers/flac_parser.rb +75 -0
data/lib/parsers/moov_parser.rb +17 -8
data/lib/parsers/moov_parser/decoder.rb +10 -1
data/lib/parsers/mp3_parser.rb +9 -0
data/lib/parsers/zip_parser.rb +11 -1
data/lib/parsers/zip_parser/file_reader.rb +24 -29
data/lib/parsers/zip_parser/office_formats.rb +51 -0
data/spec/esoteric_formats_spec.rb +10 -0
data/spec/parsers/flac_parser_spec.rb +63 -0
data/spec/parsers/moov_parser_spec.rb +39 -4
data/spec/parsers/zip_parser_spec.rb +24 -0
metadata +7 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
-  data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
+SHA256:
+  metadata.gz: 7ff294de8e6759d2705cabe93f548ffb6733a121410e1b3c9dc929de52356745
+  data.tar.gz: 663240675efd9e8e5425f27333098e8661ef5e8dbbbd38bab91fff1e605bbefc
 SHA512:
-  metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
-  data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
+  metadata.gz: ed7ea153adf28d2efb9352a880d9f4a146ebdad53e282f75d36c2dee2887740d727c25373d33c83dc8ab863e680e12175bb19109cbecf2347950a6fc0233c385
+  data.tar.gz: cf6d750264fdfe5a9520c3bb9a8a02b856ba825b709435661b3fcd614f118fc3709d3ba88a25bff0a142f1d03d4fc2389f63740606ff5df4e2db837977a66488

data/.gitignore CHANGED Viewed

@@ -10,4 +10,4 @@
 *.gem
 # rspec failure tracking
-.rspec_status
+.rspec_status

data/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
 ## Currently supported filetypes:
-`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, DPX, AIFF, WAV, FDX, MOV, MP4`
+`TIFF, CR2, PSD, PNG, MP3, JPEG, GIF, PDF, DPX, AIFF, WAV, FDX, MOV, MP4, M4A, FLAC, DOCX, PPTX, XLSX`
 ...with [more](https://github.com/WeTransfer/format_parser/issues?q=is%3Aissue+is%3Aopen+label%3Aformats) on the way!
@@ -101,7 +101,7 @@ class MyParser
   def call(io)
     # ... do some parsing with `io`
     magic_bytes = io.read(4)
-    return unless magic_bytes != 'XBMP'
+    return unless magic_bytes == 'XBMP'
     # ... more parsing code
     # ...and return the FileInformation::Image object with the metadata.
     FormatParser::Image.new(
@@ -171,3 +171,16 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
 ### CR2
 - CR2 examples are downloaded from http://www.rawsamples.ch/ and are Creative Common Licensed.
+### FLAC
+- atc_fixture_vbr.flac is a converted version of the MP3 with the same name
+- c_11k16bitpcm.flac is a converted version of the WAV with the same name
+### M4A
+- fixture.m4a was created by one of the project maintainers and is MIT licensed
+### ZIP
+- The .zip fixture files have been created by the project maintainers
+### .docx
+- The .docx files were generated by the project maintainers

data/lib/archive.rb CHANGED Viewed

@@ -10,7 +10,9 @@ module FormatParser
       end
     end
-    NATURE = :archive
+    # Lots of Office and LibreOffice documents are in fact packaged into
+    # ZIPs, as are .epub files. We make `nature` customisable for this occasion
+    attr_accessor :nature
     # What filetype was recognized? Will contain a non-ambiguous symbol
     # referring to the file format. The symbol can be used as a filename
@@ -28,9 +30,5 @@ module FormatParser
     def initialize(**attributes)
       attributes.map { |(k, v)| public_send("#{k}=", v) }
     end
-    def nature
-      NATURE
-    end
   end
 end

data/lib/format_parser.rb CHANGED Viewed

@@ -93,6 +93,10 @@ module FormatParser
       rescue IOUtils::InvalidRead
         # There was not enough data for this parser to work on,
         # and it triggered an error
+      rescue IOUtils::MalformedFile
+        # Unexpected input was encountered during the parsing of
+        # a file. This might indicate either a malicious or a
+        # corrupted file.
       rescue ReadLimiter::BudgetExceeded
         # The parser tried to read too much - most likely the file structure
         # caused the parser to go off-track. Strictly speaking we should log this

data/lib/format_parser/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module FormatParser
-  VERSION = '0.4.0'
+  VERSION = '0.5.0'
 end

data/lib/io_utils.rb CHANGED Viewed

@@ -2,6 +2,9 @@ module FormatParser::IOUtils
   class InvalidRead < ArgumentError
   end
+  class MalformedFile < ArgumentError
+  end
   def safe_read(io, n)
     raise ArgumentError, 'Unbounded reads are not supported' if n.nil?
     buf = io.read(n)

data/lib/parsers/flac_parser.rb ADDED Viewed

@@ -0,0 +1,75 @@
+class FormatParser::FLACParser
+  include FormatParser::IOUtils
+  MAGIC_BYTES = 4
+  MAGIC_BYTE_STRING = 'fLaC'
+  BLOCK_HEADER_BYTES = 4
+  def bytestring_to_int(s)
+    s.unpack('B*')[0].to_i(2)
+  end
+  def call(io)
+    magic_bytes = safe_read(io, MAGIC_BYTES)
+    return unless magic_bytes == MAGIC_BYTE_STRING
+    # Skip info we don't need
+    safe_skip(io, BLOCK_HEADER_BYTES)
+    minimum_block_size = bytestring_to_int(safe_read(io, 2))
+    if minimum_block_size < 16
+      raise MalformedFile, 'FLAC file minimum block size must be larger than 16'
+    end
+    maximum_block_size = bytestring_to_int(safe_read(io, 2))
+    if maximum_block_size < minimum_block_size
+      raise MalformedFile, 'FLAC file maximum block size must be equal to or larger than minimum block size'
+    end
+    minimum_frame_size = bytestring_to_int(safe_read(io, 3))
+    maximum_frame_size = bytestring_to_int(safe_read(io, 3))
+    # Audio info comes in irregularly sized (i.e. not 8-bit) chunks,
+    # so read total as bitstring and parse separately
+    audio_info = safe_read(io, 8).unpack('B*')[0]
+    # sample rate is 20 bits
+    sample_rate = audio_info.slice!(0..19).to_i(2)
+    raise MalformedFile, 'FLAC file sample rate must be larger than 0' unless sample_rate > 0
+    # Number of channels is 3 bits
+    # Header contains number of channels minus one, so add one
+    num_channels = audio_info.slice!(0..2).to_i(2) + 1
+    # Bits per sample is 5 bits
+    # Header contains number of bits per sample minus one, so add one
+    bits_per_sample = audio_info.slice!(0..4).to_i(2) + 1
+    # Total samples is 36 bits
+    total_samples = audio_info.slice!(0..35).to_i(2)
+    # Division is safe due to check above
+    duration = total_samples.to_f / sample_rate
+    FormatParser::Audio.new(
+      format: :flac,
+      num_audio_channels: num_channels,
+      audio_sample_rate_hz: sample_rate,
+      media_duration_seconds: duration,
+      media_duration_frames: total_samples,
+      intrinsics: {
+        bits_per_sample: bits_per_sample,
+        minimum_frame_size: minimum_frame_size,
+        maximum_frame_size: maximum_frame_size,
+        minimum_block_size: minimum_block_size,
+        maximum_block_size: maximum_block_size
+      }
+    )
+  end
+  FormatParser.register_parser self, natures: :audio, formats: :flac
+end

data/lib/parsers/moov_parser.rb CHANGED Viewed

@@ -52,19 +52,28 @@ class FormatParser::MOOVParser
       media_duration_s = duration / timescale.to_f
     end
-    FormatParser::Video.new(
-      format: format_from_moov_type(file_type),
-      width_px: width,
-      height_px: height,
-      media_duration_seconds: media_duration_s,
-      intrinsics: atom_tree,
-    )
+    # M4A only contains audio, while MP4 and friends can contain video.
+    if format_from_moov_type(file_type) == :m4a
+      FormatParser::Audio.new(
+        format: format_from_moov_type(file_type),
+        media_duration_seconds: media_duration_s,
+        intrinsics: atom_tree,
+      )
+    else
+      FormatParser::Video.new(
+        format: format_from_moov_type(file_type),
+        width_px: width,
+        height_px: height,
+        media_duration_seconds: media_duration_s,
+        intrinsics: atom_tree,
+      )
+    end
   end
   private
   def format_from_moov_type(file_type)
-    FTYP_MAP.fetch(file_type, :mov)
+    FTYP_MAP.fetch(file_type.downcase, :mov)
   end
   # An MPEG4/MOV/M4A will start with the "ftyp" atom. The atom must have a length

data/lib/parsers/moov_parser/decoder.rb CHANGED Viewed

@@ -181,6 +181,10 @@ class FormatParser::MOOVParser::Decoder
     }
   end
+  def parse_meta_atom(io, atom_size)
+    parse_hdlr_atom(io, atom_size)
+  end
   def parse_atom_fields_per_type(io, atom_size, atom_type)
     if respond_to?("parse_#{atom_type}_atom", true)
       send("parse_#{atom_type}_atom", io, atom_size)
@@ -189,6 +193,11 @@ class FormatParser::MOOVParser::Decoder
     end
   end
+  def parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
+    parse_atom_fields_per_type(io, atom_size_sans_header, atom_type)
+    extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type])
+  end
   # Recursive descent parser - will drill down to atoms which
   # we know are permitted to have leaf/branch atoms within itself,
   # and will attempt to recover the data fields for leaf atoms
@@ -215,7 +224,7 @@ class FormatParser::MOOVParser::Decoder
       atom_size_sans_header = atom_size - size_of_atom_type_and_size
       children, fields = if KNOWN_BRANCH_AND_LEAF_ATOM_TYPES.include?(atom_type)
-        parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type)
+        parse_atom_children_and_data_fields(io, atom_size_sans_header, atom_type, current_branch)
       elsif KNOWN_BRANCH_ATOM_TYPES.include?(atom_type)
         [extract_atom_stream(io, atom_size_sans_header, current_branch + [atom_type]), nil]
       else # Assume leaf atom

data/lib/parsers/mp3_parser.rb CHANGED Viewed

@@ -23,7 +23,16 @@ class FormatParser::MP3Parser
   # Default frame size for mp3
   SAMPLES_PER_FRAME = 1152
+  # For some edge cases
+  ZIP_LOCAL_ENTRY_SIGNATURE = "PK\x03\x04\x14\x00".b
   def call(io)
+    # Special case: some ZIPs (Office documents) were being detected as MP3s.
+    # To avoid having that happen, we check for the PKZIP signature -
+    # local entry header signature - at the very start of the file
+    return if io.read(6) == ZIP_LOCAL_ENTRY_SIGNATURE
+    io.seek(0)
     # Read the last 128 bytes which might contain ID3v1
     id3_v1 = ID3V1.attempt_id3_v1_extraction(io)
     # Read the header bytes that might contain ID3v1

data/lib/parsers/zip_parser.rb CHANGED Viewed

@@ -1,17 +1,27 @@
 class FormatParser::ZIPParser
   require_relative 'zip_parser/file_reader'
+  require_relative 'zip_parser/office_formats'
+  include OfficeFormats
   def call(io)
     reader = FileReader.new
     entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
+    filenames_set = Set.new
     entries_archive = entries.map do |ze|
       ft = directory?(ze) ? :directory : :file
       decoded_filename = decode_filename(ze)
+      filenames_set << decoded_filename
       FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
     end
-    FormatParser::Archive.new(format: :zip, entries: entries_archive)
+    if office_document?(filenames_set)
+      office_format = office_file_format_from_entry_set(filenames_set)
+      FormatParser::Archive.new(nature: :document, format: office_format, entries: entries_archive)
+    else
+      FormatParser::Archive.new(nature: :archive,  format: :zip, entries: entries_archive)
+    end
   rescue FileReader::Error
     # This is not a ZIP, or a broken ZIP.
     return

data/lib/parsers/zip_parser/file_reader.rb CHANGED Viewed

@@ -159,7 +159,6 @@ class FormatParser::ZIPParser::FileReader
   def read_zip_structure(io:)
     zip_file_size = io.size
     eocd_offset = get_eocd_offset(io, zip_file_size)
     zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
     num_files, cdir_location, cdir_size =
       if zip64_end_of_cdir_location
@@ -345,39 +344,35 @@ class FormatParser::ZIPParser::FileReader
     eocd_offset
   end
-  # This is tricky. Essentially, we have to scan the maximum possible number
-  # of bytes (that the EOCD can theoretically occupy including the comment),
-  # and we have to find a combination of:
-  #   [EOCD signature, <some ZIP medatata>, comment byte size, the comment of
-  # that size, eof].
-  # The only way I could find to do this was with a sliding window, but
-  # there probably is a better way.
+  def all_indices_of_substr_in_str(of_substring, in_string)
+    last_i = 0
+    found_at_indices = []
+    while last_i = in_string.index(of_substring, last_i)
+      found_at_indices << last_i
+      last_i += of_substring.bytesize
+    end
+    found_at_indices
+  end
   def locate_eocd_signature(in_str)
-    # We have to scan from the _very_ tail. We read the very minimum size
-    # the EOCD record can have (up to and including the comment size), using
-    # a sliding window. Once our end offset matches the comment size we found our
-    # EOCD marker.
+    eocd_signature = [0x06054b50].pack('V')
     unpack_pattern = 'VvvvvVVv'
     minimum_record_size = 22
-    end_location = minimum_record_size * -1
-    loop do
-      # If the window is nil, we have rolled off the start of the string, nothing to do here.
-      # We use negative values because if we used positive slice indices
-      # we would have to detect the rollover ourselves
-      break unless window = in_str[end_location, minimum_record_size]
-      window_location = in_str.bytesize + end_location
-      unpacked = window.unpack(unpack_pattern)
-      # If we found the signarue, pick up the comment size, and check if the size of the window
-      # plus that comment size is where we are in the string. If we are - bingo.
-      if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
-        assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
-        # if the comment size is where we should be at - we found our EOCD
-        return assumed_eocd_location if assumed_eocd_location == window_location
+    str_size = in_str.bytesize
+    indices = all_indices_of_substr_in_str(eocd_signature, in_str)
+    indices.each do |check_at|
+      maybe_record = in_str[check_at..str_size]
+      # If the record is smaller than the minimum - we will never recover anything
+      break if maybe_record.bytesize < minimum_record_size
+      signature, *_rest, comment_size = maybe_record.unpack(unpack_pattern)
+      # Check the only condition for the match
+      if signature == 0x06054b50 && (maybe_record.bytesize - minimum_record_size) == comment_size
+        return check_at # Found the EOCD marker location
       end
-      end_location -= 1 # Shift the window back, by one byte, and try again.
     end
+    # If we haven't caught anything, return nil deliberately instead of returning the last statement
+    nil
   end
   # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the

data/lib/parsers/zip_parser/office_formats.rb ADDED Viewed

@@ -0,0 +1,51 @@
+# Based on an unscientific sample of 63 documents I could find on my hard drive,
+# all docx/pptx/xlsx files contain, at the minimum, the following files:
+#
+#   [Content_Types].xml
+#   _rels/.rels
+#   docProps/core.xml
+#   docProps/app.xml
+#
+# Additionally, per file type, they contain the following:
+#
+#   word/document.xml
+#   xl/workbook.xml
+#   ppt/presentation.xml
+#
+# These are sufficient to say with certainty that a ZIP is in fact an Office document.
+# Also that unscientific sample revealed that I came to dislike MS Office so much as to
+# only have 63 documents on my entire workstation.
+#
+# We do not perform the actual _decoding_ of the Office documents here, because to read
+# their contents we need to:
+#
+# * inflate the compressed part files (potential for deflate bombs)
+# * parse the document XML (potential for XML parser exploitation)
+#
+# which are real threats and require adequate mitigation. For our purposes the
+# token detection of specific filenames should be enough to say with certainty
+# that a document _is_ an Office document, and not just a ZIP.
+module FormatParser::ZIPParser::OfficeFormats
+  OFFICE_MARKER_FILES = Set.new([
+    '[Content_Types].xml',
+    '_rels/.rels',
+    'docProps/core.xml',
+    'docProps/app.xml',
+  ])
+  def office_document?(filenames_set)
+    OFFICE_MARKER_FILES.subset?(filenames_set)
+  end
+  def office_file_format_from_entry_set(filenames_set)
+    if filenames_set.include?('word/document.xml')
+      :docx
+    elsif filenames_set.include?('xl/workbook.xml')
+      :xlsx
+    elsif filenames_set.include?('ppt/presentation.xml')
+      :pptx
+    else
+      :unknown
+    end
+  end
+end

data/spec/esoteric_formats_spec.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require 'spec_helper'
+describe 'Parsing esoteric files and files causing ambiguous detection' do
+  it 'correctly parses the test .docx files as Office docs' do
+    docx_path = fixtures_dir + '/ZIP/10.docx'
+    result = FormatParser.parse(File.open(docx_path, 'rb'))
+    expect(result).not_to be_nil
+    expect(result.nature).to eq(:document)
+  end
+end

data/spec/parsers/flac_parser_spec.rb ADDED Viewed

@@ -0,0 +1,63 @@
+require 'spec_helper'
+describe FormatParser::FLACParser do
+  it 'decodes and estimates duration for the atc_fixture_vbr FLAC File' do
+    fpath = fixtures_dir + 'FLAC/atc_fixture_vbr.flac'
+    parsed = subject.call(File.open(fpath, 'rb'))
+    expect(parsed).not_to be_nil
+    expect(parsed.nature).to eq(:audio)
+    expect(parsed.format).to eq(:flac)
+    expect(parsed.num_audio_channels).to eq(2)
+    expect(parsed.audio_sample_rate_hz).to eq(44100)
+    expect(parsed.intrinsics).not_to be_nil
+    expect(parsed.media_duration_frames).to eq(33810)
+    expect(parsed.media_duration_seconds).to be_within(0.1).of(0.836)
+  end
+  it 'decodes and estimates duration for the 16bit FLAC File' do
+    fpath = fixtures_dir + 'FLAC/c_11k16bitpcm.flac'
+    parsed = subject.call(File.open(fpath, 'rb'))
+    expect(parsed).not_to be_nil
+    expect(parsed.nature).to eq(:audio)
+    expect(parsed.format).to eq(:flac)
+    expect(parsed.intrinsics[:bits_per_sample]).to eq(16)
+    expect(parsed.num_audio_channels).to eq(1)
+    expect(parsed.audio_sample_rate_hz).to eq(11025)
+    expect(parsed.media_duration_frames).to eq(152267)
+    expect(parsed.media_duration_seconds).to be_within(0.01).of(13.81)
+  end
+  it 'raises error on parsing an invalid file' do
+    fpath = fixtures_dir + 'FLAC/invalid.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::InvalidRead)
+  end
+  it 'raises error on parsing a file with an invalid block size' do
+    fpath = fixtures_dir + 'FLAC/invalid_minimum_block_size.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::MalformedFile)
+    fpath = fixtures_dir + 'FLAC/invalid_maximum_block_size.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::MalformedFile)
+  end
+  it 'raises an error when sample rate is 0' do
+    fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
+    expect {
+      subject.call(File.open(fpath, 'rb'))
+    }.to raise_error(FormatParser::IOUtils::MalformedFile)
+  end
+end

data/spec/parsers/moov_parser_spec.rb CHANGED Viewed

@@ -29,9 +29,37 @@ describe FormatParser::MOOVParser do
     end
   end
-  Dir.glob(fixtures_dir + '/MOOV/**/*.*').sort.each do |moov_path|
-    it "is able to parse #{File.basename(moov_path)}" do
-      result = subject.call(File.open(moov_path, 'rb'))
+  Dir.glob(fixtures_dir + '/MOOV/**/*.m4a').sort.each do |m4a_path|
+    it "is able to parse #{File.basename(m4a_path)}" do
+      result = subject.call(File.open(m4a_path, 'rb'))
+      expect(result).not_to be_nil
+      expect(result.nature).to eq(:audio)
+      expect(result.media_duration_seconds).to be_kind_of(Float)
+      expect(result.media_duration_seconds).to be > 0
+      expect(result.intrinsics).not_to be_nil
+    end
+  end
+  Dir.glob(fixtures_dir + '/MOOV/**/*.mov').sort.each do |mov_path|
+    it "is able to parse #{File.basename(mov_path)}" do
+      result = subject.call(File.open(mov_path, 'rb'))
+      expect(result).not_to be_nil
+      expect(result.nature).to eq(:video)
+      expect(result.width_px).to be > 0
+      expect(result.height_px).to be > 0
+      expect(result.media_duration_seconds).to be_kind_of(Float)
+      expect(result.media_duration_seconds).to be > 0
+      expect(result.intrinsics).not_to be_nil
+    end
+  end
+  Dir.glob(fixtures_dir + '/MOOV/**/*.mp4').sort.each do |mp4_path|
+    it "is able to parse #{File.basename(mp4_path)}" do
+      result = subject.call(File.open(mp4_path, 'rb'))
       expect(result).not_to be_nil
       expect(result.nature).to eq(:video)
@@ -44,7 +72,14 @@ describe FormatParser::MOOVParser do
     end
   end
-  it 'parses an M4A file and provides the necessary metadata'
+  it 'parses an M4A file and provides the necessary metadata' do
+    m4a_path = fixtures_dir + '/MOOV/M4A/fixture.m4a'
+    result = subject.call(File.open(m4a_path, 'rb'))
+    expect(result).not_to be_nil
+    expect(result.nature).to eq(:audio)
+    expect(result.format).to eq(:m4a)
+  end
   it 'parses a MOV file and provides the necessary metadata' do
     mov_path = fixtures_dir + '/MOOV/MOV/Test_Circular_ProRes422.mov'

data/spec/parsers/zip_parser_spec.rb CHANGED Viewed

@@ -46,6 +46,30 @@ describe FormatParser::ZIPParser do
     expect(dir_entry.type).to eq(:directory)
   end
+  it 'correctly identifies Word documents' do
+    fixture_path = fixtures_dir + '/ZIP/10.docx'
+    fi_io = File.open(fixture_path, 'rb')
+    result = subject.call(fi_io)
+    expect(result.nature).to eq(:document)
+    expect(result.format).to eq(:docx)
+    fixture_path = fixtures_dir + '/ZIP/sample-docx.docx'
+    fi_io = File.open(fixture_path, 'rb')
+    result = subject.call(fi_io)
+    expect(result.nature).to eq(:document)
+    expect(result.format).to eq(:docx)
+  end
+  it 'is able to handle specific fuzzed input' do
+    r = Random.new(354)
+    1024.times do
+      random_blob = StringIO.new(r.bytes(512 * 1024))
+      subject.call(random_blob) # If there is an error in one of the parsers the example will raise too
+    end
+  end
   it 'returns a result that has a usable JSON representation' do
     fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
     fi_io = File.open(fixture_path, 'rb')

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: format_parser
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Noah Berman
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-03-30 00:00:00.000000000 Z
+date: 2018-04-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ks
@@ -174,6 +174,7 @@ files:
 - lib/parsers/dpx_parser.rb
 - lib/parsers/exif_parser.rb
 - lib/parsers/fdx_parser.rb
+- lib/parsers/flac_parser.rb
 - lib/parsers/gif_parser.rb
 - lib/parsers/jpeg_parser.rb
 - lib/parsers/moov_parser.rb
@@ -188,11 +189,13 @@ files:
 - lib/parsers/wav_parser.rb
 - lib/parsers/zip_parser.rb
 - lib/parsers/zip_parser/file_reader.rb
+- lib/parsers/zip_parser/office_formats.rb
 - lib/read_limiter.rb
 - lib/remote_io.rb
 - lib/video.rb
 - spec/attributes_json_spec.rb
 - spec/care_spec.rb
+- spec/esoteric_formats_spec.rb
 - spec/file_information_spec.rb
 - spec/format_parser_spec.rb
 - spec/io_utils_spec.rb
@@ -201,6 +204,7 @@ files:
 - spec/parsers/dpx_parser_spec.rb
 - spec/parsers/exif_parser_spec.rb
 - spec/parsers/fdx_parser_spec.rb
+- spec/parsers/flac_parser_spec.rb
 - spec/parsers/gif_parser_spec.rb
 - spec/parsers/jpeg_parser_spec.rb
 - spec/parsers/moov_parser_spec.rb
@@ -236,7 +240,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.2
+rubygems_version: 2.7.3
 signing_key:
 specification_version: 4
 summary: A library for efficient parsing of file metadata