format_parser 0.3.5 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archive.rb +36 -0
- data/lib/attributes_json.rb +9 -5
- data/lib/document.rb +1 -0
- data/lib/format_parser.rb +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/pdf_parser.rb +76 -0
- data/lib/parsers/zip_parser.rb +39 -0
- data/lib/parsers/zip_parser/file_reader.rb +485 -0
- data/spec/attributes_json_spec.rb +19 -1
- data/spec/parsers/pdf_parser_spec.rb +68 -0
- data/spec/parsers/zip_parser_spec.rb +68 -0
- metadata +8 -2
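
The new parsers surface through the gem's top-level API. A minimal sketch, assuming the FormatParser.parse entry point from earlier releases (the file names below are made up for illustration):

    require 'format_parser'

    # PDF files now come back as :document results with a page count
    File.open('report.pdf', 'rb') do |f|
      info = FormatParser.parse(f)
      info.nature       #=> :document
      info.format       #=> :pdf
      info.page_count   #=> e.g. 10
    end

    # ZIP files come back as :archive results with an entry listing
    File.open('bundle.zip', 'rb') do |f|
      info = FormatParser.parse(f)
      info.nature                   #=> :archive
      info.entries.map(&:filename)  #=> filenames stored in the archive
    end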
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a76c414094334f57859df79e61d42fa1fdabb3bd
+  data.tar.gz: 120aaee7484ee01165a2c8dd09b796bce7900c9f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a249af874800774dae313b42e4c191125341a6497a9e31b75e54d22ac008725331ce2227e41c167b9f746b85f7db86364dbbdf5614d48f11cb4122e4de01ce03
+  data.tar.gz: e6fee97f2741dccc1c9325813eed247d2a93d7b118b7b6b902cba2f23307650127875d443554a4df1e2ea3c8658c59bab68d2c4d84cef4afe2b4bf1e6454c144
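
For reference, a sketch (not part of the gem) of recomputing these digests from a locally downloaded format_parser-0.4.0.gem, which is a plain tar archive containing metadata.gz and data.tar.gz:

    require 'digest'
    require 'rubygems/package'

    File.open('format_parser-0.4.0.gem', 'rb') do |gem_io|
      Gem::Package::TarReader.new(gem_io) do |tar|
        tar.each do |entry|
          next unless ['metadata.gz', 'data.tar.gz'].include?(entry.full_name)
          # Compare against the SHA512 values published above
          puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
        end
      end
    end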
data/lib/archive.rb
ADDED
@@ -0,0 +1,36 @@
+require 'ks'
+
+module FormatParser
+  class Archive
+    include FormatParser::AttributesJSON
+
+    class Entry < Ks.strict(:type, :size, :filename)
+      def to_json(*a)
+        to_h.to_json(*a)
+      end
+    end
+
+    NATURE = :archive
+
+    # What filetype was recognized? Will contain a non-ambiguous symbol
+    # referring to the file format. The symbol can be used as a filename
+    # extension safely
+    attr_accessor :format
+
+    # Array of Entry structs
+    attr_accessor :entries
+
+    # If a parser wants to provide any extra information to the caller
+    # it can be placed here
+    attr_accessor :intrinsics
+
+    # Only permits assignments via defined accessors
+    def initialize(**attributes)
+      attributes.map { |(k, v)| public_send("#{k}=", v) }
+    end
+
+    def nature
+      NATURE
+    end
+  end
+end
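
A minimal sketch of the new value object in isolation (the entries below are made up for illustration); serialization goes through the AttributesJSON module changed in the next file:

    require 'format_parser'
    require 'json'

    archive = FormatParser::Archive.new(
      format: :zip,
      entries: [
        FormatParser::Archive::Entry.new(type: :file, size: 47, filename: 'file-0005674.txt'),
        FormatParser::Archive::Entry.new(type: :directory, size: 0, filename: 'docs/')
      ]
    )

    archive.nature                 #=> :archive
    JSON.pretty_generate(archive)  # each Entry serializes via its to_h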
data/lib/attributes_json.rb
CHANGED
@@ -14,7 +14,7 @@ module FormatParser::AttributesJSON
 
   # Implements a sane default `as_json` for an object
   # that accessors defined
-  def as_json(
+  def as_json(root: false)
     h = {}
     h['nature'] = nature if respond_to?(:nature) # Needed for file info structs
     methods.grep(/\w\=$/).each_with_object(h) do |attr_writer_method_name, h|
@@ -24,11 +24,15 @@ module FormatParser::AttributesJSON
       # by the caller
      h[reader_method_name] = value.respond_to?(:as_json) ? value.as_json : value
    end
+    if root
+      {'format_parser_file_info' => h}
+    else
+      h
+    end
   end
 
-  # Implements to_json with sane defaults
-
-
-    generator_state.generate(as_json)
+  # Implements to_json with sane defaults, with or without arguments
+  def to_json(*maybe_generator_state)
+    as_json(root: false).to_json(*maybe_generator_state)
   end
 end
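
A rough sketch of the behaviour added here, mirroring the updated specs further down: as_json(root: true) wraps the hash under a 'format_parser_file_info' key, and to_json now also works without a generator-state argument.

    require 'format_parser'
    require 'json'

    class ExampleInfo
      include FormatParser::AttributesJSON
      attr_accessor :foo

      def nature
        'good'
      end
    end

    info = ExampleInfo.new
    info.foo = 42

    info.as_json              #=> {"nature"=>"good", "foo"=>42}
    info.as_json(root: true)  #=> {"format_parser_file_info"=>{"nature"=>"good", "foo"=>42}}
    info.to_json              #=> '{"nature":"good","foo":42}'
    JSON.dump(info)           # still works; JSON passes its generator state through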
data/lib/document.rb
CHANGED
data/lib/format_parser.rb
CHANGED
data/lib/parsers/pdf_parser.rb
ADDED
@@ -0,0 +1,76 @@
+class FormatParser::PDFParser
+  include FormatParser::IOUtils
+
+  # First 9 bytes of a PDF should be in this format, according to:
+  #
+  # https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
+  #
+  # There are however exceptions, which are left out for now.
+  #
+  PDF_MARKER = /%PDF-1\.[0-8]{1}/
+
+  # Page counts have different markers depending on
+  # the PDF type. There is not a single common way of solving
+  # this. The only way of solving this correctly is by adding
+  # different types of PDF's in the specs.
+  #
+  COUNT_MARKERS = ['Count ']
+  EOF_MARKER = '%EOF'
+
+  def call(io)
+    io = FormatParser::IOConstraint.new(io)
+
+    return unless safe_read(io, 9) =~ PDF_MARKER
+
+    attributes = scan_for_attributes(io)
+
+    FormatParser::Document.new(
+      format: :pdf,
+      page_count: attributes[:page_count]
+    )
+  end
+
+  private
+
+  # Read ahead bytes until one of % or / is reached.
+  # A header in a PDF always starts with a /
+  # The % is to detect the EOF
+  #
+  def scan_for_attributes(io)
+    result = {}
+
+    while read = safe_read(io, 1)
+      case read
+      when '%'
+        break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
+      when '/'
+        find_page_count(io, result)
+      end
+    end
+
+    result
+  end
+
+  def find_page_count(io, result)
+    COUNT_MARKERS.each do |marker|
+      if safe_read(io, marker.size) == marker
+        result[:page_count] = read_numbers(io)
+      end
+    end
+  end
+
+  # Read ahead bytes until no more numbers are found
+  # This assumes that the position of io starts at a
+  # number
+  def read_numbers(io)
+    numbers = ''
+
+    while c = safe_read(io, 1)
+      c =~ /\d+/ ? numbers << c : break
+    end
+
+    numbers.to_i
+  end
+
+  FormatParser.register_parser self, natures: :document, formats: :pdf
+end
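
A minimal sketch of invoking the new parser directly, the way the spec further down does (the fixture path is hypothetical):

    parser = FormatParser::PDFParser.new
    result = File.open('fixtures/PDF/10_pages.pdf', 'rb') { |f| parser.call(f) }

    result.nature      #=> :document
    result.format      #=> :pdf
    result.page_count  #=> 10, when a 'Count ' marker was found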
data/lib/parsers/zip_parser.rb
ADDED
@@ -0,0 +1,39 @@
+class FormatParser::ZIPParser
+  require_relative 'zip_parser/file_reader'
+
+  def call(io)
+    reader = FileReader.new
+    entries = reader.read_zip_structure(io: FormatParser::IOConstraint.new(io))
+
+    entries_archive = entries.map do |ze|
+      ft = directory?(ze) ? :directory : :file
+      decoded_filename = decode_filename(ze)
+      FormatParser::Archive::Entry.new(type: ft, size: ze.uncompressed_size, filename: decoded_filename)
+    end
+
+    FormatParser::Archive.new(format: :zip, entries: entries_archive)
+  rescue FileReader::Error
+    # This is not a ZIP, or a broken ZIP.
+    return
+  end
+
+  def directory?(zip_entry)
+    # We can do a lap dance here and parse out the individual bit fields
+    # from the external attributes, check the OS type that is in the entry
+    # to see if it can be interpreted as UNIX or not, and generally have
+    # heaps of fun. Instead, we will be frugal.
+    zip_entry.filename.end_with?('/')
+  end
+
+  def decode_filename(zip_entry)
+    # Check for the EFS bit in the general-purpose flags. If it is set,
+    # the entry filename can be treated as UTF-8
+    if zip_entry.gp_flags & 0b100000000000 == 0b100000000000
+      zip_entry.filename.unpack('U*').pack('U*')
+    else
+      zip_entry.filename.encode(Encoding::UTF_8, undefined: :replace)
+    end
+  end
+
+  FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
+end
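
A minimal sketch of using the ZIP parser on its own, mirroring the specs further down (the archive path is hypothetical):

    parser = FormatParser::ZIPParser.new
    result = File.open('some_archive.zip', 'rb') { |f| parser.call(f) }

    if result # nil when the input is not a ZIP, or is a broken ZIP
      result.format  #=> :zip
      result.entries.each do |entry|
        puts format('%-9s %8d %s', entry.type, entry.size, entry.filename)
      end
    end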
data/lib/parsers/zip_parser/file_reader.rb
ADDED
@@ -0,0 +1,485 @@
+# frozen_string_literal: true
+
+require 'stringio'
+
+# A very barebones ZIP file reader
+class FormatParser::ZIPParser::FileReader
+  Error = Class.new(StandardError)
+  ReadError = Class.new(Error)
+  UnsupportedFeature = Class.new(Error)
+  InvalidStructure = Class.new(Error)
+  LocalHeaderPending = Class.new(Error) do
+    def message
+      'The compressed data offset is not available (local header has not been read)'
+    end
+  end
+  MissingEOCD = Class.new(Error) do
+    def message
+      'Could not find the EOCD signature in the buffer - maybe a malformed ZIP file'
+    end
+  end
+
+  C_UINT32LE = 'V'
+  C_UINT16LE = 'v'
+  C_UINT64LE = 'Q<'
+
+  # To prevent too many tiny reads, read the maximum possible size of end of
+  # central directory record upfront (all the fixed fields + at most 0xFFFF
+  # bytes of the archive comment)
+  MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE =
+    begin
+      4 + # Offset of the start of central directory
+      4 + # Size of the central directory
+      2 + # Number of files in the cdir
+      4 + # End-of-central-directory signature
+      2 + # Number of this disk
+      2 + # Number of disk with the start of cdir
+      2 + # Number of files in the cdir of this disk
+      2 + # The comment size
+      0xFFFF # Maximum comment size
+    end
+
+  # To prevent too many tiny reads, read the maximum possible size of the local file header upfront.
+  # The maximum size is all the usual items, plus the maximum size
+  # of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
+  MAX_LOCAL_HEADER_SIZE =
+    begin
+      4 + # signature
+      2 + # Version needed to extract
+      2 + # gp flags
+      2 + # storage mode
+      2 + # dos time
+      2 + # dos date
+      4 + # CRC32
+      4 + # Comp size
+      4 + # Uncomp size
+      2 + # Filename size
+      2 + # Extra fields size
+      0xFFFF + # Maximum filename size
+      0xFFFF # Maximum extra fields size
+    end
+
+  SIZE_OF_USABLE_EOCD_RECORD =
+    begin
+      4 + # Signature
+      2 + # Number of this disk
+      2 + # Number of the disk with the EOCD record
+      2 + # Number of entries in the central directory of this disk
+      2 + # Number of entries in the central directory total
+      4 + # Size of the central directory
+      4 # Start of the central directory offset
+    end
+
+  private_constant :C_UINT32LE, :C_UINT16LE, :C_UINT64LE, :MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE,
+                   :MAX_LOCAL_HEADER_SIZE, :SIZE_OF_USABLE_EOCD_RECORD
+
+  # Represents a file within the ZIP archive being read
+  class ZipEntry
+    include FormatParser::AttributesJSON
+
+    # @return [Fixnum] bit-packed version signature of the program that made the archive
+    attr_accessor :made_by
+
+    # @return [Fixnum] ZIP version support needed to extract this file
+    attr_accessor :version_needed_to_extract
+
+    # @return [Fixnum] bit-packed general purpose flags
+    attr_accessor :gp_flags
+
+    # @return [Fixnum] Storage mode (0 for stored, 8 for deflate)
+    attr_accessor :storage_mode
+
+    # @return [Fixnum] the bit-packed DOS time
+    attr_accessor :dos_time
+
+    # @return [Fixnum] the bit-packed DOS date
+    attr_accessor :dos_date
+
+    # @return [Fixnum] the CRC32 checksum of this file
+    attr_accessor :crc32
+
+    # @return [Fixnum] size of compressed file data in the ZIP
+    attr_accessor :compressed_size
+
+    # @return [Fixnum] size of the file once uncompressed
+    attr_accessor :uncompressed_size
+
+    # @return [String] the filename
+    attr_accessor :filename
+
+    # @return [Fixnum] disk number where this file starts
+    attr_accessor :disk_number_start
+
+    # @return [Fixnum] internal attributes of the file
+    attr_accessor :internal_attrs
+
+    # @return [Fixnum] external attributes of the file
+    attr_accessor :external_attrs
+
+    # @return [Fixnum] at what offset the local file header starts
+    # in your original IO object
+    attr_accessor :local_file_header_offset
+
+    # @return [String] the file comment
+    attr_accessor :comment
+
+    # @return [Fixnum] at what offset you should start reading
+    # for the compressed data in your original IO object
+    def compressed_data_offset
+      @compressed_data_offset || raise(LocalHeaderPending)
+    end
+
+    # Tells whether the compressed data offset is already known for this entry
+    # @return [Boolean]
+    def known_offset?
+      !@compressed_data_offset.nil?
+    end
+
+    # Tells whether the entry uses a data descriptor (this is defined
+    # by bit 3 in the GP flags).
+    def uses_data_descriptor?
+      (gp_flags & 0x0008) == 0x0008
+    end
+
+    # Sets the offset at which the compressed data for this file starts in the ZIP.
+    # By default, the value will be set by the Reader for you. If you use delayed
+    # reading, you need to set it by using the `get_compressed_data_offset` on the Reader:
+    #
+    #   entry.compressed_data_offset = reader.get_compressed_data_offset(io: file,
+    #     local_file_header_offset: entry.local_header_offset)
+    def compressed_data_offset=(offset)
+      @compressed_data_offset = offset.to_i
+    end
+  end
+
+  # Parse an IO handle to a ZIP archive into an array of Entry objects.
+  #
+  # @param io[#tell, #seek, #read, #size] an IO-ish object
+  # @return [Array<ZipEntry>] an array of entries within the ZIP being parsed
+  def read_zip_structure(io:)
+    zip_file_size = io.size
+    eocd_offset = get_eocd_offset(io, zip_file_size)
+
+    zip64_end_of_cdir_location = get_zip64_eocd_location(io, eocd_offset)
+    num_files, cdir_location, cdir_size =
+      if zip64_end_of_cdir_location
+        num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
+      else
+        num_files_and_central_directory_offset(io, eocd_offset)
+      end
+
+    log { format('Located the central directory start at %d', cdir_location) }
+    seek(io, cdir_location)
+
+    # In zip_tricks we read the entire central directory _and_ enything behind it.
+    # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
+    # BUT! in format_parser we avoid unbounded reads, as a matter of fact they are forbidden.
+    # So we will again limit ouselves to cdir_size, and we will take cushion of 1 KB.
+    central_directory_str = io.read(cdir_size + 1024)
+    central_directory_io = StringIO.new(central_directory_str)
+    log do
+      format(
+        'Read %d bytes with central directory + EOCD record and locator',
+        central_directory_str.bytesize)
+    end
+
+    entries = (0...num_files).map do |entry_n|
+      offset_location = cdir_location + central_directory_io.pos
+      log do
+        format(
+          'Reading the central directory entry %d starting at offset %d',
+          entry_n, offset_location)
+      end
+      read_cdir_entry(central_directory_io)
+    end
+
+    entries
+  end
+
+  private
+
+  def skip_ahead_2(io)
+    skip_ahead_n(io, 2)
+  end
+
+  def skip_ahead_4(io)
+    skip_ahead_n(io, 4)
+  end
+
+  def skip_ahead_8(io)
+    skip_ahead_n(io, 8)
+  end
+
+  def seek(io, absolute_pos)
+    io.seek(absolute_pos)
+    unless absolute_pos == io.pos
+      raise ReadError,
+            "Expected to seek to #{absolute_pos} but only got to #{io.pos}"
+    end
+    nil
+  end
+
+  def assert_signature(io, signature_magic_number)
+    readback = read_4b(io)
+    if readback != signature_magic_number
+      expected = '0x0' + signature_magic_number.to_s(16)
+      actual = '0x0' + readback.to_s(16)
+      raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
+    end
+  end
+
+  def skip_ahead_n(io, n)
+    pos_before = io.pos
+    io.seek(io.pos + n)
+    pos_after = io.pos
+    delta = pos_after - pos_before
+    unless delta == n
+      raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{delta} bytes ahead"
+    end
+    nil
+  end
+
+  def read_n(io, n_bytes)
+    io.read(n_bytes).tap do |d|
+      raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if d.nil?
+      unless d.bytesize == n_bytes
+        raise ReadError, "Expected to read #{n_bytes} bytes, read #{d.bytesize}"
+      end
+    end
+  end
+
+  def read_2b(io)
+    read_n(io, 2).unpack(C_UINT16LE).shift
+  end
+
+  def read_4b(io)
+    read_n(io, 4).unpack(C_UINT32LE).shift
+  end
+
+  def read_8b(io)
+    read_n(io, 8).unpack(C_UINT64LE).shift
+  end
+
+  def read_cdir_entry(io)
+    assert_signature(io, 0x02014b50)
+    ZipEntry.new.tap do |e|
+      e.made_by = read_2b(io)
+      e.version_needed_to_extract = read_2b(io)
+      e.gp_flags = read_2b(io)
+      e.storage_mode = read_2b(io)
+      e.dos_time = read_2b(io)
+      e.dos_date = read_2b(io)
+      e.crc32 = read_4b(io)
+      e.compressed_size = read_4b(io)
+      e.uncompressed_size = read_4b(io)
+      filename_size = read_2b(io)
+      extra_size = read_2b(io)
+      comment_len = read_2b(io)
+      e.disk_number_start = read_2b(io)
+      e.internal_attrs = read_2b(io)
+      e.external_attrs = read_4b(io)
+      e.local_file_header_offset = read_4b(io)
+      e.filename = read_n(io, filename_size)
+
+      # Extra fields
+      extras = read_n(io, extra_size)
+      # Comment
+      e.comment = read_n(io, comment_len)
+
+      # Parse out the extra fields
+      extra_table = parse_out_extra_fields(extras)
+
+      # ...of which we really only need the Zip64 extra
+      if zip64_extra_contents ||= extra_table[1]
+        # If the Zip64 extra is present, we let it override all
+        # the values fetched from the conventional header
+        zip64_extra = StringIO.new(zip64_extra_contents)
+        log do
+          format(
+            'Will read Zip64 extra data for %s, %d bytes',
+            e.filename, zip64_extra.size)
+        end
+        # Now here be dragons. The APPNOTE specifies that
+        #
+        # > The order of the fields in the ZIP64 extended
+        # > information record is fixed, but the fields will
+        # > only appear if the corresponding Local or Central
+        # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
+        #
+        # It means that before we read this stuff we need to check if the previously-read
+        # values are at overflow, and only _then_ proceed to read them. Bah.
+        if e.uncompressed_size == 0xFFFFFFFF
+          e.uncompressed_size = read_8b(zip64_extra)
+        end
+        if e.compressed_size == 0xFFFFFFFF
+          e.compressed_size = read_8b(zip64_extra)
+        end
+        if e.local_file_header_offset == 0xFFFFFFFF
+          e.local_file_header_offset = read_8b(zip64_extra)
+        end
+        # Disk number comes last and we can skip it anyway, since we do
+        # not support multi-disk archives
+      end
+    end
+  end
+
+  def get_eocd_offset(file_io, zip_file_size)
+    # Start reading from the _comment_ of the zip file (from the very end).
+    # The maximum size of the comment is 0xFFFF (what fits in 2 bytes)
+    implied_position_of_eocd_record = zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
+    implied_position_of_eocd_record = 0 if implied_position_of_eocd_record < 0
+
+    # Use a soft seek (we might not be able to get as far behind in the IO as we want)
+    # and a soft read (we might not be able to read as many bytes as we want)
+    file_io.seek(implied_position_of_eocd_record)
+    str_containing_eocd_record = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
+    raise MissingEOCD unless str_containing_eocd_record
+
+    eocd_idx_in_buf = locate_eocd_signature(str_containing_eocd_record)
+
+    raise MissingEOCD unless eocd_idx_in_buf
+
+    eocd_offset = implied_position_of_eocd_record + eocd_idx_in_buf
+    log { format('Found EOCD signature at offset %d', eocd_offset) }
+
+    eocd_offset
+  end
+
+  # This is tricky. Essentially, we have to scan the maximum possible number
+  # of bytes (that the EOCD can theoretically occupy including the comment),
+  # and we have to find a combination of:
+  #   [EOCD signature, <some ZIP medatata>, comment byte size, the comment of
+  #   that size, eof].
+  # The only way I could find to do this was with a sliding window, but
+  # there probably is a better way.
+  def locate_eocd_signature(in_str)
+    # We have to scan from the _very_ tail. We read the very minimum size
+    # the EOCD record can have (up to and including the comment size), using
+    # a sliding window. Once our end offset matches the comment size we found our
+    # EOCD marker.
+    unpack_pattern = 'VvvvvVVv'
+    minimum_record_size = 22
+    end_location = minimum_record_size * -1
+    loop do
+      # If the window is nil, we have rolled off the start of the string, nothing to do here.
+      # We use negative values because if we used positive slice indices
+      # we would have to detect the rollover ourselves
+      break unless window = in_str[end_location, minimum_record_size]
+
+      window_location = in_str.bytesize + end_location
+      unpacked = window.unpack(unpack_pattern)
+      # If we found the signarue, pick up the comment size, and check if the size of the window
+      # plus that comment size is where we are in the string. If we are - bingo.
+      if unpacked[0] == 0x06054b50 && comment_size = unpacked[-1]
+        assumed_eocd_location = in_str.bytesize - comment_size - minimum_record_size
+        # if the comment size is where we should be at - we found our EOCD
+        return assumed_eocd_location if assumed_eocd_location == window_location
+      end
+
+      end_location -= 1 # Shift the window back, by one byte, and try again.
+    end
+  end
+
+  # Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the
+  # EOCD record in the archive by fixed offsets
+  def get_zip64_eocd_location(file_io, eocd_offset)
+    zip64_eocd_loc_offset = eocd_offset
+    zip64_eocd_loc_offset -= 4 # The signature
+    zip64_eocd_loc_offset -= 4 # Which disk has the Zip64 end of central directory record
+    zip64_eocd_loc_offset -= 8 # Offset of the zip64 central directory record
+    zip64_eocd_loc_offset -= 4 # Total number of disks
+
+    log do
+      format(
+        'Will look for the Zip64 EOCD locator signature at offset %d',
+        zip64_eocd_loc_offset)
+    end
+
+    # If the offset is negative there is certainly no Zip64 EOCD locator here
+    return unless zip64_eocd_loc_offset >= 0
+
+    file_io.seek(zip64_eocd_loc_offset)
+    assert_signature(file_io, 0x07064b50)
+
+    log { format('Found Zip64 EOCD locator at offset %d', zip64_eocd_loc_offset) }
+
+    disk_num = read_4b(file_io) # number of the disk
+    raise UnsupportedFeature, 'The archive spans multiple disks' if disk_num != 0
+    read_8b(file_io)
+  rescue ReadError, InvalidStructure
+    nil
+  end
+
+  # num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
+  def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
+    seek(io, zip64_end_of_cdir_location)
+
+    assert_signature(io, 0x06064b50)
+
+    zip64_eocdr_size = read_8b(io)
+    zip64_eocdr = read_n(io, zip64_eocdr_size) # Reading in bulk is cheaper
+    zip64_eocdr = StringIO.new(zip64_eocdr)
+    skip_ahead_2(zip64_eocdr) # version made by
+    skip_ahead_2(zip64_eocdr) # version needed to extract
+
+    disk_n = read_4b(zip64_eocdr) # number of this disk
+    disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
+    if disk_n != disk_n_with_eocdr
+      raise UnsupportedFeature, 'The archive spans multiple disks'
+    end
+
+    num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
+    num_files_total = read_8b(zip64_eocdr) # files total in the central directory
+
+    if num_files_this_disk != num_files_total
+      raise UnsupportedFeature, 'The archive spans multiple disks'
+    end
+
+    log do
+      format(
+        'Zip64 EOCD record states there are %d files in the archive',
+        num_files_total)
+    end
+
+    central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
+    central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts
+
+    [num_files_total, central_dir_offset, central_dir_size]
+  end
+
+  def num_files_and_central_directory_offset(file_io, eocd_offset)
+    seek(file_io, eocd_offset)
+
+    # The size of the EOCD record is known upfront, so use a strict read
+    eocd_record_str = read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD)
+    io = StringIO.new(eocd_record_str)
+
+    assert_signature(io, 0x06054b50)
+    skip_ahead_2(io) # number_of_this_disk
+    skip_ahead_2(io) # number of the disk with the EOCD record
+    skip_ahead_2(io) # number of entries in the central directory of this disk
+    num_files = read_2b(io) # number of entries in the central directory total
+    cdir_size = read_4b(io) # size of the central directory
+    cdir_offset = read_4b(io) # start of central directorty offset
+    [num_files, cdir_offset, cdir_size]
+  end
+
+  # Is provided as a stub to be overridden in a subclass if you need it. Will report
+  # during various stages of reading. The log message is contained in the return value
+  # of `yield` in the method (the log messages are lazy-evaluated).
+  def log
+    # $stderr.puts(yield)
+  end
+
+  def parse_out_extra_fields(extra_fields_str)
+    extra_table = {}
+    extras_buf = StringIO.new(extra_fields_str)
+    until extras_buf.eof?
+      extra_id = read_2b(extras_buf)
+      extra_size = read_2b(extras_buf)
+      extra_contents = read_n(extras_buf, extra_size)
+      extra_table[extra_id] = extra_contents
+    end
+    extra_table
+  end
+end
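
A minimal sketch of driving the reader directly; the ZIPParser above wraps the IO in FormatParser::IOConstraint so that all reads stay bounded, and the same is assumed here (the archive path is hypothetical):

    reader = FormatParser::ZIPParser::FileReader.new
    entries = File.open('some_archive.zip', 'rb') do |f|
      reader.read_zip_structure(io: FormatParser::IOConstraint.new(f))
    end

    entries.each do |entry|
      puts format('%s: %d bytes compressed, %d bytes uncompressed',
                  entry.filename, entry.compressed_size, entry.uncompressed_size)
    end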
data/spec/attributes_json_spec.rb
CHANGED
@@ -13,7 +13,7 @@ describe FormatParser::AttributesJSON do
     instance.foo = 42
     instance.bar = 'abcdef'
     expect(instance.as_json).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
-    expect(instance.as_json(root: true)).to eq('nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil)
+    expect(instance.as_json(root: true)).to eq('format_parser_file_info' => {'nature' => 'good', 'foo' => 42, 'bar' => 'abcdef', 'baz' => nil})
   end
 
   it 'is included into file information types' do
@@ -49,4 +49,22 @@ describe FormatParser::AttributesJSON do
     standard_output = JSON.dump(instance)
     expect(pretty_output).not_to eq(standard_output)
   end
+
+  it 'provides to_json without arguments' do
+    anon_class = Class.new do
+      include FormatParser::AttributesJSON
+      attr_accessor :foo, :bar, :baz
+      def nature
+        'good'
+      end
+    end
+    instance = anon_class.new
+    instance.foo = 42
+    instance.bar = 'abcdef'
+
+    output = instance.to_json
+    readback = JSON.parse(output, symbolize_names: true)
+
+    expect(readback).to have_key(:nature)
+  end
 end
data/spec/parsers/pdf_parser_spec.rb
ADDED
@@ -0,0 +1,68 @@
+require 'spec_helper'
+
+describe FormatParser::PDFParser do
+  let(:parsed_pdf) {
+    subject.call(
+      File.open(
+        Pathname.new(fixtures_dir).join('PDF').join(pdf_file),
+        'rb'
+      )
+    )
+  }
+
+  shared_examples :behave_like_pdf do |hash|
+    let(:pdf_file) { hash.fetch(:file) }
+
+    it 'acts as a pdf' do
+      expect(parsed_pdf).not_to be_nil
+      expect(parsed_pdf.nature).to eq(:document)
+      expect(parsed_pdf.format).to eq(:pdf)
+    end
+
+    it 'has a correct page count' do
+      expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
+    end
+  end
+
+  describe 'a PDF file with a missing version header' do
+    let(:pdf_file) { 'not_a.pdf' }
+
+    it 'does not parse succesfully' do
+      expect(parsed_pdf).to be_nil
+    end
+  end
+
+  describe 'a PDF file with a correct header but no valid content' do
+    let(:pdf_file) { 'broken.pdf' }
+
+    pending 'does not parse succesfully'
+  end
+
+  describe 'exceeding the PDF read limit' do
+    let(:pdf_file) { 'read_limit.pdf' }
+
+    pending 'does not parse succesfully'
+  end
+
+  describe 'a PDF file with a missing COUNT_HEADER' do
+    let(:pdf_file) { 'missing_page_count.pdf' }
+
+    it 'does not return a page count' do
+      expect(parsed_pdf.page_count).to eq(nil)
+    end
+  end
+
+  describe 'parses a PDF file' do
+    describe 'a single page file' do
+      include_examples :behave_like_pdf, file: '1_page.pdf', page_count: 1
+    end
+
+    describe 'a multi page pdf file' do
+      include_examples :behave_like_pdf, file: '2_pages.pdf', page_count: 2
+    end
+
+    describe 'a multi page pdf file with content' do
+      include_examples :behave_like_pdf, file: '10_pages.pdf', page_count: 10
+    end
+  end
+end
data/spec/parsers/zip_parser_spec.rb
ADDED
@@ -0,0 +1,68 @@
+require 'spec_helper'
+
+describe FormatParser::ZIPParser do
+  it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
+    fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    expect(result).not_to be_nil
+
+    expect(result.format).to eq(:zip)
+    expect(result.nature).to eq(:archive)
+    expect(result.entries.length).to eq(0xFFFF + 1)
+
+    entry = result.entries.fetch(5674)
+    expect(entry.type).to eq(:file)
+    expect(entry.size).to eq(47)
+    expect(entry.filename).to eq('file-0005674.txt')
+  end
+
+  it 'parses a ZIP archive with a few files' do
+    fixture_path = fixtures_dir + '/ZIP/arch_few_entries.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    expect(result).not_to be_nil
+
+    expect(result.format).to eq(:zip)
+    expect(result.nature).to eq(:archive)
+    expect(result.entries.length).to eq(3)
+  end
+
+  it 'correctly identifies an empty directory' do
+    fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    expect(result).not_to be_nil
+
+    expect(result.format).to eq(:zip)
+    expect(result.nature).to eq(:archive)
+    expect(result.entries.length).to eq(3)
+
+    dir_entry = result.entries.last
+    expect(dir_entry.filename).to eq('папочка/')
+    expect(dir_entry.type).to eq(:directory)
+  end
+
+  it 'returns a result that has a usable JSON representation' do
+    fixture_path = fixtures_dir + '/ZIP/arch_with_empty_dir.zip'
+    fi_io = File.open(fixture_path, 'rb')
+
+    result = subject.call(fi_io)
+    json_repr = JSON.pretty_generate(result)
+
+    json_parsed_repr = JSON.parse(json_repr, symbolize_names: true)
+    expect(json_parsed_repr[:nature]).to eq('archive')
+    expect(json_parsed_repr[:format]).to eq('zip')
+    expect(json_parsed_repr[:entries]).to be_kind_of(Array)
+    expect(json_parsed_repr[:entries].length).to eq(3)
+
+    json_parsed_repr[:entries].each do |e|
+      expect(e[:filename]).to be_kind_of(String)
+      expect(e[:size]).to be_kind_of(Integer)
+      expect(e[:type]).to be_kind_of(String)
+    end
+  end
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: format_parser
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.4.0
 platform: ruby
 authors:
 - Noah Berman
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-03-
+date: 2018-03-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ks
@@ -159,6 +159,7 @@ files:
 - README.md
 - Rakefile
 - format_parser.gemspec
+- lib/archive.rb
 - lib/attributes_json.rb
 - lib/audio.rb
 - lib/care.rb
@@ -180,10 +181,13 @@ files:
 - lib/parsers/mp3_parser.rb
 - lib/parsers/mp3_parser/id3_v1.rb
 - lib/parsers/mp3_parser/id3_v2.rb
+- lib/parsers/pdf_parser.rb
 - lib/parsers/png_parser.rb
 - lib/parsers/psd_parser.rb
 - lib/parsers/tiff_parser.rb
 - lib/parsers/wav_parser.rb
+- lib/parsers/zip_parser.rb
+- lib/parsers/zip_parser/file_reader.rb
 - lib/read_limiter.rb
 - lib/remote_io.rb
 - lib/video.rb
@@ -201,10 +205,12 @@ files:
 - spec/parsers/jpeg_parser_spec.rb
 - spec/parsers/moov_parser_spec.rb
 - spec/parsers/mp3_parser_spec.rb
+- spec/parsers/pdf_parser_spec.rb
 - spec/parsers/png_parser_spec.rb
 - spec/parsers/psd_parser_spec.rb
 - spec/parsers/tiff_parser_spec.rb
 - spec/parsers/wav_parser_spec.rb
+- spec/parsers/zip_parser_spec.rb
 - spec/read_limiter_spec.rb
 - spec/remote_fetching_spec.rb
 - spec/remote_io_spec.rb