RubyGems - format_parser - Versions diffs - 0.1.0 - Mend

format_parser 0.1.0

Files changed (43) hide show

checksums.yaml +7 -0
data/.gitignore +13 -0
data/.rspec +3 -0
data/.travis.yml +11 -0
data/CODE_OF_CONDUCT.md +46 -0
data/CONTRIBUTING.md +157 -0
data/Gemfile +4 -0
data/LICENSE.txt +20 -0
data/README.md +53 -0
data/Rakefile +12 -0
data/format_parser.gemspec +43 -0
data/lib/care.rb +123 -0
data/lib/file_information.rb +70 -0
data/lib/format_parser.rb +55 -0
data/lib/format_parser/version.rb +3 -0
data/lib/io_utils.rb +41 -0
data/lib/parsers/aiff_parser.rb +86 -0
data/lib/parsers/dpx_parser.rb +143 -0
data/lib/parsers/exif_parser.rb +58 -0
data/lib/parsers/gif_parser.rb +49 -0
data/lib/parsers/jpeg_parser.rb +122 -0
data/lib/parsers/png_parser.rb +80 -0
data/lib/parsers/psd_parser.rb +21 -0
data/lib/parsers/tiff_parser.rb +71 -0
data/lib/read_limiter.rb +39 -0
data/lib/remote_io.rb +89 -0
data/spec/aiff_parser_spec.rb +25 -0
data/spec/care_spec.rb +77 -0
data/spec/file_information_spec.rb +16 -0
data/spec/format_parser_spec.rb +23 -0
data/spec/io_utils_spec.rb +42 -0
data/spec/parsers/dpx_parser_spec.rb +29 -0
data/spec/parsers/exif_parser_spec.rb +45 -0
data/spec/parsers/gif_parser_spec.rb +35 -0
data/spec/parsers/jpeg_parser_spec.rb +36 -0
data/spec/parsers/png_parser_spec.rb +33 -0
data/spec/parsers/psd_parser_spec.rb +21 -0
data/spec/parsers/tiff_parser_spec.rb +37 -0
data/spec/read_limiter_spec.rb +35 -0
data/spec/remote_fetching_spec.rb +32 -0
data/spec/remote_io_spec.rb +56 -0
data/spec/spec_helper.rb +22 -0
metadata +189 -0

data/lib/parsers/gif_parser.rb ADDED Viewed

@@ -0,0 +1,49 @@
+class FormatParser::GIFParser
+  HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
+  NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
+  include FormatParser::IOUtils
+  def information_from_io(io)
+    header = safe_read(io, 6)
+    return unless HEADERS.include?(header)
+    w, h = safe_read(io, 4).unpack('vv')
+    gct_byte, bgcolor_index, pixel_aspect_ratio = safe_read(io, 5).unpack('Cvv')
+    # and actually onwards for this:
+    # http://www.matthewflickinger.com/lab/whatsinagif/bits_and_bytes.asp
+    # Determine how big our color table is
+    has_gct = gct_byte[0] == 1
+    bytes_per_color = gct_byte >> 6
+    unpacked_radix = gct_byte & 0b00000111
+    num_colors = 2**(unpacked_radix + 1)
+    gct_table_size = num_colors*bytes_per_color
+    # If we have the global color table - skip over it
+    if has_gct
+      safe_read(io, gct_table_size)
+    end
+    # Now it gets interesting - we are at the place where an
+    # application extension for the NETSCAPE2.0 block will occur.
+    # If it does, it most likely means the application that wrote the
+    # GIF needed looping, and if it did, it means that the GIF is
+    # very, very likely to be animated. To read the actual animation
+    # we need to skip over actual image data frames, which, in case
+    # of our paged reads, will incur
+    potentially_netscape_app_header = safe_read(io, 64)
+    is_animated = potentially_netscape_app_header.include?(NETSCAPE_AND_AUTHENTICATION_CODE)
+    FormatParser::FileInformation.image(
+      file_type: :gif,
+      width_px: w,
+      height_px: h,
+      has_multiple_frames: is_animated,
+      color_mode: :indexed,
+    )
+  end
+  FormatParser.register_parser_constructor self
+end

data/lib/parsers/jpeg_parser.rb ADDED Viewed

@@ -0,0 +1,122 @@
+class FormatParser::JPEGParser
+  include FormatParser::IOUtils
+  class InvalidStructure < StandardError
+  end
+  SOI_MARKER = 0xD8 # start of image
+  SOF_MARKERS = [0xC0..0xC3, 0xC5..0xC7, 0xC9..0xCB, 0xCD..0xCF]
+  EOI_MARKER  = 0xD9  # end of image
+  SOS_MARKER  = 0xDA  # start of stream
+  APP1_MARKER = 0xE1  # maybe EXIF
+  def information_from_io(io)
+    @buf = io
+    @width             = nil
+    @height            = nil
+    @orientation       = nil
+    scan
+  end
+  private
+  def advance(n)
+    safe_read(@buf, n); nil
+  end
+  def read_char
+    safe_read(@buf, 1).unpack('C').first
+  end
+  def read_short
+    safe_read(@buf, 2).unpack('n*').first
+  end
+  def scan
+    # Return early if it is not a JPEG at all
+    signature = read_next_marker
+    return unless signature == SOI_MARKER
+    while marker = read_next_marker
+      case marker
+      when *SOF_MARKERS
+        scan_start_of_frame
+      when EOI_MARKER, SOS_MARKER
+        break
+      when APP1_MARKER
+        scan_app1_frame
+      else
+        skip_frame
+      end
+      # Return at the earliest possible opportunity
+      if @width && @height && @orientation
+        file_info = FormatParser::FileInformation.image(
+          file_type: :jpg,
+          width_px: @width,
+          height_px: @height,
+          orientation: @orientation
+        )
+        return file_info
+      elsif @width && @height
+        file_info = FormatParser::FileInformation.image(
+          file_type: :jpg,
+          width_px: @width,
+          height_px: @height
+        )
+        return file_info
+      end
+    end
+    nil # We could not parse anything
+  rescue InvalidStructure
+    nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
+  end
+  # Read a byte, if it is 0xFF then skip bytes as long as they are also 0xFF (byte stuffing)
+  # and return the first byte scanned that is not 0xFF
+  def read_next_marker
+    c = read_char while c != 0xFF
+    c = read_char while c == 0xFF
+    c
+  end
+  def scan_start_of_frame
+    length = read_short
+    read_char # depth, unused
+    height = read_short
+    width  = read_short
+    size   = read_char
+    if length == (size * 3) + 8
+      @width, @height = width, height
+    else
+      raise InvalidStructure
+    end
+  end
+  def scan_app1_frame
+    frame = @buf.read(8)
+    if frame.include?("Exif")
+      scanner = FormatParser::EXIFParser.new(:jpeg, @buf)
+      if scanner.scan_image_exif
+        @exif_output = scanner.exif_data
+        @orientation = scanner.orientation unless scanner.orientation.nil?
+        @width = @exif_output.pixel_x_dimension || scanner.width
+        @height = @exif_output.pixel_y_dimension || scanner.height
+      end
+    end
+  end
+  def read_frame
+    length = read_short - 2
+    safe_read(@buf, length)
+  end
+  def skip_frame
+    length = read_short - 2
+    advance(length)
+  end
+  FormatParser.register_parser_constructor self
+end

data/lib/parsers/png_parser.rb ADDED Viewed

@@ -0,0 +1,80 @@
+class FormatParser::PNGParser
+  PNG_HEADER_BYTES = [137, 80, 78, 71, 13, 10, 26, 10].pack('C*')
+  COLOR_TYPES = {
+    0 => :grayscale,
+    2 => :rgb,
+    3 => :indexed,
+    4 => :grayscale, # with alpha
+    6 => :rgba,
+  }
+  TRANSPARENCY_PER_COLOR_TYPE = {
+    0 => true,
+    4 => true, # Grayscale with alpha
+    6 => true,
+  }
+  include FormatParser::IOUtils
+  def chunk_length_and_type(io)
+    safe_read(io, 8).unpack("Na4")
+  end
+  def information_from_io(io)
+    magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
+    return unless magic_bytes == PNG_HEADER_BYTES
+    chunk_length, chunk_type = chunk_length_and_type(io)
+    # For later: look at gAMA and iCCP chunks too. For now,
+    # all we care about is the IHDR chunk, and it must have the
+    # correct length as well.
+    # IHDR _must_ come first, no exceptions. If it doesn't
+    # we should not consider this a valid PNG.
+    return unless chunk_type == "IHDR" && chunk_length == 13
+    chunk_data = safe_read(io, chunk_length)
+    # Width:              4 bytes
+    # Height:             4 bytes
+    # Bit depth:          1 byte
+    # Color type:         1 byte (0, 2, 3, 4, 6)
+    # Compression method: 1 byte
+    # Filter method:      1 byte
+    # Interlace method:   1 byte
+    w, h, bit_depth, color_type,
+      compression_method, filter_method, interlace_method = chunk_data.unpack("N2C5")
+    color_mode = COLOR_TYPES.fetch(color_type)
+    has_transparency = TRANSPARENCY_PER_COLOR_TYPE[color_type]
+    # Read the next chunk. If it turns out to be acTL (animation control)
+    # we are dealing with an APNG.
+    safe_skip(io, 4)
+    # dry-validation won't let booleans be filled with nil so we have to set
+    # has_animation to false by default
+    has_animation = nil
+    num_frames = nil
+    loop_n_times = nil
+    chunk_length, chunk_type = chunk_length_and_type(io)
+    if chunk_length == 8 && chunk_type == 'acTL'
+      # https://wiki.mozilla.org/APNG_Specification#.60acTL.60:_The_Animation_Control_Chunk
+      # Unlike GIF, we do have the frame count that we can recover
+      has_animation = true
+      num_frames, loop_n_times = safe_read(io, 8).unpack('NN')
+    end
+    FormatParser::FileInformation.image(
+      file_type: :png,
+      width_px: w,
+      height_px: h,
+      has_transparency: has_transparency,
+      color_mode: color_mode,
+      has_multiple_frames: has_animation,
+      num_animation_or_video_frames: num_frames,
+    )
+  end
+  FormatParser.register_parser_constructor self
+end

data/lib/parsers/psd_parser.rb ADDED Viewed

@@ -0,0 +1,21 @@
+class FormatParser::PSDParser
+  PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
+  include FormatParser::IOUtils
+  def information_from_io(io)
+    magic_bytes = safe_read(io, 4).unpack("C4")
+    return unless magic_bytes == PSD_HEADER
+    # We can be reasonably certain this is a PSD so we grab the height
+    # and width bytes
+    w,h = safe_read(io, 22).unpack("x10N2")
+    FormatParser::FileInformation.image(
+      file_type: :psd,
+      width_px: w,
+      height_px: h,
+    )
+  end
+  FormatParser.register_parser_constructor self
+end

data/lib/parsers/tiff_parser.rb ADDED Viewed

@@ -0,0 +1,71 @@
+class FormatParser::TIFFParser
+  LITTLE_ENDIAN_TIFF_HEADER_BYTES = [0x49, 0x49, 0x2A, 0x0]
+  BIG_ENDIAN_TIFF_HEADER_BYTES = [0x4D, 0x4D, 0x0, 0x2A]
+  WIDTH_TAG  = 0x100
+  HEIGHT_TAG = 0x101
+  include FormatParser::IOUtils
+  def information_from_io(io)
+    magic_bytes = safe_read(io, 4).unpack("C4")
+    endianness = scan_tiff_endianness(magic_bytes)
+    return unless endianness
+    w, h = read_tiff_by_endianness(io, endianness)
+    scanner = FormatParser::EXIFParser.new(:tiff, io)
+    scanner.scan_image_exif
+    if scanner.orientation
+      FormatParser::FileInformation.image(
+        file_type: :tif,
+        width_px: w,
+        height_px: h,
+        orientation: scanner.orientation
+      )
+    else
+      FormatParser::FileInformation.image(
+        file_type: :tif,
+        width_px: w,
+        height_px: h
+      )
+    end
+  end
+  # TIFFs can be either big or little endian, so we check here
+  # and set our unpack method argument to suit.
+  def scan_tiff_endianness(magic_bytes)
+    if magic_bytes == LITTLE_ENDIAN_TIFF_HEADER_BYTES
+      "v"
+    elsif magic_bytes == BIG_ENDIAN_TIFF_HEADER_BYTES
+      "n"
+    else
+      nil
+    end
+  end
+  # The TIFF format stores metadata in a flexible set of information fields
+  # called tags, which are stored in a header referred to as the IFD or
+  # Image File Directory. It is not necessarily in the same place in every image,
+  # so we need to do some work to scan through it and find the tags we need.
+  # For more information the TIFF wikipedia page is a reasonable place to start:
+  # https://en.wikipedia.org/wiki/TIFF
+  def scan_ifd(cache, offset, endianness)
+    entry_count = safe_read(cache, 4).unpack(endianness)[0]
+    entry_count.times do |i|
+      cache.seek(offset + 2 + (12 * i))
+      tag = safe_read(cache, 4).unpack(endianness)[0]
+      if tag == WIDTH_TAG
+        @width = safe_read(cache, 4).unpack(endianness.upcase)[0]
+      elsif tag == HEIGHT_TAG
+        @height = safe_read(cache, 4).unpack(endianness.upcase)[0]
+      end
+    end
+  end
+  def read_tiff_by_endianness(io, endianness)
+    offset = safe_read(io, 4).unpack(endianness.upcase)[0]
+    io.seek(offset)
+    scan_ifd(io, offset, endianness)
+    [@width, @height]
+  end
+  FormatParser.register_parser_constructor self
+end

data/lib/read_limiter.rb ADDED Viewed

@@ -0,0 +1,39 @@
+class FormatParser::ReadLimiter
+  NO_LIMIT = nil
+  class BudgetExceeded < StandardError
+  end
+  def initialize(io, max_bytes: NO_LIMIT, max_reads: NO_LIMIT, max_seeks: NO_LIMIT)
+    @max_bytes = max_bytes
+    @max_reads = max_reads
+    @max_seeks = max_seeks
+    @io = io
+    @seeks = 0
+    @reads = 0
+    @bytes = 0
+  end
+  def seek(to_offset)
+    @seeks += 1
+    if @max_seeks && @seeks > @max_seeks
+      raise BudgetExceeded, "Seek budget exceeded (%d seeks performed)" % @max_seeks
+    end
+    @io.seek(to_offset)
+  end
+  def read(n)
+    @bytes += n
+    @reads += 1
+    if @max_bytes && @bytes > @max_bytes
+      raise BudgetExceeded, "Read bytes budget (%d) exceeded" % @max_bytes
+    end
+    if @max_reads && @reads > @max_reads
+      raise BudgetExceeded, "Number of read() calls exceeded (%d max)" % @max_reads
+    end
+    @io.read(n)
+  end
+end

data/lib/remote_io.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# A minimal read-only, IO-like view of a remote resource. Every read is
+# performed as an HTTP GET with a Range header.
+class FormatParser::RemoteIO
+  # Represents a failure that might be retried
+  # (like a 5xx response or a timeout)
+  class IntermittentFailure < StandardError
+  end
+  # Represents a failure that should not be retried
+  # (like a 4xx response or a DNS resolution error)
+  class InvalidRequest < StandardError
+  end
+  # @param uri[URI, String] the remote URL to obtain
+  def initialize(uri)
+    # Loaded here and not at the top of the file, so that Faraday is only
+    # needed once a RemoteIO is actually created
+    require 'faraday'
+    @uri = uri
+    @pos = 0
+    @remote_size = false
+  end
+  # Emulates IO#seek
+  def seek(offset)
+    @pos = offset
+    0 # always return 0
+  end
+  # Emulates IO#size.
+  #
+  # @return [Fixnum] the size of the remote resource
+  def size
+    raise "Remote size not yet obtained, need to perform at least one read() to get it" unless @remote_size
+    @remote_size
+  end
+  # Emulates IO#read, but requires the number of bytes to read.
+  # The read will be limited to the
+  # size of the remote resource relative to the current offset in the IO,
+  # so if you are at offset 0 in the IO of size 10, doing a `read(20)`
+  # will only return you 10 bytes of result, and not raise any exceptions.
+  # A successful read moves the offset forward by the number of bytes
+  # that were actually returned.
+  #
+  # @param n_bytes[Fixnum] how many bytes to read
+  # @return [String, nil] the read bytes, or nil when reading past the end
+  def read(n_bytes)
+    http_range = (@pos..(@pos + n_bytes - 1))
+    maybe_size, maybe_body = request_range(http_range)
+    # When the range could not be satisfied both values are nil. Leave a
+    # previously obtained size and the offset alone in that case.
+    return unless maybe_size && maybe_body
+    @remote_size = maybe_size
+    @pos += maybe_body.bytesize
+    maybe_body.force_encoding(Encoding::BINARY)
+  end
+  protected
+  # Only used internally when reading the remote file
+  #
+  # @param range[Range] the HTTP range of data to fetch from remote
+  # @return [Array] the size of the remote resource and the response body
+  #   of the ranged request, or two nils if the range lies past the end
+  def request_range(range)
+    # We use a GET and not a HEAD request followed by a GET because
+    # S3 does not allow HEAD requests if you only presigned your URL for GETs, so we
+    # combine the first GET of a segment and retrieving the size of the resource
+    response = Faraday.get(@uri, nil, range: "bytes=%d-%d" % [range.begin, range.end])
+    case response.status
+    when 200, 206
+      # Figure out if the server supports content ranges, if it doesn't we have no
+      # business working with that server
+      range_header = response.headers['Content-Range']
+      raise InvalidRequest, "No range support at #{@uri}" unless range_header
+      # "Content-Range: bytes 0-0/307404381" is how the response header is structured
+      size = range_header[/\/(\d+)$/, 1].to_i
+      # S3 returns 200 when you request a Range that is fully satisfied by the entire object,
+      # we take that into account here. For other servers, 206 is the expected response code.
+      # Also, if we request a _larger_ range than what can be satisfied by the server,
+      # the response is going to only contain what _can_ be sent and the status is also going
+      # to be 206
+      return [size, response.body]
+    when 416
+      # We return `nil` as the body if we tried to read past the end of the IO,
+      # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
+      # S3 will also handily _not_ supply us with the Content-Range of the actual resource
+      return [nil, nil]
+    when 500..599
+      raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
+    else
+      raise InvalidRequest, "Server at #{@uri} replied with a #{response.status} and refused our request"
+    end
+  end
+end