format_parser 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 893aa7a0c9ab6b3f13930caffd0c455ef341c91f
4
- data.tar.gz: e63bb62ac26c49e1958ab62eb9410058084d7d44
3
+ metadata.gz: 205c4099f44c0080b53e210ea18cebd4484476ba
4
+ data.tar.gz: d8145f8f77be44dab386585679a8c7f3ce48869d
5
5
  SHA512:
6
- metadata.gz: 1ba899600ad478696d35478a57838d2cc66e75a1e4884f9f993a8a5df584184973d65effde17751dd063c2c08edb65bf8a3a3fbbb33595b2cdb614ad31fe0759
7
- data.tar.gz: 3476574566c7dcab2dc7113337107c6fe182860478890c31da0ff6588ccb4d7adcb068ff4dcc3c9d2e01c43d815db4240c7d50e36f4014c69834780db675747f
6
+ metadata.gz: ad845e5216be2e71e81205cade268d72b1be1ee2e5721a7877a5222a03a43c0f10d18abad8fc1c553165ade0665c7bba9ef5d5a79aa954e8c587ac0d027d6d89
7
+ data.tar.gz: 172c0d12205c778dadefe57010c5cfe6635d5398eea385048b8e41b3558a326cba4c332e37b9259abef4589d78f31c39bffb7b9c6adb416417b771cdb2b49272
data/README.md CHANGED
@@ -19,6 +19,7 @@ file_info.width_px #=> 320
19
19
  file_info.height_px #=> 240
20
20
  file_info.orientation #=> :top_left
21
21
  ```
22
+
22
23
  If nothing is detected, the result will be `nil`.
23
24
 
24
25
  ## Design rationale
data/lib/care.rb CHANGED
@@ -12,10 +12,18 @@ class Care
12
12
  @pos = 0
13
13
  end
14
14
 
15
+ def size
16
+ @io.size
17
+ end
18
+
15
19
  def seek(to)
16
20
  @pos = to
17
21
  end
18
22
 
23
+ def pos
24
+ @pos
25
+ end
26
+
19
27
  def read(n_bytes)
20
28
  read = @cache.byteslice(@io, @pos, n_bytes)
21
29
  return nil unless read && !read.empty?
data/lib/format_parser.rb CHANGED
@@ -5,6 +5,7 @@ module FormatParser
5
5
  require_relative 'io_utils'
6
6
  require_relative 'read_limiter'
7
7
  require_relative 'remote_io'
8
+ require_relative 'io_constraint'
8
9
  require_relative 'care'
9
10
 
10
11
  PARSER_MUX = Mutex.new
@@ -17,10 +18,21 @@ module FormatParser
17
18
  end
18
19
 
19
20
  def self.parse_http(url)
20
- parse(RemoteIO.new(url))
21
+ remote_io = RemoteIO.new(url)
22
+ cached_io = Care::IOWrapper.new(remote_io)
23
+
24
+ # Prefetch the first page, since it is very likely to be touched
25
+ # by all parsers anyway. Additionally, when using RemoteIO we need
26
+ # to explicitly obtain the size of the resource, which is only available
27
+ # after having performed at least one successful GET - at least on S3
28
+ cached_io.read(1); cached_io.seek(0)
29
+
30
+ parse(cached_io)
21
31
  end
22
32
 
23
33
  def self.parse(io)
34
+ # If the cache is preconfigured do not apply an extra layer. It is going
35
+ # to be preconfigured when using parse_http.
24
36
  io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
25
37
 
26
38
  # Always instantiate parsers fresh for each input, since they might
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.1.1'
2
+ VERSION = '0.1.2'
3
3
  end
@@ -0,0 +1,37 @@
1
+ # We deliberately want to document and restrict the
2
+ # number of methods an IO-ish object has to implement
3
+ # to be usable with all our parsers. This subset is fairly
4
+ # thin and well defined, and all the various IO limiters
5
+ # and cache facilities in the library are guaranteed to
6
+ # support those methods. This wrapper is used to guarantee
7
+ # that the parser can only call those specific methods and
8
+ # nothing more. Consequently, if the parser uses a gem that
9
+ # for some reason needs additional IO methods to be available
10
+ # this parser has to provide its own extensions to that end.
11
+ #
12
+ # The rationale for including a method in this subset is as follows:
13
+ # we include a method if other methods can be implemented on top of it.
14
+ # For example, should some parser desire `IO#readbyte`, it can be
15
+ # implemented in terms of a `read()`. Idem for things like `IO#eof?`,
16
+ # `IO#rewind` and friends.
17
+ class FormatParser::IOConstraint
18
+ def initialize(io)
19
+ @io = io
20
+ end
21
+
22
+ def read(n_bytes)
23
+ @io.read(n_bytes)
24
+ end
25
+
26
+ def seek(absolute_offset)
27
+ @io.seek(absolute_offset)
28
+ end
29
+
30
+ def size
31
+ @io.size
32
+ end
33
+
34
+ def pos
35
+ @io.pos
36
+ end
37
+ end
@@ -19,7 +19,7 @@ class FormatParser::AIFFParser
19
19
  ]
20
20
 
21
21
  def information_from_io(io)
22
- io.seek(0)
22
+ io = FormatParser::IOConstraint.new(io)
23
23
  form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
24
24
  return unless form_chunk_type == "FORM" && chunk_size > 4
25
25
 
@@ -125,6 +125,7 @@ class FormatParser::DPXParser
125
125
  HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
126
126
 
127
127
  def information_from_io(io)
128
+ io = FormatParser::IOConstraint.new(io)
128
129
  magic = io.read(4)
129
130
 
130
131
  return nil unless [BE_MAGIC, LE_MAGIC].include?(magic)
@@ -1,9 +1,24 @@
1
1
  require 'exifr/jpeg'
2
2
  require 'exifr/tiff'
3
+ require 'delegate'
3
4
 
4
5
  class FormatParser::EXIFParser
5
6
  include FormatParser::IOUtils
6
7
 
8
+ # EXIFR kindly requests the presence of getbyte and readbyte
9
+ # IO methods, which our constrained IO subset does not provide natively
10
+ class IOExt < SimpleDelegator
11
+ def readbyte
12
+ if byte = read(1)
13
+ byte.unpack('C').first
14
+ else
15
+ nil
16
+ end
17
+ end
18
+
19
+ alias_method :getbyte, :readbyte
20
+ end
21
+
7
22
  # Squash exifr's invalid date warning since we do not use that data.
8
23
  logger = Logger.new(nil)
9
24
  EXIFR.logger = logger
@@ -21,9 +36,9 @@ class FormatParser::EXIFParser
21
36
  :left_bottom
22
37
  ]
23
38
 
24
- def initialize(filetype, file_data)
39
+ def initialize(filetype, file_io)
25
40
  @filetype = filetype
26
- @file_data = file_data
41
+ @file_io = IOExt.new(file_io)
27
42
  @exif_data = nil
28
43
  @orientation = nil
29
44
  @height = nil
@@ -31,11 +46,10 @@ class FormatParser::EXIFParser
31
46
  end
32
47
 
33
48
  def scan_image_exif
34
-
35
49
  # Without the magic bytes EXIFR throws an error
36
- @file_data.seek(0)
37
- raw_exif_data = EXIFR::JPEG.new(@file_data) if @filetype == :jpeg
38
- raw_exif_data = EXIFR::TIFF.new(@file_data) if @filetype == :tiff
50
+ @file_io.seek(0)
51
+ raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
52
+ raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
39
53
  # For things that we don't yet have a parser for
40
54
  # we make the raw exif result available
41
55
  @exif_data = raw_exif_data
@@ -5,6 +5,8 @@ class FormatParser::GIFParser
5
5
  include FormatParser::IOUtils
6
6
 
7
7
  def information_from_io(io)
8
+ io = FormatParser::IOConstraint.new(io)
9
+
8
10
  header = safe_read(io, 6)
9
11
  return unless HEADERS.include?(header)
10
12
 
@@ -11,7 +11,7 @@ class FormatParser::JPEGParser
11
11
  APP1_MARKER = 0xE1 # maybe EXIF
12
12
 
13
13
  def information_from_io(io)
14
- @buf = io
14
+ @buf = FormatParser::IOConstraint.new(io)
15
15
  @width = nil
16
16
  @height = nil
17
17
  @orientation = nil
@@ -19,8 +19,9 @@ class FormatParser::PNGParser
19
19
  safe_read(io, 8).unpack("Na4")
20
20
  end
21
21
 
22
-
23
22
  def information_from_io(io)
23
+ io = FormatParser::IOConstraint.new(io)
24
+
24
25
  magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
25
26
  return unless magic_bytes == PNG_HEADER_BYTES
26
27
 
@@ -3,6 +3,8 @@ class FormatParser::PSDParser
3
3
  include FormatParser::IOUtils
4
4
 
5
5
  def information_from_io(io)
6
+ io = FormatParser::IOConstraint.new(io)
7
+
6
8
  magic_bytes = safe_read(io, 4).unpack("C4")
7
9
 
8
10
  return unless magic_bytes == PSD_HEADER
@@ -7,6 +7,8 @@ class FormatParser::TIFFParser
7
7
  include FormatParser::IOUtils
8
8
 
9
9
  def information_from_io(io)
10
+ io = FormatParser::IOConstraint.new(io)
11
+
10
12
  magic_bytes = safe_read(io, 4).unpack("C4")
11
13
  endianness = scan_tiff_endianness(magic_bytes)
12
14
  return unless endianness
data/lib/read_limiter.rb CHANGED
@@ -14,6 +14,14 @@ class FormatParser::ReadLimiter
14
14
  @bytes = 0
15
15
  end
16
16
 
17
+ def size
18
+ @io.size
19
+ end
20
+
21
+ def pos
22
+ @io.pos
23
+ end
24
+
17
25
  def seek(to_offset)
18
26
  @seeks += 1
19
27
  if @max_seeks && @seeks > @max_seeks
data/lib/remote_io.rb CHANGED
@@ -24,11 +24,16 @@ class FormatParser::RemoteIO
24
24
  0 # always return 0
25
25
  end
26
26
 
27
+ # Emulates IO#pos
28
+ def pos
29
+ @pos
30
+ end
31
+
27
32
  # Emulates IO#size.
28
33
  #
29
- # @return [Fixnum] the size of the remote resource
34
+ # @return [Integer] the size of the remote resource
30
35
  def size
31
- raise "Remote size not yet obtained, need to perform at least one read() to get it" unless @remote_size
36
+ raise "Remote size not yet obtained, need to perform at least one read() to retrieve it" unless @remote_size
32
37
  @remote_size
33
38
  end
34
39
 
@@ -42,9 +47,14 @@ class FormatParser::RemoteIO
42
47
  # @return [String] the read bytes
43
48
  def read(n_bytes)
44
49
  http_range = (@pos..(@pos + n_bytes - 1))
45
- @remote_size, body = request_range(http_range)
46
- body.force_encoding(Encoding::BINARY) if body
47
- body
50
+ maybe_size, maybe_body = request_range(http_range)
51
+ if maybe_size && maybe_body
52
+ @remote_size = maybe_size
53
+ @pos += maybe_body.bytesize
54
+ maybe_body.force_encoding(Encoding::ASCII_8BIT)
55
+ else
56
+ nil
57
+ end
48
58
  end
49
59
 
50
60
  protected
@@ -76,10 +86,11 @@ class FormatParser::RemoteIO
76
86
  # to be 206
77
87
  return [size, response.body]
78
88
  when 416
79
- # We return `nil` as the body if we tried to read past the end of the IO,
89
+ # We return `nil` if we tried to read past the end of the IO,
80
90
  # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
81
- # S3 will also handily _not_ supply us with the Content-Range of the actual resource
82
- return [nil, nil]
91
+ # S3 will also handily _not_ supply us with the Content-Range of the actual resource, so we
92
+ # cannot hint size with this response - at least not when working with S3
93
+ return nil
83
94
  when 500..599
84
95
  raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
85
96
  else
data/spec/care_spec.rb CHANGED
@@ -46,6 +46,8 @@ describe Care do
46
46
  end
47
47
 
48
48
  describe Care::IOWrapper do
49
+ it_behaves_like 'an IO object compatible with IOConstraint'
50
+
49
51
  it 'forwards calls to read() to the Care and adjusts internal offsets' do
50
52
  fake_cache_class = Class.new do
51
53
  attr_reader :recorded_calls
@@ -73,5 +75,18 @@ describe Care do
73
75
  expect(second).to eq([io_double, 2, 3])
74
76
  expect(third).to eq([io_double, 11, 5])
75
77
  end
78
+
79
+ it 'implements the complete subset of IOConstraint' do
80
+ methods_not_covered = Set.new(FormatParser::IOConstraint.public_instance_methods) - Set.new(Care::IOWrapper.public_instance_methods)
81
+ expect(methods_not_covered).to be_empty
82
+ end
83
+
84
+ it 'forwards calls to size() to the underlying IO' do
85
+ io_double = double('IO')
86
+ expect(io_double).to receive(:size).and_return(123)
87
+
88
+ subject = Care::IOWrapper.new(io_double)
89
+ expect(subject.size).to eq(123)
90
+ end
76
91
  end
77
92
  end
@@ -1,14 +1,23 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe "ReadLimiter" do
3
+ describe FormatParser::ReadLimiter do
4
4
  let(:io) { StringIO.new(Random.new.bytes(1024)) }
5
5
 
6
+ it_behaves_like 'an IO object compatible with IOConstraint'
7
+
6
8
  it 'does not enforce any limits with default arguments' do
7
9
  reader = FormatParser::ReadLimiter.new(io)
8
10
  2048.times { reader.seek(1) }
9
11
  2048.times { reader.read(4) }
10
12
  end
11
13
 
14
+ it 'passes #pos to the delegate' do
15
+ reader = FormatParser::ReadLimiter.new(io)
16
+ expect(reader.pos).to eq(0)
17
+ io.read(2)
18
+ expect(reader.pos).to eq(2)
19
+ end
20
+
12
21
  it 'enforces the number of seeks' do
13
22
  reader = FormatParser::ReadLimiter.new(io, max_seeks: 4)
14
23
  4.times { reader.seek(1) }
@@ -32,4 +41,5 @@ describe "ReadLimiter" do
32
41
  reader.read(1)
33
42
  }.to raise_error(/bytes budget \(512\) exceeded/)
34
43
  end
44
+
35
45
  end
@@ -25,6 +25,37 @@ describe 'Fetching data from HTTP remotes' do
25
25
  expect(file_information.file_nature).to eq(:image)
26
26
  end
27
27
 
28
+ it 'parses the JPEGs exif data' do
29
+ file_information = FormatParser.parse_http('http://localhost:9399/exif-orientation-testimages/jpg/top_left.jpg')
30
+ expect(file_information).not_to be_nil
31
+ expect(file_information.file_nature).to eq(:image)
32
+ expect(file_information.file_type).to eq(:jpg)
33
+ expect(file_information.orientation).to eq(:top_left)
34
+ end
35
+
36
+ it 'parses the TIFFs exif data' do
37
+ file_information = FormatParser.parse_http('http://localhost:9399/TIFF/test.tif')
38
+ expect(file_information).not_to be_nil
39
+ expect(file_information.file_nature).to eq(:image)
40
+ expect(file_information.file_type).to eq(:tif)
41
+ expect(file_information.orientation).to eq(:top_left)
42
+ end
43
+
44
+ describe 'is able to correctly parse orientation for all remote JPEG EXIF examples from FastImage' do
45
+ Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
46
+ filename = File.basename(jpeg_path)
47
+ it "is able to parse #{filename}" do
48
+ remote_jpeg_path = jpeg_path.gsub(fixtures_dir, "http://localhost:9399")
49
+ file_information = FormatParser.parse_http(remote_jpeg_path)
50
+ expect(file_information).not_to be_nil
51
+
52
+ expect(file_information.orientation).to be_kind_of(Symbol)
53
+ # Filenames in this dir correspond with the orientation of the file
54
+ expect(filename.include?(file_information.orientation.to_s)).to be true
55
+ end
56
+ end
57
+ end
58
+
28
59
  after(:all) do
29
60
  @server.stop
30
61
  @server_thread.join(0.5)
@@ -2,6 +2,8 @@ require 'spec_helper'
2
2
 
3
3
  describe FormatParser::RemoteIO do
4
4
 
5
+ it_behaves_like 'an IO object compatible with IOConstraint'
6
+
5
7
  it 'returns the partial content when the server supplies a 206 status' do
6
8
  rio = described_class.new("https://images.invalid/img.jpg")
7
9
 
@@ -44,6 +46,24 @@ describe FormatParser::RemoteIO do
44
46
  expect(rio.read(100)).to be_nil
45
47
  end
46
48
 
49
+ it 'does not overwrite size when the range cannot be satisfied and the response is 416' do
50
+ rio = described_class.new("https://images.invalid/img.jpg")
51
+
52
+ fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
53
+ expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
54
+ rio.read(1)
55
+
56
+ expect(rio.size).to eq(13)
57
+
58
+ fake_resp = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
59
+ expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=100-199").and_return(fake_resp)
60
+
61
+ rio.seek(100)
62
+ expect(rio.read(100)).to be_nil
63
+
64
+ expect(rio.size).to eq(13)
65
+ end
66
+
47
67
  it 'raises a specific error for all 5xx responses' do
48
68
  rio = described_class.new("https://images.invalid/img.jpg")
49
69
 
@@ -53,4 +73,16 @@ describe FormatParser::RemoteIO do
53
73
  rio.seek(100)
54
74
  expect { rio.read(100) }.to raise_error(/replied with a 502 and we might want to retry/)
55
75
  end
76
+
77
+ it 'maintains and exposes #pos' do
78
+ rio = described_class.new("https://images.invalid/img.jpg")
79
+
80
+ expect(rio.pos).to eq(0)
81
+
82
+ fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
83
+ expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
84
+ rio.read(1)
85
+
86
+ expect(rio.pos).to eq(1)
87
+ end
56
88
  end
data/spec/spec_helper.rb CHANGED
@@ -20,3 +20,12 @@ RSpec.configure do |c|
20
20
  c.include SpecHelpers
21
21
  c.extend SpecHelpers # makes fixtures_dir available for example groups too
22
22
  end
23
+
24
+ RSpec.shared_examples "an IO object compatible with IOConstraint" do
25
+ it 'responds to the same subset of public instance methods' do
26
+ requisite_methods = FormatParser::IOConstraint.public_instance_methods - Object.public_instance_methods
27
+ requisite_methods.each do |requisite|
28
+ expect(described_class.public_instance_methods).to include(requisite), "#{described_class} must respond to #{requisite}"
29
+ end
30
+ end
31
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-01-03 00:00:00.000000000 Z
12
+ date: 2018-01-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -134,6 +134,7 @@ files:
134
134
  - lib/file_information.rb
135
135
  - lib/format_parser.rb
136
136
  - lib/format_parser/version.rb
137
+ - lib/io_constraint.rb
137
138
  - lib/io_utils.rb
138
139
  - lib/parsers/aiff_parser.rb
139
140
  - lib/parsers/dpx_parser.rb