format_parser 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 893aa7a0c9ab6b3f13930caffd0c455ef341c91f
4
- data.tar.gz: e63bb62ac26c49e1958ab62eb9410058084d7d44
3
+ metadata.gz: 205c4099f44c0080b53e210ea18cebd4484476ba
4
+ data.tar.gz: d8145f8f77be44dab386585679a8c7f3ce48869d
5
5
  SHA512:
6
- metadata.gz: 1ba899600ad478696d35478a57838d2cc66e75a1e4884f9f993a8a5df584184973d65effde17751dd063c2c08edb65bf8a3a3fbbb33595b2cdb614ad31fe0759
7
- data.tar.gz: 3476574566c7dcab2dc7113337107c6fe182860478890c31da0ff6588ccb4d7adcb068ff4dcc3c9d2e01c43d815db4240c7d50e36f4014c69834780db675747f
6
+ metadata.gz: ad845e5216be2e71e81205cade268d72b1be1ee2e5721a7877a5222a03a43c0f10d18abad8fc1c553165ade0665c7bba9ef5d5a79aa954e8c587ac0d027d6d89
7
+ data.tar.gz: 172c0d12205c778dadefe57010c5cfe6635d5398eea385048b8e41b3558a326cba4c332e37b9259abef4589d78f31c39bffb7b9c6adb416417b771cdb2b49272
data/README.md CHANGED
@@ -19,6 +19,7 @@ file_info.width_px #=> 320
19
19
  file_info.height_px #=> 240
20
20
  file_info.orientation #=> :top_left
21
21
  ```
22
+
22
23
  If nothing is detected, the result will be `nil`.
23
24
 
24
25
  ## Design rationale
data/lib/care.rb CHANGED
@@ -12,10 +12,18 @@ class Care
12
12
  @pos = 0
13
13
  end
14
14
 
15
+ def size
16
+ @io.size
17
+ end
18
+
15
19
  def seek(to)
16
20
  @pos = to
17
21
  end
18
22
 
23
+ def pos
24
+ @pos
25
+ end
26
+
19
27
  def read(n_bytes)
20
28
  read = @cache.byteslice(@io, @pos, n_bytes)
21
29
  return nil unless read && !read.empty?
data/lib/format_parser.rb CHANGED
@@ -5,6 +5,7 @@ module FormatParser
5
5
  require_relative 'io_utils'
6
6
  require_relative 'read_limiter'
7
7
  require_relative 'remote_io'
8
+ require_relative 'io_constraint'
8
9
  require_relative 'care'
9
10
 
10
11
  PARSER_MUX = Mutex.new
@@ -17,10 +18,21 @@ module FormatParser
17
18
  end
18
19
 
19
20
  def self.parse_http(url)
20
- parse(RemoteIO.new(url))
21
+ remote_io = RemoteIO.new(url)
22
+ cached_io = Care::IOWrapper.new(remote_io)
23
+
24
+ # Prefetch the first page, since it is very likely to be touched
25
+ # by all parsers anyway. Additionally, when using RemoteIO we need
26
+ # to explicitly obtain the size of the resource, which is only available
27
+ # after having performed at least one successful GET - at least on S3
28
+ cached_io.read(1); cached_io.seek(0)
29
+
30
+ parse(cached_io)
21
31
  end
22
32
 
23
33
  def self.parse(io)
34
+ # If the cache is preconfigured do not apply an extra layer. It is going
35
+ # to be preconfigured when using parse_http.
24
36
  io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
25
37
 
26
38
  # Always instantiate parsers fresh for each input, since they might
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.1.1'
2
+ VERSION = '0.1.2'
3
3
  end
@@ -0,0 +1,37 @@
1
+ # We deliberately want to document and restrict the
2
+ # number of methods an IO-ish object has to implement
3
+ # to be usable with all our parsers. This subset is fairly
4
+ # thin and well defined, and all the various IO limiters
5
+ # and cache facilities in the library are guaranteed to
6
+ # support those methods. This wrapper is used to guarantee
7
+ # that the parser can only call those specific methods and
8
+ # nothing more. Consequently, if the parser uses a gem that
9
+ # for some reason needs additional IO methods to be available
10
+ # this parser has to provide its own extensions to that end.
11
+ #
12
+ # The rationale for including a method in this subset is as follows:
13
+ # we include a method if other methods can be implemented on top of it.
14
+ # For example, should some parser desire `IO#readbyte`, it can be
15
+ # implemented in terms of a `read()`. Idem for things like `IO#eof?`,
16
+ # `IO#rewind` and friends.
17
+ class FormatParser::IOConstraint
18
+ def initialize(io)
19
+ @io = io
20
+ end
21
+
22
+ def read(n_bytes)
23
+ @io.read(n_bytes)
24
+ end
25
+
26
+ def seek(absolute_offset)
27
+ @io.seek(absolute_offset)
28
+ end
29
+
30
+ def size
31
+ @io.size
32
+ end
33
+
34
+ def pos
35
+ @io.pos
36
+ end
37
+ end
@@ -19,7 +19,7 @@ class FormatParser::AIFFParser
19
19
  ]
20
20
 
21
21
  def information_from_io(io)
22
- io.seek(0)
22
+ io = FormatParser::IOConstraint.new(io)
23
23
  form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
24
24
  return unless form_chunk_type == "FORM" && chunk_size > 4
25
25
 
@@ -125,6 +125,7 @@ class FormatParser::DPXParser
125
125
  HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
126
126
 
127
127
  def information_from_io(io)
128
+ io = FormatParser::IOConstraint.new(io)
128
129
  magic = io.read(4)
129
130
 
130
131
  return nil unless [BE_MAGIC, LE_MAGIC].include?(magic)
@@ -1,9 +1,24 @@
1
1
  require 'exifr/jpeg'
2
2
  require 'exifr/tiff'
3
+ require 'delegate'
3
4
 
4
5
  class FormatParser::EXIFParser
5
6
  include FormatParser::IOUtils
6
7
 
8
+ # EXIFR kindly requests the presence of getbyte and readbyte
9
+ # IO methods, which our constrained IO subset does not provide natively
10
+ class IOExt < SimpleDelegator
11
+ def readbyte
12
+ if byte = read(1)
13
+ byte.unpack('C').first
14
+ else
15
+ nil
16
+ end
17
+ end
18
+
19
+ alias_method :getbyte, :readbyte
20
+ end
21
+
7
22
  # Squash exifr's invalid date warning since we do not use that data.
8
23
  logger = Logger.new(nil)
9
24
  EXIFR.logger = logger
@@ -21,9 +36,9 @@ class FormatParser::EXIFParser
21
36
  :left_bottom
22
37
  ]
23
38
 
24
- def initialize(filetype, file_data)
39
+ def initialize(filetype, file_io)
25
40
  @filetype = filetype
26
- @file_data = file_data
41
+ @file_io = IOExt.new(file_io)
27
42
  @exif_data = nil
28
43
  @orientation = nil
29
44
  @height = nil
@@ -31,11 +46,10 @@ class FormatParser::EXIFParser
31
46
  end
32
47
 
33
48
  def scan_image_exif
34
-
35
49
  # Without the magic bytes EXIFR throws an error
36
- @file_data.seek(0)
37
- raw_exif_data = EXIFR::JPEG.new(@file_data) if @filetype == :jpeg
38
- raw_exif_data = EXIFR::TIFF.new(@file_data) if @filetype == :tiff
50
+ @file_io.seek(0)
51
+ raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
52
+ raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
39
53
  # For things that we don't yet have a parser for
40
54
  # we make the raw exif result available
41
55
  @exif_data = raw_exif_data
@@ -5,6 +5,8 @@ class FormatParser::GIFParser
5
5
  include FormatParser::IOUtils
6
6
 
7
7
  def information_from_io(io)
8
+ io = FormatParser::IOConstraint.new(io)
9
+
8
10
  header = safe_read(io, 6)
9
11
  return unless HEADERS.include?(header)
10
12
 
@@ -11,7 +11,7 @@ class FormatParser::JPEGParser
11
11
  APP1_MARKER = 0xE1 # maybe EXIF
12
12
 
13
13
  def information_from_io(io)
14
- @buf = io
14
+ @buf = FormatParser::IOConstraint.new(io)
15
15
  @width = nil
16
16
  @height = nil
17
17
  @orientation = nil
@@ -19,8 +19,9 @@ class FormatParser::PNGParser
19
19
  safe_read(io, 8).unpack("Na4")
20
20
  end
21
21
 
22
-
23
22
  def information_from_io(io)
23
+ io = FormatParser::IOConstraint.new(io)
24
+
24
25
  magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
25
26
  return unless magic_bytes == PNG_HEADER_BYTES
26
27
 
@@ -3,6 +3,8 @@ class FormatParser::PSDParser
3
3
  include FormatParser::IOUtils
4
4
 
5
5
  def information_from_io(io)
6
+ io = FormatParser::IOConstraint.new(io)
7
+
6
8
  magic_bytes = safe_read(io, 4).unpack("C4")
7
9
 
8
10
  return unless magic_bytes == PSD_HEADER
@@ -7,6 +7,8 @@ class FormatParser::TIFFParser
7
7
  include FormatParser::IOUtils
8
8
 
9
9
  def information_from_io(io)
10
+ io = FormatParser::IOConstraint.new(io)
11
+
10
12
  magic_bytes = safe_read(io, 4).unpack("C4")
11
13
  endianness = scan_tiff_endianness(magic_bytes)
12
14
  return unless endianness
data/lib/read_limiter.rb CHANGED
@@ -14,6 +14,14 @@ class FormatParser::ReadLimiter
14
14
  @bytes = 0
15
15
  end
16
16
 
17
+ def size
18
+ @io.size
19
+ end
20
+
21
+ def pos
22
+ @io.pos
23
+ end
24
+
17
25
  def seek(to_offset)
18
26
  @seeks += 1
19
27
  if @max_seeks && @seeks > @max_seeks
data/lib/remote_io.rb CHANGED
@@ -24,11 +24,16 @@ class FormatParser::RemoteIO
24
24
  0 # always return 0
25
25
  end
26
26
 
27
+ # Emulates IO#pos
28
+ def pos
29
+ @pos
30
+ end
31
+
27
32
  # Emulates IO#size.
28
33
  #
29
- # @return [Fixnum] the size of the remote resource
34
+ # @return [Integer] the size of the remote resource
30
35
  def size
31
- raise "Remote size not yet obtained, need to perform at least one read() to get it" unless @remote_size
36
+ raise "Remote size not yet obtained, need to perform at least one read() to retrieve it" unless @remote_size
32
37
  @remote_size
33
38
  end
34
39
 
@@ -42,9 +47,14 @@ class FormatParser::RemoteIO
42
47
  # @return [String] the read bytes
43
48
  def read(n_bytes)
44
49
  http_range = (@pos..(@pos + n_bytes - 1))
45
- @remote_size, body = request_range(http_range)
46
- body.force_encoding(Encoding::BINARY) if body
47
- body
50
+ maybe_size, maybe_body = request_range(http_range)
51
+ if maybe_size && maybe_body
52
+ @remote_size = maybe_size
53
+ @pos += maybe_body.bytesize
54
+ maybe_body.force_encoding(Encoding::ASCII_8BIT)
55
+ else
56
+ nil
57
+ end
48
58
  end
49
59
 
50
60
  protected
@@ -76,10 +86,11 @@ class FormatParser::RemoteIO
76
86
  # to be 206
77
87
  return [size, response.body]
78
88
  when 416
79
- # We return `nil` as the body if we tried to read past the end of the IO,
89
+ # We return `nil` if we tried to read past the end of the IO,
80
90
  # which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
81
- # S3 will also handily _not_ supply us with the Content-Range of the actual resource
82
- return [nil, nil]
91
+ # S3 will also handily _not_ supply us with the Content-Range of the actual resource, so we
92
+ # cannot hint size with this response - at least not when working with S3
93
+ return nil
83
94
  when 500..599
84
95
  raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
85
96
  else
data/spec/care_spec.rb CHANGED
@@ -46,6 +46,8 @@ describe Care do
46
46
  end
47
47
 
48
48
  describe Care::IOWrapper do
49
+ it_behaves_like 'an IO object compatible with IOConstraint'
50
+
49
51
  it 'forwards calls to read() to the Care and adjusts internal offsets' do
50
52
  fake_cache_class = Class.new do
51
53
  attr_reader :recorded_calls
@@ -73,5 +75,18 @@ describe Care do
73
75
  expect(second).to eq([io_double, 2, 3])
74
76
  expect(third).to eq([io_double, 11, 5])
75
77
  end
78
+
79
+ it 'implements the complete subset of IOConstraint' do
80
+ methods_not_covered = Set.new(FormatParser::IOConstraint.public_instance_methods) - Set.new(Care::IOWrapper.public_instance_methods)
81
+ expect(methods_not_covered).to be_empty
82
+ end
83
+
84
+ it 'forwards calls to size() to the underlying IO' do
85
+ io_double = double('IO')
86
+ expect(io_double).to receive(:size).and_return(123)
87
+
88
+ subject = Care::IOWrapper.new(io_double)
89
+ expect(subject.size).to eq(123)
90
+ end
76
91
  end
77
92
  end
@@ -1,14 +1,23 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe "ReadLimiter" do
3
+ describe FormatParser::ReadLimiter do
4
4
  let(:io) { StringIO.new(Random.new.bytes(1024)) }
5
5
 
6
+ it_behaves_like 'an IO object compatible with IOConstraint'
7
+
6
8
  it 'does not enforce any limits with default arguments' do
7
9
  reader = FormatParser::ReadLimiter.new(io)
8
10
  2048.times { reader.seek(1) }
9
11
  2048.times { reader.read(4) }
10
12
  end
11
13
 
14
+ it 'passes #pos to the delegate' do
15
+ reader = FormatParser::ReadLimiter.new(io)
16
+ expect(reader.pos).to eq(0)
17
+ io.read(2)
18
+ expect(reader.pos).to eq(2)
19
+ end
20
+
12
21
  it 'enforces the number of seeks' do
13
22
  reader = FormatParser::ReadLimiter.new(io, max_seeks: 4)
14
23
  4.times { reader.seek(1) }
@@ -32,4 +41,5 @@ describe "ReadLimiter" do
32
41
  reader.read(1)
33
42
  }.to raise_error(/bytes budget \(512\) exceeded/)
34
43
  end
44
+
35
45
  end
@@ -25,6 +25,37 @@ describe 'Fetching data from HTTP remotes' do
25
25
  expect(file_information.file_nature).to eq(:image)
26
26
  end
27
27
 
28
+ it 'parses the JPEGs exif data' do
29
+ file_information = FormatParser.parse_http('http://localhost:9399/exif-orientation-testimages/jpg/top_left.jpg')
30
+ expect(file_information).not_to be_nil
31
+ expect(file_information.file_nature).to eq(:image)
32
+ expect(file_information.file_type).to eq(:jpg)
33
+ expect(file_information.orientation).to eq(:top_left)
34
+ end
35
+
36
+ it 'parses the TIFFs exif data' do
37
+ file_information = FormatParser.parse_http('http://localhost:9399/TIFF/test.tif')
38
+ expect(file_information).not_to be_nil
39
+ expect(file_information.file_nature).to eq(:image)
40
+ expect(file_information.file_type).to eq(:tif)
41
+ expect(file_information.orientation).to eq(:top_left)
42
+ end
43
+
44
+ describe 'is able to correctly parse orientation for all remote JPEG EXIF examples from FastImage' do
45
+ Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
46
+ filename = File.basename(jpeg_path)
47
+ it "is able to parse #{filename}" do
48
+ remote_jpeg_path = jpeg_path.gsub(fixtures_dir, "http://localhost:9399")
49
+ file_information = FormatParser.parse_http(remote_jpeg_path)
50
+ expect(file_information).not_to be_nil
51
+
52
+ expect(file_information.orientation).to be_kind_of(Symbol)
53
+ # Filenames in this dir correspond with the orientation of the file
54
+ expect(filename.include?(file_information.orientation.to_s)).to be true
55
+ end
56
+ end
57
+ end
58
+
28
59
  after(:all) do
29
60
  @server.stop
30
61
  @server_thread.join(0.5)
@@ -2,6 +2,8 @@ require 'spec_helper'
2
2
 
3
3
  describe FormatParser::RemoteIO do
4
4
 
5
+ it_behaves_like 'an IO object compatible with IOConstraint'
6
+
5
7
  it 'returns the partial content when the server supplies a 206 status' do
6
8
  rio = described_class.new("https://images.invalid/img.jpg")
7
9
 
@@ -44,6 +46,24 @@ describe FormatParser::RemoteIO do
44
46
  expect(rio.read(100)).to be_nil
45
47
  end
46
48
 
49
+ it 'does not overwrite size when the range cannot be satisfied and the response is 416' do
50
+ rio = described_class.new("https://images.invalid/img.jpg")
51
+
52
+ fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
53
+ expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
54
+ rio.read(1)
55
+
56
+ expect(rio.size).to eq(13)
57
+
58
+ fake_resp = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
59
+ expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=100-199").and_return(fake_resp)
60
+
61
+ rio.seek(100)
62
+ expect(rio.read(100)).to be_nil
63
+
64
+ expect(rio.size).to eq(13)
65
+ end
66
+
47
67
  it 'raises a specific error for all 5xx responses' do
48
68
  rio = described_class.new("https://images.invalid/img.jpg")
49
69
 
@@ -53,4 +73,16 @@ describe FormatParser::RemoteIO do
53
73
  rio.seek(100)
54
74
  expect { rio.read(100) }.to raise_error(/replied with a 502 and we might want to retry/)
55
75
  end
76
+
77
+ it 'maintains and exposes #pos' do
78
+ rio = described_class.new("https://images.invalid/img.jpg")
79
+
80
+ expect(rio.pos).to eq(0)
81
+
82
+ fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
83
+ expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
84
+ rio.read(1)
85
+
86
+ expect(rio.pos).to eq(1)
87
+ end
56
88
  end
data/spec/spec_helper.rb CHANGED
@@ -20,3 +20,12 @@ RSpec.configure do |c|
20
20
  c.include SpecHelpers
21
21
  c.extend SpecHelpers # makes fixtures_dir available for example groups too
22
22
  end
23
+
24
+ RSpec.shared_examples "an IO object compatible with IOConstraint" do
25
+ it 'responds to the same subset of public instance methods' do
26
+ requisite_methods = FormatParser::IOConstraint.public_instance_methods - Object.public_instance_methods
27
+ requisite_methods.each do |requisite|
28
+ expect(described_class.public_instance_methods).to include(requisite), "#{described_class} must respond to #{requisite}"
29
+ end
30
+ end
31
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-01-03 00:00:00.000000000 Z
12
+ date: 2018-01-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -134,6 +134,7 @@ files:
134
134
  - lib/file_information.rb
135
135
  - lib/format_parser.rb
136
136
  - lib/format_parser/version.rb
137
+ - lib/io_constraint.rb
137
138
  - lib/io_utils.rb
138
139
  - lib/parsers/aiff_parser.rb
139
140
  - lib/parsers/dpx_parser.rb