format_parser 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/lib/care.rb +8 -0
- data/lib/format_parser.rb +13 -1
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_constraint.rb +37 -0
- data/lib/parsers/aiff_parser.rb +1 -1
- data/lib/parsers/dpx_parser.rb +1 -0
- data/lib/parsers/exif_parser.rb +20 -6
- data/lib/parsers/gif_parser.rb +2 -0
- data/lib/parsers/jpeg_parser.rb +1 -1
- data/lib/parsers/png_parser.rb +2 -1
- data/lib/parsers/psd_parser.rb +2 -0
- data/lib/parsers/tiff_parser.rb +2 -0
- data/lib/read_limiter.rb +8 -0
- data/lib/remote_io.rb +19 -8
- data/spec/care_spec.rb +15 -0
- data/spec/read_limiter_spec.rb +11 -1
- data/spec/remote_fetching_spec.rb +31 -0
- data/spec/remote_io_spec.rb +32 -0
- data/spec/spec_helper.rb +9 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 205c4099f44c0080b53e210ea18cebd4484476ba
|
4
|
+
data.tar.gz: d8145f8f77be44dab386585679a8c7f3ce48869d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad845e5216be2e71e81205cade268d72b1be1ee2e5721a7877a5222a03a43c0f10d18abad8fc1c553165ade0665c7bba9ef5d5a79aa954e8c587ac0d027d6d89
|
7
|
+
data.tar.gz: 172c0d12205c778dadefe57010c5cfe6635d5398eea385048b8e41b3558a326cba4c332e37b9259abef4589d78f31c39bffb7b9c6adb416417b771cdb2b49272
|
data/README.md
CHANGED
data/lib/care.rb
CHANGED
@@ -12,10 +12,18 @@ class Care
|
|
12
12
|
@pos = 0
|
13
13
|
end
|
14
14
|
|
15
|
+
def size
|
16
|
+
@io.size
|
17
|
+
end
|
18
|
+
|
15
19
|
def seek(to)
|
16
20
|
@pos = to
|
17
21
|
end
|
18
22
|
|
23
|
+
def pos
|
24
|
+
@pos
|
25
|
+
end
|
26
|
+
|
19
27
|
def read(n_bytes)
|
20
28
|
read = @cache.byteslice(@io, @pos, n_bytes)
|
21
29
|
return nil unless read && !read.empty?
|
data/lib/format_parser.rb
CHANGED
@@ -5,6 +5,7 @@ module FormatParser
|
|
5
5
|
require_relative 'io_utils'
|
6
6
|
require_relative 'read_limiter'
|
7
7
|
require_relative 'remote_io'
|
8
|
+
require_relative 'io_constraint'
|
8
9
|
require_relative 'care'
|
9
10
|
|
10
11
|
PARSER_MUX = Mutex.new
|
@@ -17,10 +18,21 @@ module FormatParser
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def self.parse_http(url)
|
20
|
-
|
21
|
+
remote_io = RemoteIO.new(url)
|
22
|
+
cached_io = Care::IOWrapper.new(remote_io)
|
23
|
+
|
24
|
+
# Prefetch the first page, since it is very likely to be touched
|
25
|
+
# by all parsers anyway. Additionally, when using RemoteIO we need
|
26
|
+
# to explicitly obtain the size of the resource, which is only available
|
27
|
+
# after having performed at least one successful GET - at least on S3
|
28
|
+
cached_io.read(1); cached_io.seek(0)
|
29
|
+
|
30
|
+
parse(cached_io)
|
21
31
|
end
|
22
32
|
|
23
33
|
def self.parse(io)
|
34
|
+
# If the cache is preconfigured do not apply an extra layer. It is going
|
35
|
+
# to be preconfigured when using parse_http.
|
24
36
|
io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
|
25
37
|
|
26
38
|
# Always instantiate parsers fresh for each input, since they might
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# We deliberately want to document and restrict the
|
2
|
+
# number of methods an IO-ish object has to implement
|
3
|
+
# to be usable with all our parsers. This subset is fairly
|
4
|
+
# thin and well defined, and all the various IO limiters
|
5
|
+
# and cache facilities in the library are guaranteed to
|
6
|
+
# support those methods. This wrapper is used to guarantee
|
7
|
+
# that the parser can only call those specific methods and
|
8
|
+
# nothing more. Consequently, if the parser uses a gem that
|
9
|
+
# for some reason needs additional IO methods to be available
|
10
|
+
# this parser has to provide its own extensions to that end.
|
11
|
+
#
|
12
|
+
# The rationale for including a method in this subset is as follows:
|
13
|
+
# we include a method if other methods can be implemented on top of it.
|
14
|
+
# For example, should some parser desire `IO#readbyte`, it can be
|
15
|
+
# implemented in terms of a `read()`. Idem for things like `IO#eof?`,
|
16
|
+
# `IO#rewind` and friends.
|
17
|
+
class FormatParser::IOConstraint
|
18
|
+
def initialize(io)
|
19
|
+
@io = io
|
20
|
+
end
|
21
|
+
|
22
|
+
def read(n_bytes)
|
23
|
+
@io.read(n_bytes)
|
24
|
+
end
|
25
|
+
|
26
|
+
def seek(absolute_offset)
|
27
|
+
@io.seek(absolute_offset)
|
28
|
+
end
|
29
|
+
|
30
|
+
def size
|
31
|
+
@io.size
|
32
|
+
end
|
33
|
+
|
34
|
+
def pos
|
35
|
+
@io.pos
|
36
|
+
end
|
37
|
+
end
|
data/lib/parsers/aiff_parser.rb
CHANGED
data/lib/parsers/dpx_parser.rb
CHANGED
@@ -125,6 +125,7 @@ class FormatParser::DPXParser
|
|
125
125
|
HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
|
126
126
|
|
127
127
|
def information_from_io(io)
|
128
|
+
io = FormatParser::IOConstraint.new(io)
|
128
129
|
magic = io.read(4)
|
129
130
|
|
130
131
|
return nil unless [BE_MAGIC, LE_MAGIC].include?(magic)
|
data/lib/parsers/exif_parser.rb
CHANGED
@@ -1,9 +1,24 @@
|
|
1
1
|
require 'exifr/jpeg'
|
2
2
|
require 'exifr/tiff'
|
3
|
+
require 'delegate'
|
3
4
|
|
4
5
|
class FormatParser::EXIFParser
|
5
6
|
include FormatParser::IOUtils
|
6
7
|
|
8
|
+
# EXIFR kindly requests the presence of getbyte and readbyte
|
9
|
+
# IO methods, which our constrained IO subset does not provide natively
|
10
|
+
class IOExt < SimpleDelegator
|
11
|
+
def readbyte
|
12
|
+
if byte = read(1)
|
13
|
+
byte.unpack('C').first
|
14
|
+
else
|
15
|
+
nil
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
alias_method :getbyte, :readbyte
|
20
|
+
end
|
21
|
+
|
7
22
|
# Squash exifr's invalid date warning since we do not use that data.
|
8
23
|
logger = Logger.new(nil)
|
9
24
|
EXIFR.logger = logger
|
@@ -21,9 +36,9 @@ class FormatParser::EXIFParser
|
|
21
36
|
:left_bottom
|
22
37
|
]
|
23
38
|
|
24
|
-
def initialize(filetype,
|
39
|
+
def initialize(filetype, file_io)
|
25
40
|
@filetype = filetype
|
26
|
-
@
|
41
|
+
@file_io = IOExt.new(file_io)
|
27
42
|
@exif_data = nil
|
28
43
|
@orientation = nil
|
29
44
|
@height = nil
|
@@ -31,11 +46,10 @@ class FormatParser::EXIFParser
|
|
31
46
|
end
|
32
47
|
|
33
48
|
def scan_image_exif
|
34
|
-
|
35
49
|
# Without the magic bytes EXIFR throws an error
|
36
|
-
@
|
37
|
-
raw_exif_data = EXIFR::JPEG.new(@
|
38
|
-
raw_exif_data = EXIFR::TIFF.new(@
|
50
|
+
@file_io.seek(0)
|
51
|
+
raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
|
52
|
+
raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
|
39
53
|
# For things that we don't yet have a parser for
|
40
54
|
# we make the raw exif result available
|
41
55
|
@exif_data = raw_exif_data
|
data/lib/parsers/gif_parser.rb
CHANGED
data/lib/parsers/jpeg_parser.rb
CHANGED
data/lib/parsers/png_parser.rb
CHANGED
@@ -19,8 +19,9 @@ class FormatParser::PNGParser
|
|
19
19
|
safe_read(io, 8).unpack("Na4")
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
22
|
def information_from_io(io)
|
23
|
+
io = FormatParser::IOConstraint.new(io)
|
24
|
+
|
24
25
|
magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
|
25
26
|
return unless magic_bytes == PNG_HEADER_BYTES
|
26
27
|
|
data/lib/parsers/psd_parser.rb
CHANGED
data/lib/parsers/tiff_parser.rb
CHANGED
@@ -7,6 +7,8 @@ class FormatParser::TIFFParser
|
|
7
7
|
include FormatParser::IOUtils
|
8
8
|
|
9
9
|
def information_from_io(io)
|
10
|
+
io = FormatParser::IOConstraint.new(io)
|
11
|
+
|
10
12
|
magic_bytes = safe_read(io, 4).unpack("C4")
|
11
13
|
endianness = scan_tiff_endianness(magic_bytes)
|
12
14
|
return unless endianness
|
data/lib/read_limiter.rb
CHANGED
data/lib/remote_io.rb
CHANGED
@@ -24,11 +24,16 @@ class FormatParser::RemoteIO
|
|
24
24
|
0 # always return 0
|
25
25
|
end
|
26
26
|
|
27
|
+
# Emulates IO#pos
|
28
|
+
def pos
|
29
|
+
@pos
|
30
|
+
end
|
31
|
+
|
27
32
|
# Emulates IO#size.
|
28
33
|
#
|
29
|
-
# @return [
|
34
|
+
# @return [Integer] the size of the remote resource
|
30
35
|
def size
|
31
|
-
raise "Remote size not yet obtained, need to perform at least one read() to
|
36
|
+
raise "Remote size not yet obtained, need to perform at least one read() to retrieve it" unless @remote_size
|
32
37
|
@remote_size
|
33
38
|
end
|
34
39
|
|
@@ -42,9 +47,14 @@ class FormatParser::RemoteIO
|
|
42
47
|
# @return [String] the read bytes
|
43
48
|
def read(n_bytes)
|
44
49
|
http_range = (@pos..(@pos + n_bytes - 1))
|
45
|
-
|
46
|
-
|
47
|
-
|
50
|
+
maybe_size, maybe_body = request_range(http_range)
|
51
|
+
if maybe_size && maybe_body
|
52
|
+
@remote_size = maybe_size
|
53
|
+
@pos += maybe_body.bytesize
|
54
|
+
maybe_body.force_encoding(Encoding::ASCII_8BIT)
|
55
|
+
else
|
56
|
+
nil
|
57
|
+
end
|
48
58
|
end
|
49
59
|
|
50
60
|
protected
|
@@ -76,10 +86,11 @@ class FormatParser::RemoteIO
|
|
76
86
|
# to be 206
|
77
87
|
return [size, response.body]
|
78
88
|
when 416
|
79
|
-
# We return `nil`
|
89
|
+
# We return `nil` if we tried to read past the end of the IO,
|
80
90
|
# which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
|
81
|
-
# S3 will also handily _not_ supply us with the Content-Range of the actual resource
|
82
|
-
|
91
|
+
# S3 will also handily _not_ supply us with the Content-Range of the actual resource, so we
|
92
|
+
# cannot hint size with this response - at least not when working with S3
|
93
|
+
return nil
|
83
94
|
when 500..599
|
84
95
|
raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
|
85
96
|
else
|
data/spec/care_spec.rb
CHANGED
@@ -46,6 +46,8 @@ describe Care do
|
|
46
46
|
end
|
47
47
|
|
48
48
|
describe Care::IOWrapper do
|
49
|
+
it_behaves_like 'an IO object compatible with IOConstraint'
|
50
|
+
|
49
51
|
it 'forwards calls to read() to the Care and adjusts internal offsets' do
|
50
52
|
fake_cache_class = Class.new do
|
51
53
|
attr_reader :recorded_calls
|
@@ -73,5 +75,18 @@ describe Care do
|
|
73
75
|
expect(second).to eq([io_double, 2, 3])
|
74
76
|
expect(third).to eq([io_double, 11, 5])
|
75
77
|
end
|
78
|
+
|
79
|
+
it 'implements the complete subset of IOConstraint' do
|
80
|
+
methods_not_covered = Set.new(FormatParser::IOConstraint.public_instance_methods) - Set.new(Care::IOWrapper.public_instance_methods)
|
81
|
+
expect(methods_not_covered).to be_empty
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'forwards calls to size() to the underlying IO' do
|
85
|
+
io_double = double('IO')
|
86
|
+
expect(io_double).to receive(:size).and_return(123)
|
87
|
+
|
88
|
+
subject = Care::IOWrapper.new(io_double)
|
89
|
+
expect(subject.size).to eq(123)
|
90
|
+
end
|
76
91
|
end
|
77
92
|
end
|
data/spec/read_limiter_spec.rb
CHANGED
@@ -1,14 +1,23 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe
|
3
|
+
describe FormatParser::ReadLimiter do
|
4
4
|
let(:io) { StringIO.new(Random.new.bytes(1024)) }
|
5
5
|
|
6
|
+
it_behaves_like 'an IO object compatible with IOConstraint'
|
7
|
+
|
6
8
|
it 'does not enforce any limits with default arguments' do
|
7
9
|
reader = FormatParser::ReadLimiter.new(io)
|
8
10
|
2048.times { reader.seek(1) }
|
9
11
|
2048.times { reader.read(4) }
|
10
12
|
end
|
11
13
|
|
14
|
+
it 'passes #pos to the delegate' do
|
15
|
+
reader = FormatParser::ReadLimiter.new(io)
|
16
|
+
expect(reader.pos).to eq(0)
|
17
|
+
io.read(2)
|
18
|
+
expect(reader.pos).to eq(2)
|
19
|
+
end
|
20
|
+
|
12
21
|
it 'enforces the number of seeks' do
|
13
22
|
reader = FormatParser::ReadLimiter.new(io, max_seeks: 4)
|
14
23
|
4.times { reader.seek(1) }
|
@@ -32,4 +41,5 @@ describe "ReadLimiter" do
|
|
32
41
|
reader.read(1)
|
33
42
|
}.to raise_error(/bytes budget \(512\) exceeded/)
|
34
43
|
end
|
44
|
+
|
35
45
|
end
|
@@ -25,6 +25,37 @@ describe 'Fetching data from HTTP remotes' do
|
|
25
25
|
expect(file_information.file_nature).to eq(:image)
|
26
26
|
end
|
27
27
|
|
28
|
+
it 'parses the JPEGs exif data' do
|
29
|
+
file_information = FormatParser.parse_http('http://localhost:9399/exif-orientation-testimages/jpg/top_left.jpg')
|
30
|
+
expect(file_information).not_to be_nil
|
31
|
+
expect(file_information.file_nature).to eq(:image)
|
32
|
+
expect(file_information.file_type).to eq(:jpg)
|
33
|
+
expect(file_information.orientation).to eq(:top_left)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'parses the TIFFs exif data' do
|
37
|
+
file_information = FormatParser.parse_http('http://localhost:9399/TIFF/test.tif')
|
38
|
+
expect(file_information).not_to be_nil
|
39
|
+
expect(file_information.file_nature).to eq(:image)
|
40
|
+
expect(file_information.file_type).to eq(:tif)
|
41
|
+
expect(file_information.orientation).to eq(:top_left)
|
42
|
+
end
|
43
|
+
|
44
|
+
describe 'is able to correctly parse orientation for all remote JPEG EXIF examples from FastImage' do
|
45
|
+
Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
|
46
|
+
filename = File.basename(jpeg_path)
|
47
|
+
it "is able to parse #{filename}" do
|
48
|
+
remote_jpeg_path = jpeg_path.gsub(fixtures_dir, "http://localhost:9399")
|
49
|
+
file_information = FormatParser.parse_http(remote_jpeg_path)
|
50
|
+
expect(file_information).not_to be_nil
|
51
|
+
|
52
|
+
expect(file_information.orientation).to be_kind_of(Symbol)
|
53
|
+
# Filenames in this dir correspond with the orientation of the file
|
54
|
+
expect(filename.include?(file_information.orientation.to_s)).to be true
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
28
59
|
after(:all) do
|
29
60
|
@server.stop
|
30
61
|
@server_thread.join(0.5)
|
data/spec/remote_io_spec.rb
CHANGED
@@ -2,6 +2,8 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe FormatParser::RemoteIO do
|
4
4
|
|
5
|
+
it_behaves_like 'an IO object compatible with IOConstraint'
|
6
|
+
|
5
7
|
it 'returns the partial content when the server supplies a 206 status' do
|
6
8
|
rio = described_class.new("https://images.invalid/img.jpg")
|
7
9
|
|
@@ -44,6 +46,24 @@ describe FormatParser::RemoteIO do
|
|
44
46
|
expect(rio.read(100)).to be_nil
|
45
47
|
end
|
46
48
|
|
49
|
+
it 'does not overwrite size when the range cannot be satisfied and the response is 416' do
|
50
|
+
rio = described_class.new("https://images.invalid/img.jpg")
|
51
|
+
|
52
|
+
fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
|
53
|
+
expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
|
54
|
+
rio.read(1)
|
55
|
+
|
56
|
+
expect(rio.size).to eq(13)
|
57
|
+
|
58
|
+
fake_resp = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
|
59
|
+
expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=100-199").and_return(fake_resp)
|
60
|
+
|
61
|
+
rio.seek(100)
|
62
|
+
expect(rio.read(100)).to be_nil
|
63
|
+
|
64
|
+
expect(rio.size).to eq(13)
|
65
|
+
end
|
66
|
+
|
47
67
|
it 'raises a specific error for all 5xx responses' do
|
48
68
|
rio = described_class.new("https://images.invalid/img.jpg")
|
49
69
|
|
@@ -53,4 +73,16 @@ describe FormatParser::RemoteIO do
|
|
53
73
|
rio.seek(100)
|
54
74
|
expect { rio.read(100) }.to raise_error(/replied with a 502 and we might want to retry/)
|
55
75
|
end
|
76
|
+
|
77
|
+
it 'maintains and exposes #pos' do
|
78
|
+
rio = described_class.new("https://images.invalid/img.jpg")
|
79
|
+
|
80
|
+
expect(rio.pos).to eq(0)
|
81
|
+
|
82
|
+
fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
|
83
|
+
expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
|
84
|
+
rio.read(1)
|
85
|
+
|
86
|
+
expect(rio.pos).to eq(1)
|
87
|
+
end
|
56
88
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -20,3 +20,12 @@ RSpec.configure do |c|
|
|
20
20
|
c.include SpecHelpers
|
21
21
|
c.extend SpecHelpers # makes fixtures_dir available for example groups too
|
22
22
|
end
|
23
|
+
|
24
|
+
RSpec.shared_examples "an IO object compatible with IOConstraint" do
|
25
|
+
it 'responds to the same subset of public instance methods' do
|
26
|
+
requisite_methods = FormatParser::IOConstraint.public_instance_methods - Object.public_instance_methods
|
27
|
+
requisite_methods.each do |requisite|
|
28
|
+
expect(described_class.public_instance_methods).to include(requisite), "#{described_class} must respond to #{requisite}"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-01-
|
12
|
+
date: 2018-01-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: exifr
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- lib/file_information.rb
|
135
135
|
- lib/format_parser.rb
|
136
136
|
- lib/format_parser/version.rb
|
137
|
+
- lib/io_constraint.rb
|
137
138
|
- lib/io_utils.rb
|
138
139
|
- lib/parsers/aiff_parser.rb
|
139
140
|
- lib/parsers/dpx_parser.rb
|