format_parser 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/lib/care.rb +8 -0
- data/lib/format_parser.rb +13 -1
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_constraint.rb +37 -0
- data/lib/parsers/aiff_parser.rb +1 -1
- data/lib/parsers/dpx_parser.rb +1 -0
- data/lib/parsers/exif_parser.rb +20 -6
- data/lib/parsers/gif_parser.rb +2 -0
- data/lib/parsers/jpeg_parser.rb +1 -1
- data/lib/parsers/png_parser.rb +2 -1
- data/lib/parsers/psd_parser.rb +2 -0
- data/lib/parsers/tiff_parser.rb +2 -0
- data/lib/read_limiter.rb +8 -0
- data/lib/remote_io.rb +19 -8
- data/spec/care_spec.rb +15 -0
- data/spec/read_limiter_spec.rb +11 -1
- data/spec/remote_fetching_spec.rb +31 -0
- data/spec/remote_io_spec.rb +32 -0
- data/spec/spec_helper.rb +9 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 205c4099f44c0080b53e210ea18cebd4484476ba
|
4
|
+
data.tar.gz: d8145f8f77be44dab386585679a8c7f3ce48869d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad845e5216be2e71e81205cade268d72b1be1ee2e5721a7877a5222a03a43c0f10d18abad8fc1c553165ade0665c7bba9ef5d5a79aa954e8c587ac0d027d6d89
|
7
|
+
data.tar.gz: 172c0d12205c778dadefe57010c5cfe6635d5398eea385048b8e41b3558a326cba4c332e37b9259abef4589d78f31c39bffb7b9c6adb416417b771cdb2b49272
|
data/README.md
CHANGED
data/lib/care.rb
CHANGED
@@ -12,10 +12,18 @@ class Care
|
|
12
12
|
@pos = 0
|
13
13
|
end
|
14
14
|
|
15
|
+
def size
|
16
|
+
@io.size
|
17
|
+
end
|
18
|
+
|
15
19
|
def seek(to)
|
16
20
|
@pos = to
|
17
21
|
end
|
18
22
|
|
23
|
+
def pos
|
24
|
+
@pos
|
25
|
+
end
|
26
|
+
|
19
27
|
def read(n_bytes)
|
20
28
|
read = @cache.byteslice(@io, @pos, n_bytes)
|
21
29
|
return nil unless read && !read.empty?
|
data/lib/format_parser.rb
CHANGED
@@ -5,6 +5,7 @@ module FormatParser
|
|
5
5
|
require_relative 'io_utils'
|
6
6
|
require_relative 'read_limiter'
|
7
7
|
require_relative 'remote_io'
|
8
|
+
require_relative 'io_constraint'
|
8
9
|
require_relative 'care'
|
9
10
|
|
10
11
|
PARSER_MUX = Mutex.new
|
@@ -17,10 +18,21 @@ module FormatParser
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def self.parse_http(url)
|
20
|
-
|
21
|
+
remote_io = RemoteIO.new(url)
|
22
|
+
cached_io = Care::IOWrapper.new(remote_io)
|
23
|
+
|
24
|
+
# Prefetch the first page, since it is very likely to be touched
|
25
|
+
# by all parsers anyway. Additionally, when using RemoteIO we need
|
26
|
+
# to explicitly obtain the size of the resource, which is only available
|
27
|
+
# after having performed at least one successful GET - at least on S3
|
28
|
+
cached_io.read(1); cached_io.seek(0)
|
29
|
+
|
30
|
+
parse(cached_io)
|
21
31
|
end
|
22
32
|
|
23
33
|
def self.parse(io)
|
34
|
+
# If the cache is preconfigured do not apply an extra layer. It is going
|
35
|
+
# to be preconfigured when using parse_http.
|
24
36
|
io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
|
25
37
|
|
26
38
|
# Always instantiate parsers fresh for each input, since they might
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# We deliberately want to document and restrict the
|
2
|
+
# number of methods an IO-ish object has to implement
|
3
|
+
# to be usable with all our parsers. This subset is fairly
|
4
|
+
# thin and well defined, and all the various IO limiters
|
5
|
+
# and cache facilities in the library are guaranteed to
|
6
|
+
# support those methods. This wrapper is used to guarantee
|
7
|
+
# that the parser can only call those specific methods and
|
8
|
+
# nothing more. Consequently, if the parser uses a gem that
|
9
|
+
# for some reason needs additional IO methods to be available
|
10
|
+
# this parser has to provide its own extensions to that end.
|
11
|
+
#
|
12
|
+
# The rationale for including a method in this subset is as follows:
|
13
|
+
# we include a method if other methods can be implemented on top of it.
|
14
|
+
# For example, should some parser desire `IO#readbyte`, it can be
|
15
|
+
# implemented in terms of a `read()`. Idem for things like `IO#eof?`,
|
16
|
+
# `IO#rewind` and friends.
|
17
|
+
class FormatParser::IOConstraint
|
18
|
+
def initialize(io)
|
19
|
+
@io = io
|
20
|
+
end
|
21
|
+
|
22
|
+
def read(n_bytes)
|
23
|
+
@io.read(n_bytes)
|
24
|
+
end
|
25
|
+
|
26
|
+
def seek(absolute_offset)
|
27
|
+
@io.seek(absolute_offset)
|
28
|
+
end
|
29
|
+
|
30
|
+
def size
|
31
|
+
@io.size
|
32
|
+
end
|
33
|
+
|
34
|
+
def pos
|
35
|
+
@io.pos
|
36
|
+
end
|
37
|
+
end
|
data/lib/parsers/aiff_parser.rb
CHANGED
data/lib/parsers/dpx_parser.rb
CHANGED
@@ -125,6 +125,7 @@ class FormatParser::DPXParser
|
|
125
125
|
HEADER_SIZE = SIZEOF[DPX_INFO] # Does not include the initial 4 bytes
|
126
126
|
|
127
127
|
def information_from_io(io)
|
128
|
+
io = FormatParser::IOConstraint.new(io)
|
128
129
|
magic = io.read(4)
|
129
130
|
|
130
131
|
return nil unless [BE_MAGIC, LE_MAGIC].include?(magic)
|
data/lib/parsers/exif_parser.rb
CHANGED
@@ -1,9 +1,24 @@
|
|
1
1
|
require 'exifr/jpeg'
|
2
2
|
require 'exifr/tiff'
|
3
|
+
require 'delegate'
|
3
4
|
|
4
5
|
class FormatParser::EXIFParser
|
5
6
|
include FormatParser::IOUtils
|
6
7
|
|
8
|
+
# EXIFR kindly requests the presence of getbyte and readbyte
|
9
|
+
# IO methods, which our constrained IO subset does not provide natively
|
10
|
+
class IOExt < SimpleDelegator
|
11
|
+
def readbyte
|
12
|
+
if byte = read(1)
|
13
|
+
byte.unpack('C').first
|
14
|
+
else
|
15
|
+
nil
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
alias_method :getbyte, :readbyte
|
20
|
+
end
|
21
|
+
|
7
22
|
# Squash exifr's invalid date warning since we do not use that data.
|
8
23
|
logger = Logger.new(nil)
|
9
24
|
EXIFR.logger = logger
|
@@ -21,9 +36,9 @@ class FormatParser::EXIFParser
|
|
21
36
|
:left_bottom
|
22
37
|
]
|
23
38
|
|
24
|
-
def initialize(filetype,
|
39
|
+
def initialize(filetype, file_io)
|
25
40
|
@filetype = filetype
|
26
|
-
@
|
41
|
+
@file_io = IOExt.new(file_io)
|
27
42
|
@exif_data = nil
|
28
43
|
@orientation = nil
|
29
44
|
@height = nil
|
@@ -31,11 +46,10 @@ class FormatParser::EXIFParser
|
|
31
46
|
end
|
32
47
|
|
33
48
|
def scan_image_exif
|
34
|
-
|
35
49
|
# Without the magic bytes EXIFR throws an error
|
36
|
-
@
|
37
|
-
raw_exif_data = EXIFR::JPEG.new(@
|
38
|
-
raw_exif_data = EXIFR::TIFF.new(@
|
50
|
+
@file_io.seek(0)
|
51
|
+
raw_exif_data = EXIFR::JPEG.new(@file_io) if @filetype == :jpeg
|
52
|
+
raw_exif_data = EXIFR::TIFF.new(@file_io) if @filetype == :tiff
|
39
53
|
# For things that we don't yet have a parser for
|
40
54
|
# we make the raw exif result available
|
41
55
|
@exif_data = raw_exif_data
|
data/lib/parsers/gif_parser.rb
CHANGED
data/lib/parsers/jpeg_parser.rb
CHANGED
data/lib/parsers/png_parser.rb
CHANGED
@@ -19,8 +19,9 @@ class FormatParser::PNGParser
|
|
19
19
|
safe_read(io, 8).unpack("Na4")
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
22
|
def information_from_io(io)
|
23
|
+
io = FormatParser::IOConstraint.new(io)
|
24
|
+
|
24
25
|
magic_bytes = safe_read(io, PNG_HEADER_BYTES.bytesize)
|
25
26
|
return unless magic_bytes == PNG_HEADER_BYTES
|
26
27
|
|
data/lib/parsers/psd_parser.rb
CHANGED
data/lib/parsers/tiff_parser.rb
CHANGED
@@ -7,6 +7,8 @@ class FormatParser::TIFFParser
|
|
7
7
|
include FormatParser::IOUtils
|
8
8
|
|
9
9
|
def information_from_io(io)
|
10
|
+
io = FormatParser::IOConstraint.new(io)
|
11
|
+
|
10
12
|
magic_bytes = safe_read(io, 4).unpack("C4")
|
11
13
|
endianness = scan_tiff_endianness(magic_bytes)
|
12
14
|
return unless endianness
|
data/lib/read_limiter.rb
CHANGED
data/lib/remote_io.rb
CHANGED
@@ -24,11 +24,16 @@ class FormatParser::RemoteIO
|
|
24
24
|
0 # always return 0
|
25
25
|
end
|
26
26
|
|
27
|
+
# Emulates IO#pos
|
28
|
+
def pos
|
29
|
+
@pos
|
30
|
+
end
|
31
|
+
|
27
32
|
# Emulates IO#size.
|
28
33
|
#
|
29
|
-
# @return [
|
34
|
+
# @return [Integer] the size of the remote resource
|
30
35
|
def size
|
31
|
-
raise "Remote size not yet obtained, need to perform at least one read() to
|
36
|
+
raise "Remote size not yet obtained, need to perform at least one read() to retrieve it" unless @remote_size
|
32
37
|
@remote_size
|
33
38
|
end
|
34
39
|
|
@@ -42,9 +47,14 @@ class FormatParser::RemoteIO
|
|
42
47
|
# @return [String] the read bytes
|
43
48
|
def read(n_bytes)
|
44
49
|
http_range = (@pos..(@pos + n_bytes - 1))
|
45
|
-
|
46
|
-
|
47
|
-
|
50
|
+
maybe_size, maybe_body = request_range(http_range)
|
51
|
+
if maybe_size && maybe_body
|
52
|
+
@remote_size = maybe_size
|
53
|
+
@pos += maybe_body.bytesize
|
54
|
+
maybe_body.force_encoding(Encoding::ASCII_8BIT)
|
55
|
+
else
|
56
|
+
nil
|
57
|
+
end
|
48
58
|
end
|
49
59
|
|
50
60
|
protected
|
@@ -76,10 +86,11 @@ class FormatParser::RemoteIO
|
|
76
86
|
# to be 206
|
77
87
|
return [size, response.body]
|
78
88
|
when 416
|
79
|
-
# We return `nil`
|
89
|
+
# We return `nil` if we tried to read past the end of the IO,
|
80
90
|
# which satisfies the Ruby IO convention. The caller should deal with `nil` being the result of a read()
|
81
|
-
# S3 will also handily _not_ supply us with the Content-Range of the actual resource
|
82
|
-
|
91
|
+
# S3 will also handily _not_ supply us with the Content-Range of the actual resource, so we
|
92
|
+
# cannot hint size with this response - at least not when working with S3
|
93
|
+
return nil
|
83
94
|
when 500..599
|
84
95
|
raise IntermittentFailure, "Server at #{@uri} replied with a #{response.status} and we might want to retry"
|
85
96
|
else
|
data/spec/care_spec.rb
CHANGED
@@ -46,6 +46,8 @@ describe Care do
|
|
46
46
|
end
|
47
47
|
|
48
48
|
describe Care::IOWrapper do
|
49
|
+
it_behaves_like 'an IO object compatible with IOConstraint'
|
50
|
+
|
49
51
|
it 'forwards calls to read() to the Care and adjusts internal offsets' do
|
50
52
|
fake_cache_class = Class.new do
|
51
53
|
attr_reader :recorded_calls
|
@@ -73,5 +75,18 @@ describe Care do
|
|
73
75
|
expect(second).to eq([io_double, 2, 3])
|
74
76
|
expect(third).to eq([io_double, 11, 5])
|
75
77
|
end
|
78
|
+
|
79
|
+
it 'implements the complete subset of IOConstraint' do
|
80
|
+
methods_not_covered = Set.new(FormatParser::IOConstraint.public_instance_methods) - Set.new(Care::IOWrapper.public_instance_methods)
|
81
|
+
expect(methods_not_covered).to be_empty
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'forwards calls to size() to the underlying IO' do
|
85
|
+
io_double = double('IO')
|
86
|
+
expect(io_double).to receive(:size).and_return(123)
|
87
|
+
|
88
|
+
subject = Care::IOWrapper.new(io_double)
|
89
|
+
expect(subject.size).to eq(123)
|
90
|
+
end
|
76
91
|
end
|
77
92
|
end
|
data/spec/read_limiter_spec.rb
CHANGED
@@ -1,14 +1,23 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe
|
3
|
+
describe FormatParser::ReadLimiter do
|
4
4
|
let(:io) { StringIO.new(Random.new.bytes(1024)) }
|
5
5
|
|
6
|
+
it_behaves_like 'an IO object compatible with IOConstraint'
|
7
|
+
|
6
8
|
it 'does not enforce any limits with default arguments' do
|
7
9
|
reader = FormatParser::ReadLimiter.new(io)
|
8
10
|
2048.times { reader.seek(1) }
|
9
11
|
2048.times { reader.read(4) }
|
10
12
|
end
|
11
13
|
|
14
|
+
it 'passes #pos to the delegate' do
|
15
|
+
reader = FormatParser::ReadLimiter.new(io)
|
16
|
+
expect(reader.pos).to eq(0)
|
17
|
+
io.read(2)
|
18
|
+
expect(reader.pos).to eq(2)
|
19
|
+
end
|
20
|
+
|
12
21
|
it 'enforces the number of seeks' do
|
13
22
|
reader = FormatParser::ReadLimiter.new(io, max_seeks: 4)
|
14
23
|
4.times { reader.seek(1) }
|
@@ -32,4 +41,5 @@ describe "ReadLimiter" do
|
|
32
41
|
reader.read(1)
|
33
42
|
}.to raise_error(/bytes budget \(512\) exceeded/)
|
34
43
|
end
|
44
|
+
|
35
45
|
end
|
@@ -25,6 +25,37 @@ describe 'Fetching data from HTTP remotes' do
|
|
25
25
|
expect(file_information.file_nature).to eq(:image)
|
26
26
|
end
|
27
27
|
|
28
|
+
it 'parses the JPEGs exif data' do
|
29
|
+
file_information = FormatParser.parse_http('http://localhost:9399/exif-orientation-testimages/jpg/top_left.jpg')
|
30
|
+
expect(file_information).not_to be_nil
|
31
|
+
expect(file_information.file_nature).to eq(:image)
|
32
|
+
expect(file_information.file_type).to eq(:jpg)
|
33
|
+
expect(file_information.orientation).to eq(:top_left)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'parses the TIFFs exif data' do
|
37
|
+
file_information = FormatParser.parse_http('http://localhost:9399/TIFF/test.tif')
|
38
|
+
expect(file_information).not_to be_nil
|
39
|
+
expect(file_information.file_nature).to eq(:image)
|
40
|
+
expect(file_information.file_type).to eq(:tif)
|
41
|
+
expect(file_information.orientation).to eq(:top_left)
|
42
|
+
end
|
43
|
+
|
44
|
+
describe 'is able to correctly parse orientation for all remote JPEG EXIF examples from FastImage' do
|
45
|
+
Dir.glob(fixtures_dir + '/exif-orientation-testimages/jpg/*.jpg').each do |jpeg_path|
|
46
|
+
filename = File.basename(jpeg_path)
|
47
|
+
it "is able to parse #{filename}" do
|
48
|
+
remote_jpeg_path = jpeg_path.gsub(fixtures_dir, "http://localhost:9399")
|
49
|
+
file_information = FormatParser.parse_http(remote_jpeg_path)
|
50
|
+
expect(file_information).not_to be_nil
|
51
|
+
|
52
|
+
expect(file_information.orientation).to be_kind_of(Symbol)
|
53
|
+
# Filenames in this dir correspond with the orientation of the file
|
54
|
+
expect(filename.include?(file_information.orientation.to_s)).to be true
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
28
59
|
after(:all) do
|
29
60
|
@server.stop
|
30
61
|
@server_thread.join(0.5)
|
data/spec/remote_io_spec.rb
CHANGED
@@ -2,6 +2,8 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe FormatParser::RemoteIO do
|
4
4
|
|
5
|
+
it_behaves_like 'an IO object compatible with IOConstraint'
|
6
|
+
|
5
7
|
it 'returns the partial content when the server supplies a 206 status' do
|
6
8
|
rio = described_class.new("https://images.invalid/img.jpg")
|
7
9
|
|
@@ -44,6 +46,24 @@ describe FormatParser::RemoteIO do
|
|
44
46
|
expect(rio.read(100)).to be_nil
|
45
47
|
end
|
46
48
|
|
49
|
+
it 'does not overwrite size when the range cannot be satisfied and the response is 416' do
|
50
|
+
rio = described_class.new("https://images.invalid/img.jpg")
|
51
|
+
|
52
|
+
fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
|
53
|
+
expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
|
54
|
+
rio.read(1)
|
55
|
+
|
56
|
+
expect(rio.size).to eq(13)
|
57
|
+
|
58
|
+
fake_resp = double(headers: {}, status: 416, body: 'You jumped off the end of the file maam')
|
59
|
+
expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=100-199").and_return(fake_resp)
|
60
|
+
|
61
|
+
rio.seek(100)
|
62
|
+
expect(rio.read(100)).to be_nil
|
63
|
+
|
64
|
+
expect(rio.size).to eq(13)
|
65
|
+
end
|
66
|
+
|
47
67
|
it 'raises a specific error for all 5xx responses' do
|
48
68
|
rio = described_class.new("https://images.invalid/img.jpg")
|
49
69
|
|
@@ -53,4 +73,16 @@ describe FormatParser::RemoteIO do
|
|
53
73
|
rio.seek(100)
|
54
74
|
expect { rio.read(100) }.to raise_error(/replied with a 502 and we might want to retry/)
|
55
75
|
end
|
76
|
+
|
77
|
+
it 'maintains and exposes #pos' do
|
78
|
+
rio = described_class.new("https://images.invalid/img.jpg")
|
79
|
+
|
80
|
+
expect(rio.pos).to eq(0)
|
81
|
+
|
82
|
+
fake_resp = double(headers: {'Content-Range' => 'bytes 0-0/13'}, status: 206, body: 'a')
|
83
|
+
expect(Faraday).to receive(:get).with("https://images.invalid/img.jpg", nil, range: "bytes=0-0").and_return(fake_resp)
|
84
|
+
rio.read(1)
|
85
|
+
|
86
|
+
expect(rio.pos).to eq(1)
|
87
|
+
end
|
56
88
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -20,3 +20,12 @@ RSpec.configure do |c|
|
|
20
20
|
c.include SpecHelpers
|
21
21
|
c.extend SpecHelpers # makes fixtures_dir available for example groups too
|
22
22
|
end
|
23
|
+
|
24
|
+
RSpec.shared_examples "an IO object compatible with IOConstraint" do
|
25
|
+
it 'responds to the same subset of public instance methods' do
|
26
|
+
requisite_methods = FormatParser::IOConstraint.public_instance_methods - Object.public_instance_methods
|
27
|
+
requisite_methods.each do |requisite|
|
28
|
+
expect(described_class.public_instance_methods).to include(requisite), "#{described_class} must respond to #{requisite}"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-01-
|
12
|
+
date: 2018-01-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: exifr
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- lib/file_information.rb
|
135
135
|
- lib/format_parser.rb
|
136
136
|
- lib/format_parser/version.rb
|
137
|
+
- lib/io_constraint.rb
|
137
138
|
- lib/io_utils.rb
|
138
139
|
- lib/parsers/aiff_parser.rb
|
139
140
|
- lib/parsers/dpx_parser.rb
|