format_parser 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/format_parser.gemspec +1 -1
- data/lib/format_parser.rb +22 -19
- data/lib/format_parser/version.rb +1 -1
- data/lib/io_constraint.rb +0 -1
- data/lib/parsers/exif_parser.rb +16 -2
- data/lib/read_limiter.rb +6 -0
- data/spec/format_parser_spec.rb +20 -0
- data/spec/parsers/exif_parser_spec.rb +26 -0
- data/spec/read_limiter_spec.rb +10 -0
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 99c08f5e2482e524cf7b6c54eb4de371bbf1a107
|
4
|
+
data.tar.gz: 3c33f9336c6ec7479b03102f8dbc4ab8610a6420
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9892a12a6007744c70baa01200d432b40868d6f8d854b506dea7349ef84282133f49ab244a9d383b7772221d29dc950ae07c604b57247784f42aae74a03474b
|
7
|
+
data.tar.gz: 643b274aa45ef8a11dde7e178d03036488767ba56f4e4680edf647ebb5098aff52c08bcecc9578c73bd6835aa235ac31358032806e0c28c5c113d8edad9734e1
|
data/format_parser.gemspec
CHANGED
@@ -39,5 +39,5 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_development_dependency 'simplecov', '~> 0.15'
|
40
40
|
spec.add_development_dependency 'pry', '~> 0.11'
|
41
41
|
spec.add_development_dependency 'yard', '~> 0.9'
|
42
|
-
spec.add_development_dependency 'wetransfer_style', '0.
|
42
|
+
spec.add_development_dependency 'wetransfer_style', '0.5.0'
|
43
43
|
end
|
data/lib/format_parser.rb
CHANGED
@@ -13,9 +13,11 @@ module FormatParser
|
|
13
13
|
require_relative 'care'
|
14
14
|
|
15
15
|
PARSER_MUX = Mutex.new
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
|
17
|
+
MAX_BYTES_READ_PER_PARSER = 512 * 1024
|
18
|
+
MAX_READS_PER_PARSER = 64 * 1024
|
19
|
+
MAX_SEEKS_PER_PARSER = 64 * 1024
|
20
|
+
MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
|
19
21
|
|
20
22
|
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
21
23
|
parser_provided_formats = Array(formats)
|
@@ -46,24 +48,17 @@ module FormatParser
|
|
46
48
|
end
|
47
49
|
|
48
50
|
def self.parse_http(url, **kwargs)
|
49
|
-
|
50
|
-
cached_io = Care::IOWrapper.new(remote_io)
|
51
|
-
|
52
|
-
# Prefetch the first page, since it is very likely to be touched
|
53
|
-
# by all parsers anyway. Additionally, when using RemoteIO we need
|
54
|
-
# to explicitly obtain the size of the resource, which is only available
|
55
|
-
# after having performed at least one successful GET - at least on S3
|
56
|
-
cached_io.read(1)
|
57
|
-
cached_io.seek(0)
|
58
|
-
|
59
|
-
parse(cached_io, **kwargs)
|
51
|
+
parse(RemoteIO.new(url), **kwargs)
|
60
52
|
end
|
61
53
|
|
62
54
|
# Return all by default
|
63
55
|
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
|
64
|
-
#
|
65
|
-
#
|
66
|
-
|
56
|
+
# Limit the number of cached _pages_ we may fetch. This allows us to limit the number
|
57
|
+
# of page faults (page cache misses) a parser may incur
|
58
|
+
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
|
59
|
+
|
60
|
+
# Then configure a layer of caching on top of that
|
61
|
+
cached_io = Care::IOWrapper.new(read_limiter_under_cache)
|
67
62
|
|
68
63
|
# How many results has the user asked for? Used to determine whether an array
|
69
64
|
# is returned or not.
|
@@ -83,9 +78,15 @@ module FormatParser
|
|
83
78
|
|
84
79
|
results = parsers.lazy.map do |parser|
|
85
80
|
# We need to rewind for each parser, anew
|
86
|
-
|
81
|
+
cached_io.seek(0)
|
82
|
+
|
83
|
+
# ...and we have to reset the cache page fault limit, each parser is allowed to cause N page faults
|
84
|
+
# - i.e. this is not a shared limit
|
85
|
+
read_limiter_under_cache.reset_limits!
|
86
|
+
|
87
87
|
# Limit how many operations the parser can perform
|
88
|
-
limited_io = ReadLimiter.new(
|
88
|
+
limited_io = ReadLimiter.new(cached_io, max_bytes: MAX_BYTES_READ_PER_PARSER, max_reads: MAX_READS_PER_PARSER, max_seeks: MAX_SEEKS_PER_PARSER)
|
89
|
+
|
89
90
|
begin
|
90
91
|
parser.call(limited_io)
|
91
92
|
rescue IOUtils::InvalidRead
|
@@ -95,6 +96,8 @@ module FormatParser
|
|
95
96
|
# The parser tried to read too much - most likely the file structure
|
96
97
|
# caused the parser to go off-track. Strictly speaking we should log this
|
97
98
|
# and examine the file more closely.
|
99
|
+
# Or the parser caused too many cache pages to be fetched, which likely means we should not allow
|
100
|
+
# it to continue
|
98
101
|
end
|
99
102
|
end.reject(&:nil?).take(amount)
|
100
103
|
|
data/lib/io_constraint.rb
CHANGED
data/lib/parsers/exif_parser.rb
CHANGED
@@ -5,8 +5,8 @@ require 'delegate'
|
|
5
5
|
class FormatParser::EXIFParser
|
6
6
|
include FormatParser::IOUtils
|
7
7
|
|
8
|
-
# EXIFR kindly requests the presence of
|
9
|
-
#
|
8
|
+
# EXIFR kindly requests the presence of a few more methods than what our IOConstraint
|
9
|
+
# is willing to provide, but they can be derived from the available ones
|
10
10
|
class IOExt < SimpleDelegator
|
11
11
|
def readbyte
|
12
12
|
if byte = read(1)
|
@@ -14,6 +14,20 @@ class FormatParser::EXIFParser
|
|
14
14
|
end
|
15
15
|
end
|
16
16
|
|
17
|
+
def seek(n, seek_mode = IO::SEEK_SET)
|
18
|
+
io = __getobj__
|
19
|
+
case seek_mode
|
20
|
+
when IO::SEEK_SET
|
21
|
+
io.seek(n)
|
22
|
+
when IO::SEEK_CUR
|
23
|
+
io.seek(io.pos + n)
|
24
|
+
when IO::SEEK_END
|
25
|
+
io.seek(io.size + n)
|
26
|
+
else
|
27
|
+
raise Errno::EINVAL
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
17
31
|
alias_method :getbyte, :readbyte
|
18
32
|
end
|
19
33
|
|
data/lib/read_limiter.rb
CHANGED
data/spec/format_parser_spec.rb
CHANGED
@@ -21,6 +21,26 @@ describe FormatParser do
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
+
it 'fails gracefully when a parser module reads more and more causing page faults and prevents too many reads on the source' do
|
25
|
+
exploit = ->(io) {
|
26
|
+
loop {
|
27
|
+
skip = 16 * 1024
|
28
|
+
io.read(1)
|
29
|
+
io.seek(io.pos + skip)
|
30
|
+
}
|
31
|
+
}
|
32
|
+
FormatParser.register_parser exploit, natures: :document, formats: :exploit
|
33
|
+
|
34
|
+
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 4))
|
35
|
+
|
36
|
+
expect(sample_io).to receive(:read).at_most(4).times.and_call_original
|
37
|
+
|
38
|
+
result = FormatParser.parse(sample_io, formats: [:exploit])
|
39
|
+
expect(result).to be_nil
|
40
|
+
|
41
|
+
FormatParser.deregister_parser(exploit)
|
42
|
+
end
|
43
|
+
|
24
44
|
describe 'multiple values return' do
|
25
45
|
let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
|
26
46
|
let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
|
@@ -30,4 +30,30 @@ describe FormatParser::EXIFParser do
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
end
|
33
|
+
|
34
|
+
describe 'IOExt' do
|
35
|
+
it 'supports readbyte' do
|
36
|
+
io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
|
37
|
+
expect(io.readbyte).to eq(104)
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'supports getbyte' do
|
41
|
+
io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
|
42
|
+
expect(io.getbyte).to eq(104)
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'supports seek modes' do
|
46
|
+
io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
|
47
|
+
io.seek(1, IO::SEEK_SET)
|
48
|
+
|
49
|
+
io.seek(1, IO::SEEK_CUR)
|
50
|
+
expect(io.read(1)).to eq('l')
|
51
|
+
|
52
|
+
io.seek(-1, IO::SEEK_END)
|
53
|
+
expect(io.read(1)).to eq('o')
|
54
|
+
|
55
|
+
io.seek(1)
|
56
|
+
expect(io.read(1)).to eq('e')
|
57
|
+
end
|
58
|
+
end
|
33
59
|
end
|
data/spec/read_limiter_spec.rb
CHANGED
@@ -41,4 +41,14 @@ describe FormatParser::ReadLimiter do
|
|
41
41
|
reader.read(1)
|
42
42
|
}.to raise_error(/bytes budget \(512\) exceeded/)
|
43
43
|
end
|
44
|
+
|
45
|
+
it 'can be reset!' do
|
46
|
+
reader = FormatParser::ReadLimiter.new(io, max_bytes: 512)
|
47
|
+
reader.read(512)
|
48
|
+
expect {
|
49
|
+
reader.read(1)
|
50
|
+
}.to raise_error(/budget/)
|
51
|
+
reader.reset_limits!
|
52
|
+
reader.read(1)
|
53
|
+
end
|
44
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-03-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|
@@ -129,14 +129,14 @@ dependencies:
|
|
129
129
|
requirements:
|
130
130
|
- - '='
|
131
131
|
- !ruby/object:Gem::Version
|
132
|
-
version: 0.
|
132
|
+
version: 0.5.0
|
133
133
|
type: :development
|
134
134
|
prerelease: false
|
135
135
|
version_requirements: !ruby/object:Gem::Requirement
|
136
136
|
requirements:
|
137
137
|
- - '='
|
138
138
|
- !ruby/object:Gem::Version
|
139
|
-
version: 0.
|
139
|
+
version: 0.5.0
|
140
140
|
description: |-
|
141
141
|
A Ruby library for prying open files you can convert to a previewable format, such as video, image and audio files. It includes
|
142
142
|
a number of parser modules that try to recover metadata useful for post-processing and layout while reading the absolute
|
@@ -230,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
230
230
|
version: '0'
|
231
231
|
requirements: []
|
232
232
|
rubyforge_project:
|
233
|
-
rubygems_version: 2.
|
233
|
+
rubygems_version: 2.6.11
|
234
234
|
signing_key:
|
235
235
|
specification_version: 4
|
236
236
|
summary: A library for efficient parsing of file metadata
|