format_parser 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7a00aab34ab0fd5faeec5fa6069ab81b2d066b14
4
- data.tar.gz: 51a4617a89c10524a88879cdabe34dd4ccdb24d3
3
+ metadata.gz: 99c08f5e2482e524cf7b6c54eb4de371bbf1a107
4
+ data.tar.gz: 3c33f9336c6ec7479b03102f8dbc4ab8610a6420
5
5
  SHA512:
6
- metadata.gz: f9865f5245c785756a9ce84a89e2d17f2dbcf15a2172c3399d5112eb2d2f5ad613e233d641b31b47bce04d1d992a25a48d7e495b6025965e6599480c280dc9f6
7
- data.tar.gz: 563cc7d7b3c8e6011ecc1d0dff1c245183c8d3da8a04c4669030ab009e6762183689f708f67b8b7ece4d7e2fe793f53771cf2efbcc1d186f473393bd4d76caac
6
+ metadata.gz: b9892a12a6007744c70baa01200d432b40868d6f8d854b506dea7349ef84282133f49ab244a9d383b7772221d29dc950ae07c604b57247784f42aae74a03474b
7
+ data.tar.gz: 643b274aa45ef8a11dde7e178d03036488767ba56f4e4680edf647ebb5098aff52c08bcecc9578c73bd6835aa235ac31358032806e0c28c5c113d8edad9734e1
@@ -39,5 +39,5 @@ Gem::Specification.new do |spec|
39
39
  spec.add_development_dependency 'simplecov', '~> 0.15'
40
40
  spec.add_development_dependency 'pry', '~> 0.11'
41
41
  spec.add_development_dependency 'yard', '~> 0.9'
42
- spec.add_development_dependency 'wetransfer_style', '0.4.0'
42
+ spec.add_development_dependency 'wetransfer_style', '0.5.0'
43
43
  end
data/lib/format_parser.rb CHANGED
@@ -13,9 +13,11 @@ module FormatParser
13
13
  require_relative 'care'
14
14
 
15
15
  PARSER_MUX = Mutex.new
16
- MAX_BYTES = 512 * 1024
17
- MAX_READS = 64 * 1024
18
- MAX_SEEKS = 64 * 1024
16
+
17
+ MAX_BYTES_READ_PER_PARSER = 512 * 1024
18
+ MAX_READS_PER_PARSER = 64 * 1024
19
+ MAX_SEEKS_PER_PARSER = 64 * 1024
20
+ MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
19
21
 
20
22
  def self.register_parser(callable_or_responding_to_new, formats:, natures:)
21
23
  parser_provided_formats = Array(formats)
@@ -46,24 +48,17 @@ module FormatParser
46
48
  end
47
49
 
48
50
  def self.parse_http(url, **kwargs)
49
- remote_io = RemoteIO.new(url)
50
- cached_io = Care::IOWrapper.new(remote_io)
51
-
52
- # Prefetch the first page, since it is very likely to be touched
53
- # by all parsers anyway. Additionally, when using RemoteIO we need
54
- # to explicitly obtain the size of the resource, which is only available
55
- # after having performed at least one successful GET - at least on S3
56
- cached_io.read(1)
57
- cached_io.seek(0)
58
-
59
- parse(cached_io, **kwargs)
51
+ parse(RemoteIO.new(url), **kwargs)
60
52
  end
61
53
 
62
54
  # Return all by default
63
55
  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
64
- # If the cache is preconfigured do not apply an extra layer. It is going
65
- # to be preconfigured when using parse_http.
66
- io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
56
+ # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
57
+ # of page faults (page cache misses) a parser may incur
58
+ read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
59
+
60
+ # Then configure a layer of caching on top of that
61
+ cached_io = Care::IOWrapper.new(read_limiter_under_cache)
67
62
 
68
63
  # How many results has the user asked for? Used to determine whether an array
69
64
  # is returned or not.
@@ -83,9 +78,15 @@ module FormatParser
83
78
 
84
79
  results = parsers.lazy.map do |parser|
85
80
  # We need to rewind for each parser, anew
86
- io.seek(0)
81
+ cached_io.seek(0)
82
+
83
+ # ...and we have to reset the cache page fault limit: each parser is allowed to cause N page faults
84
+ # - i.e. this is not a shared limit
85
+ read_limiter_under_cache.reset_limits!
86
+
87
87
  # Limit how many operations the parser can perform
88
- limited_io = ReadLimiter.new(io, max_bytes: MAX_BYTES, max_reads: MAX_READS, max_seeks: MAX_SEEKS)
88
+ limited_io = ReadLimiter.new(cached_io, max_bytes: MAX_BYTES_READ_PER_PARSER, max_reads: MAX_READS_PER_PARSER, max_seeks: MAX_SEEKS_PER_PARSER)
89
+
89
90
  begin
90
91
  parser.call(limited_io)
91
92
  rescue IOUtils::InvalidRead
@@ -95,6 +96,8 @@ module FormatParser
95
96
  # The parser tried to read too much - most likely the file structure
96
97
  # caused the parser to go off-track. Strictly speaking we should log this
97
98
  # and examine the file more closely.
99
+ # Or the parser caused too many cache pages to be fetched, which likely means we should not allow
100
+ # it to continue
98
101
  end
99
102
  end.reject(&:nil?).take(amount)
100
103
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.3.3'
2
+ VERSION = '0.3.4'
3
3
  end
data/lib/io_constraint.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  # We deliberately want to document and restrict the
3
2
  # number of methods an IO-ish object has to implement
4
3
  # to be usable with all our parsers. This subset is fairly
@@ -5,8 +5,8 @@ require 'delegate'
5
5
  class FormatParser::EXIFParser
6
6
  include FormatParser::IOUtils
7
7
 
8
- # EXIFR kindly requests the presence of getbyte and readbyte
9
- # IO methods, which our constrained IO subset does not provide natively
8
+ # EXIFR kindly requests the presence of a few more methods than what our IOConstraint
9
+ # is willing to provide, but they can be derived from the available ones
10
10
  class IOExt < SimpleDelegator
11
11
  def readbyte
12
12
  if byte = read(1)
@@ -14,6 +14,20 @@ class FormatParser::EXIFParser
14
14
  end
15
15
  end
16
16
 
17
+ def seek(n, seek_mode = IO::SEEK_SET)
18
+ io = __getobj__
19
+ case seek_mode
20
+ when IO::SEEK_SET
21
+ io.seek(n)
22
+ when IO::SEEK_CUR
23
+ io.seek(io.pos + n)
24
+ when IO::SEEK_END
25
+ io.seek(io.size + n)
26
+ else
27
+ raise Errno::EINVAL
28
+ end
29
+ end
30
+
17
31
  alias_method :getbyte, :readbyte
18
32
  end
19
33
 
data/lib/read_limiter.rb CHANGED
@@ -45,4 +45,10 @@ class FormatParser::ReadLimiter
45
45
 
46
46
  @io.read(n)
47
47
  end
48
+
49
+ def reset_limits!
50
+ @seeks = 0
51
+ @reads = 0
52
+ @bytes = 0
53
+ end
48
54
  end
@@ -21,6 +21,26 @@ describe FormatParser do
21
21
  end
22
22
  end
23
23
 
24
+ it 'fails gracefully when a parser module reads more and more causing page faults and prevents too many reads on the source' do
25
+ exploit = ->(io) {
26
+ loop {
27
+ skip = 16 * 1024
28
+ io.read(1)
29
+ io.seek(io.pos + skip)
30
+ }
31
+ }
32
+ FormatParser.register_parser exploit, natures: :document, formats: :exploit
33
+
34
+ sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 4))
35
+
36
+ expect(sample_io).to receive(:read).at_most(4).times.and_call_original
37
+
38
+ result = FormatParser.parse(sample_io, formats: [:exploit])
39
+ expect(result).to be_nil
40
+
41
+ FormatParser.deregister_parser(exploit)
42
+ end
43
+
24
44
  describe 'multiple values return' do
25
45
  let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
26
46
  let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
@@ -30,4 +30,30 @@ describe FormatParser::EXIFParser do
30
30
  end
31
31
  end
32
32
  end
33
+
34
+ describe 'IOExt' do
35
+ it 'supports readbyte' do
36
+ io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
37
+ expect(io.readbyte).to eq(104)
38
+ end
39
+
40
+ it 'supports getbyte' do
41
+ io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
42
+ expect(io.getbyte).to eq(104)
43
+ end
44
+
45
+ it 'supports seek modes' do
46
+ io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
47
+ io.seek(1, IO::SEEK_SET)
48
+
49
+ io.seek(1, IO::SEEK_CUR)
50
+ expect(io.read(1)).to eq('l')
51
+
52
+ io.seek(-1, IO::SEEK_END)
53
+ expect(io.read(1)).to eq('o')
54
+
55
+ io.seek(1)
56
+ expect(io.read(1)).to eq('e')
57
+ end
58
+ end
33
59
  end
@@ -41,4 +41,14 @@ describe FormatParser::ReadLimiter do
41
41
  reader.read(1)
42
42
  }.to raise_error(/bytes budget \(512\) exceeded/)
43
43
  end
44
+
45
+ it 'can be reset!' do
46
+ reader = FormatParser::ReadLimiter.new(io, max_bytes: 512)
47
+ reader.read(512)
48
+ expect {
49
+ reader.read(1)
50
+ }.to raise_error(/budget/)
51
+ reader.reset_limits!
52
+ reader.read(1)
53
+ end
44
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-02-27 00:00:00.000000000 Z
12
+ date: 2018-03-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -129,14 +129,14 @@ dependencies:
129
129
  requirements:
130
130
  - - '='
131
131
  - !ruby/object:Gem::Version
132
- version: 0.4.0
132
+ version: 0.5.0
133
133
  type: :development
134
134
  prerelease: false
135
135
  version_requirements: !ruby/object:Gem::Requirement
136
136
  requirements:
137
137
  - - '='
138
138
  - !ruby/object:Gem::Version
139
- version: 0.4.0
139
+ version: 0.5.0
140
140
  description: |-
141
141
  A Ruby library for prying open files you can convert to a previewable format, such as video, image and audio files. It includes
142
142
  a number of parser modules that try to recover metadata useful for post-processing and layout while reading the absolute
@@ -230,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
230
230
  version: '0'
231
231
  requirements: []
232
232
  rubyforge_project:
233
- rubygems_version: 2.5.2
233
+ rubygems_version: 2.6.11
234
234
  signing_key:
235
235
  specification_version: 4
236
236
  summary: A library for efficient parsing of file metadata