format_parser 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7a00aab34ab0fd5faeec5fa6069ab81b2d066b14
4
- data.tar.gz: 51a4617a89c10524a88879cdabe34dd4ccdb24d3
3
+ metadata.gz: 99c08f5e2482e524cf7b6c54eb4de371bbf1a107
4
+ data.tar.gz: 3c33f9336c6ec7479b03102f8dbc4ab8610a6420
5
5
  SHA512:
6
- metadata.gz: f9865f5245c785756a9ce84a89e2d17f2dbcf15a2172c3399d5112eb2d2f5ad613e233d641b31b47bce04d1d992a25a48d7e495b6025965e6599480c280dc9f6
7
- data.tar.gz: 563cc7d7b3c8e6011ecc1d0dff1c245183c8d3da8a04c4669030ab009e6762183689f708f67b8b7ece4d7e2fe793f53771cf2efbcc1d186f473393bd4d76caac
6
+ metadata.gz: b9892a12a6007744c70baa01200d432b40868d6f8d854b506dea7349ef84282133f49ab244a9d383b7772221d29dc950ae07c604b57247784f42aae74a03474b
7
+ data.tar.gz: 643b274aa45ef8a11dde7e178d03036488767ba56f4e4680edf647ebb5098aff52c08bcecc9578c73bd6835aa235ac31358032806e0c28c5c113d8edad9734e1
@@ -39,5 +39,5 @@ Gem::Specification.new do |spec|
39
39
  spec.add_development_dependency 'simplecov', '~> 0.15'
40
40
  spec.add_development_dependency 'pry', '~> 0.11'
41
41
  spec.add_development_dependency 'yard', '~> 0.9'
42
- spec.add_development_dependency 'wetransfer_style', '0.4.0'
42
+ spec.add_development_dependency 'wetransfer_style', '0.5.0'
43
43
  end
data/lib/format_parser.rb CHANGED
@@ -13,9 +13,11 @@ module FormatParser
13
13
  require_relative 'care'
14
14
 
15
15
  PARSER_MUX = Mutex.new
16
- MAX_BYTES = 512 * 1024
17
- MAX_READS = 64 * 1024
18
- MAX_SEEKS = 64 * 1024
16
+
17
+ MAX_BYTES_READ_PER_PARSER = 512 * 1024
18
+ MAX_READS_PER_PARSER = 64 * 1024
19
+ MAX_SEEKS_PER_PARSER = 64 * 1024
20
+ MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
19
21
 
20
22
  def self.register_parser(callable_or_responding_to_new, formats:, natures:)
21
23
  parser_provided_formats = Array(formats)
@@ -46,24 +48,17 @@ module FormatParser
46
48
  end
47
49
 
48
50
  def self.parse_http(url, **kwargs)
49
- remote_io = RemoteIO.new(url)
50
- cached_io = Care::IOWrapper.new(remote_io)
51
-
52
- # Prefetch the first page, since it is very likely to be touched
53
- # by all parsers anyway. Additionally, when using RemoteIO we need
54
- # to explicitly obtain the size of the resource, which is only available
55
- # after having performed at least one successful GET - at least on S3
56
- cached_io.read(1)
57
- cached_io.seek(0)
58
-
59
- parse(cached_io, **kwargs)
51
+ parse(RemoteIO.new(url), **kwargs)
60
52
  end
61
53
 
62
54
  # Return all by default
63
55
  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
64
- # If the cache is preconfigured do not apply an extra layer. It is going
65
- # to be preconfigured when using parse_http.
66
- io = Care::IOWrapper.new(io) unless io.is_a?(Care::IOWrapper)
56
+ # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
57
+ # of page faults (page cache misses) a parser may incur
58
+ read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
59
+
60
+ # Then configure a layer of caching on top of that
61
+ cached_io = Care::IOWrapper.new(read_limiter_under_cache)
67
62
 
68
63
  # How many results has the user asked for? Used to determine whether an array
69
64
  # is returned or not.
@@ -83,9 +78,15 @@ module FormatParser
83
78
 
84
79
  results = parsers.lazy.map do |parser|
85
80
  # We need to rewind for each parser, anew
86
- io.seek(0)
81
+ cached_io.seek(0)
82
+
83
+ # ...and we have to reset the cache page fault limit, each parser is allowed to cause N page faults
84
+ # - i.e. this is not a shared limit
85
+ read_limiter_under_cache.reset_limits!
86
+
87
87
  # Limit how many operations the parser can perform
88
- limited_io = ReadLimiter.new(io, max_bytes: MAX_BYTES, max_reads: MAX_READS, max_seeks: MAX_SEEKS)
88
+ limited_io = ReadLimiter.new(cached_io, max_bytes: MAX_BYTES_READ_PER_PARSER, max_reads: MAX_READS_PER_PARSER, max_seeks: MAX_SEEKS_PER_PARSER)
89
+
89
90
  begin
90
91
  parser.call(limited_io)
91
92
  rescue IOUtils::InvalidRead
@@ -95,6 +96,8 @@ module FormatParser
95
96
  # The parser tried to read too much - most likely the file structure
96
97
  # caused the parser to go off-track. Strictly speaking we should log this
97
98
  # and examine the file more closely.
99
+ # Or the parser caused too many cache pages to be fetched, which likely means we should not allow
100
+ # it to continue
98
101
  end
99
102
  end.reject(&:nil?).take(amount)
100
103
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.3.3'
2
+ VERSION = '0.3.4'
3
3
  end
data/lib/io_constraint.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  # We deliberately want to document and restrict the
3
2
  # number of methods an IO-ish object has to implement
4
3
  # to be usable with all our parsers. This subset is fairly
@@ -5,8 +5,8 @@ require 'delegate'
5
5
  class FormatParser::EXIFParser
6
6
  include FormatParser::IOUtils
7
7
 
8
- # EXIFR kindly requests the presence of getbyte and readbyte
9
- # IO methods, which our constrained IO subset does not provide natively
8
+ # EXIFR kindly requests the presence of a few more methods than what our IOConstraint
9
+ # is willing to provide, but they can be derived from the available ones
10
10
  class IOExt < SimpleDelegator
11
11
  def readbyte
12
12
  if byte = read(1)
@@ -14,6 +14,20 @@ class FormatParser::EXIFParser
14
14
  end
15
15
  end
16
16
 
17
+ def seek(n, seek_mode = IO::SEEK_SET)
18
+ io = __getobj__
19
+ case seek_mode
20
+ when IO::SEEK_SET
21
+ io.seek(n)
22
+ when IO::SEEK_CUR
23
+ io.seek(io.pos + n)
24
+ when IO::SEEK_END
25
+ io.seek(io.size + n)
26
+ else
27
+ raise Errno::EINVAL
28
+ end
29
+ end
30
+
17
31
  alias_method :getbyte, :readbyte
18
32
  end
19
33
 
data/lib/read_limiter.rb CHANGED
@@ -45,4 +45,10 @@ class FormatParser::ReadLimiter
45
45
 
46
46
  @io.read(n)
47
47
  end
48
+
49
+ def reset_limits!
50
+ @seeks = 0
51
+ @reads = 0
52
+ @bytes = 0
53
+ end
48
54
  end
@@ -21,6 +21,26 @@ describe FormatParser do
21
21
  end
22
22
  end
23
23
 
24
+ it 'fails gracefully when a parser module reads more and more causing page faults and prevents too many reads on the source' do
25
+ exploit = ->(io) {
26
+ loop {
27
+ skip = 16 * 1024
28
+ io.read(1)
29
+ io.seek(io.pos + skip)
30
+ }
31
+ }
32
+ FormatParser.register_parser exploit, natures: :document, formats: :exploit
33
+
34
+ sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 4))
35
+
36
+ expect(sample_io).to receive(:read).at_most(4).times.and_call_original
37
+
38
+ result = FormatParser.parse(sample_io, formats: [:exploit])
39
+ expect(result).to be_nil
40
+
41
+ FormatParser.deregister_parser(exploit)
42
+ end
43
+
24
44
  describe 'multiple values return' do
25
45
  let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
26
46
  let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
@@ -30,4 +30,30 @@ describe FormatParser::EXIFParser do
30
30
  end
31
31
  end
32
32
  end
33
+
34
+ describe 'IOExt' do
35
+ it 'supports readbyte' do
36
+ io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
37
+ expect(io.readbyte).to eq(104)
38
+ end
39
+
40
+ it 'supports getbyte' do
41
+ io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
42
+ expect(io.getbyte).to eq(104)
43
+ end
44
+
45
+ it 'supports seek modes' do
46
+ io = FormatParser::EXIFParser::IOExt.new(StringIO.new('hello'))
47
+ io.seek(1, IO::SEEK_SET)
48
+
49
+ io.seek(1, IO::SEEK_CUR)
50
+ expect(io.read(1)).to eq('l')
51
+
52
+ io.seek(-1, IO::SEEK_END)
53
+ expect(io.read(1)).to eq('o')
54
+
55
+ io.seek(1)
56
+ expect(io.read(1)).to eq('e')
57
+ end
58
+ end
33
59
  end
@@ -41,4 +41,14 @@ describe FormatParser::ReadLimiter do
41
41
  reader.read(1)
42
42
  }.to raise_error(/bytes budget \(512\) exceeded/)
43
43
  end
44
+
45
+ it 'can be reset!' do
46
+ reader = FormatParser::ReadLimiter.new(io, max_bytes: 512)
47
+ reader.read(512)
48
+ expect {
49
+ reader.read(1)
50
+ }.to raise_error(/budget/)
51
+ reader.reset_limits!
52
+ reader.read(1)
53
+ end
44
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-02-27 00:00:00.000000000 Z
12
+ date: 2018-03-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks
@@ -129,14 +129,14 @@ dependencies:
129
129
  requirements:
130
130
  - - '='
131
131
  - !ruby/object:Gem::Version
132
- version: 0.4.0
132
+ version: 0.5.0
133
133
  type: :development
134
134
  prerelease: false
135
135
  version_requirements: !ruby/object:Gem::Requirement
136
136
  requirements:
137
137
  - - '='
138
138
  - !ruby/object:Gem::Version
139
- version: 0.4.0
139
+ version: 0.5.0
140
140
  description: |-
141
141
  A Ruby library for prying open files you can convert to a previewable format, such as video, image and audio files. It includes
142
142
  a number of parser modules that try to recover metadata useful for post-processing and layout while reading the absolute
@@ -230,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
230
230
  version: '0'
231
231
  requirements: []
232
232
  rubyforge_project:
233
- rubygems_version: 2.5.2
233
+ rubygems_version: 2.6.11
234
234
  signing_key:
235
235
  specification_version: 4
236
236
  summary: A library for efficient parsing of file metadata