format_parser 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fbc5d4476d94ccbeb0f50c3bd0406cb57d1fc5fa640afbbf8057d874ecc128a3
4
- data.tar.gz: fa4421fa509d0f0528beb4c4b19dc1a63cd66fbed29142ed33b9b2aeae9067c0
3
+ metadata.gz: ebde95ad53411a3d4ae4db175bab7b90d568476302df66842c83710f88ad2e15
4
+ data.tar.gz: cfb0e6a00c9ca1f8447e71e0800643327b05114be1b592d888052ba94ed91a98
5
5
  SHA512:
6
- metadata.gz: 6c3abee783f5b2c35da44231bbec2bfabd1ff25cafe45716694751f865e4cae6c71286dcf2053770f925391f8b3e0cef72daa351c43de03d0ed8b0900e4eabad
7
- data.tar.gz: ab9ef11e37f5d2610ded657483a688876402cde506af5f830dd2f60fe2a8d432c182227a5c514050e79cfb059906fca2d384b366a6e64969ad7f5b5de81b8f43
6
+ metadata.gz: 1c6509362046f64b49472b3323f154ffc509f4f6d308d32ebc889299e470cc46d07cbe7864a5b8021a4e59d922d530cedf1e83852402305dab50f8c297ac9d8a
7
+ data.tar.gz: f8143722bd3dbc9b0b431732fc717d15dcd70c73e08bb2b76025f6e919976f6a62bb244f7525445387169e1ef7cea8e048084818255ea413458de5981e8fa0af
data/lib/care.rb CHANGED
@@ -7,9 +7,9 @@ class Care
7
7
  DEFAULT_PAGE_SIZE = 128 * 1024
8
8
 
9
9
  class IOWrapper
10
- def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
10
+ def initialize(io, page_size: DEFAULT_PAGE_SIZE)
11
+ @cache = Cache.new(page_size)
11
12
  @io = io
12
- @cache = cache
13
13
  @pos = 0
14
14
  end
15
15
 
@@ -109,6 +109,21 @@ class Care
109
109
  @pages[page_i] ||= read_page(io, page_i)
110
110
  end
111
111
 
112
+ def inspect
113
+ # To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
114
+
115
+ # Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
116
+ oid_str = (object_id << 1).to_s(16).rjust(16, '0')
117
+
118
+ ivars = instance_variables
119
+ ivars.delete(:@pages)
120
+ ivars_str = ivars.map do |ivar|
121
+ "#{ivar}=#{instance_variable_get(ivar).inspect}"
122
+ end.join(' ')
123
+ synthetic_vars = 'num_hydrated_pages=%d' % @pages.length
124
+ '#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
125
+ end
126
+
112
127
  def read_page(io, page_i)
113
128
  io.seek(page_i * @page_size)
114
129
  read_result = io.read(@page_size)
data/lib/format_parser.rb CHANGED
@@ -9,16 +9,13 @@ module FormatParser
9
9
  require_relative 'archive'
10
10
  require_relative 'io_utils'
11
11
  require_relative 'read_limiter'
12
+ require_relative 'read_limits_config'
12
13
  require_relative 'remote_io'
13
14
  require_relative 'io_constraint'
14
15
  require_relative 'care'
15
16
 
16
17
  PARSER_MUX = Mutex.new
17
-
18
- MAX_BYTES_READ_PER_PARSER = 512 * 1024
19
- MAX_READS_PER_PARSER = 64 * 1024
20
- MAX_SEEKS_PER_PARSER = 64 * 1024
21
- MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
18
+ MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
22
19
 
23
20
  def self.register_parser(callable_or_responding_to_new, formats:, natures:)
24
21
  parser_provided_formats = Array(formats)
@@ -54,12 +51,20 @@ module FormatParser
54
51
 
55
52
  # Return all by default
56
53
  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
54
+ # We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
55
+ # requests to be dispatched and so on. These should be _balanced_ with one another - for example,
56
+ # we cannot tell a parser that it is limited to reading 1024 bytes while at the same time
57
+ # limiting the size of the cache pages it may slurp in to less than that amount, since
58
+ # it can quickly become frustrating. The limits configurator computes these limits
59
+ # for us, in a fairly balanced way, based on one setting.
60
+ limit_config = FormatParser::ReadLimitsConfig.new(MAX_BYTES_READ_PER_PARSER)
61
+
57
62
  # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
58
63
  # of page faults (page cache misses) a parser may incur
59
- read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
64
+ read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limit_config.max_pagefaults_per_parser)
60
65
 
61
66
  # Then configure a layer of caching on top of that
62
- cached_io = Care::IOWrapper.new(read_limiter_under_cache)
67
+ cached_io = Care::IOWrapper.new(read_limiter_under_cache, page_size: limit_config.cache_page_size)
63
68
 
64
69
  # How many results has the user asked for? Used to determine whether an array
65
70
  # is returned or not.
@@ -77,16 +82,16 @@ module FormatParser
77
82
  # between invocations, and would complicate threading situations
78
83
  parsers = parsers_for(natures, formats)
79
84
 
80
- results = parsers.lazy.map do |parser|
81
- # We need to rewind for each parser, anew
82
- cached_io.seek(0)
85
+ # Limit how many operations the parser can perform
86
+ limited_io = ReadLimiter.new(cached_io, max_bytes: limit_config.max_read_bytes_per_parser, max_reads: limit_config.max_reads_per_parser, max_seeks: limit_config.max_seeks_per_parser)
83
87
 
84
- # ...and we have to reset the cache page fault limit, each parser is allowed to cause N page faults
85
- # - i.e. this is not a shared limit
88
+ results = parsers.lazy.map do |parser|
89
+ # Reset all the read limits, per parser
90
+ limited_io.reset_limits!
86
91
  read_limiter_under_cache.reset_limits!
87
92
 
88
- # Limit how many operations the parser can perform
89
- limited_io = ReadLimiter.new(cached_io, max_bytes: MAX_BYTES_READ_PER_PARSER, max_reads: MAX_READS_PER_PARSER, max_seeks: MAX_SEEKS_PER_PARSER)
93
+ # We need to rewind for each parser, anew
94
+ limited_io.seek(0)
90
95
 
91
96
  begin
92
97
  parser.call(limited_io)
@@ -107,6 +112,7 @@ module FormatParser
107
112
  end.reject(&:nil?).take(amount)
108
113
 
109
114
  return results.first if amount == 1
115
+
110
116
  # Convert the results from a lazy enumerator to an Array.
111
117
  results.to_a
112
118
  end
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.6.0'
2
+ VERSION = '0.7.0'
3
3
  end
@@ -0,0 +1,29 @@
1
+ class FormatParser::ReadLimitsConfig
2
+ MAX_PAGE_FAULTS = 8
3
+
4
+ def initialize(total_bytes_available_per_parser)
5
+ @max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
6
+ end
7
+
8
+ def max_read_bytes_per_parser
9
+ @max_read_bytes_per_parser
10
+ end
11
+
12
+ def cache_page_size
13
+ @max_read_bytes_per_parser / 4
14
+ end
15
+
16
+ def max_pagefaults_per_parser
17
+ MAX_PAGE_FAULTS
18
+ end
19
+
20
+ def max_reads_per_parser
21
+ # Imagine we read per single byte
22
+ @max_read_bytes_per_parser / 2
23
+ end
24
+
25
+ def max_seeks_per_parser
26
+ # Imagine we have to seek once per byte
27
+ @max_read_bytes_per_parser / 2
28
+ end
29
+ end
data/spec/care_spec.rb CHANGED
@@ -4,6 +4,11 @@ describe Care do
4
4
  describe Care::Cache do
5
5
  let(:source) { StringIO.new('Hello there, this is our little caching reader') }
6
6
 
7
+ it 'provides #inspect but does not output the actual @pages instance variable with it' do
8
+ cache = Care::Cache.new(3)
9
+ expect(cache.inspect).not_to include('@pages')
10
+ end
11
+
7
12
  it 'performs correct reads at various offsets' do
8
13
  cache = Care::Cache.new(3)
9
14
  expect(cache.byteslice(source, 0, 3)).to eq('Hel')
@@ -72,9 +77,10 @@ describe Care do
72
77
  end
73
78
 
74
79
  cache_double = fake_cache_class.new
80
+ expect(Care::Cache).to receive(:new).and_return(cache_double)
75
81
  io_double = double('IO')
76
82
 
77
- subject = Care::IOWrapper.new(io_double, cache_double)
83
+ subject = Care::IOWrapper.new(io_double)
78
84
 
79
85
  expect(subject.pos).to eq(0)
80
86
  subject.read(2)
@@ -31,9 +31,9 @@ describe FormatParser do
31
31
  }
32
32
  FormatParser.register_parser exploit, natures: :document, formats: :exploit
33
33
 
34
- sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 4))
34
+ sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 8))
35
35
 
36
- expect(sample_io).to receive(:read).at_most(4).times.and_call_original
36
+ expect(sample_io).to receive(:read).at_most(8).times.and_call_original
37
37
 
38
38
  result = FormatParser.parse(sample_io, formats: [:exploit])
39
39
  expect(result).to be_nil
@@ -0,0 +1,22 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::ReadLimitsConfig do
4
+ it 'provides balanced values based on the initial byte read limit per parser' do
5
+ config = FormatParser::ReadLimitsConfig.new(1024)
6
+
7
+ expect(config.max_read_bytes_per_parser).to be_kind_of(Integer)
8
+ expect(config.max_read_bytes_per_parser).to be > 0
9
+
10
+ expect(config.cache_page_size).to be_kind_of(Integer)
11
+ expect(config.cache_page_size).to be > 0
12
+
13
+ expect(config.max_pagefaults_per_parser).to be_kind_of(Integer)
14
+ expect(config.max_pagefaults_per_parser).to be > 0
15
+
16
+ expect(config.max_reads_per_parser).to be_kind_of(Integer)
17
+ expect(config.max_reads_per_parser).to be > 0
18
+
19
+ expect(config.max_seeks_per_parser).to be_kind_of(Integer)
20
+ expect(config.max_seeks_per_parser).to be > 0
21
+ end
22
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -191,6 +191,7 @@ files:
191
191
  - lib/parsers/zip_parser/file_reader.rb
192
192
  - lib/parsers/zip_parser/office_formats.rb
193
193
  - lib/read_limiter.rb
194
+ - lib/read_limits_config.rb
194
195
  - lib/remote_io.rb
195
196
  - lib/video.rb
196
197
  - spec/attributes_json_spec.rb
@@ -216,6 +217,7 @@ files:
216
217
  - spec/parsers/wav_parser_spec.rb
217
218
  - spec/parsers/zip_parser_spec.rb
218
219
  - spec/read_limiter_spec.rb
220
+ - spec/read_limits_config_spec.rb
219
221
  - spec/remote_fetching_spec.rb
220
222
  - spec/remote_io_spec.rb
221
223
  - spec/spec_helper.rb