format_parser 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fbc5d4476d94ccbeb0f50c3bd0406cb57d1fc5fa640afbbf8057d874ecc128a3
4
- data.tar.gz: fa4421fa509d0f0528beb4c4b19dc1a63cd66fbed29142ed33b9b2aeae9067c0
3
+ metadata.gz: ebde95ad53411a3d4ae4db175bab7b90d568476302df66842c83710f88ad2e15
4
+ data.tar.gz: cfb0e6a00c9ca1f8447e71e0800643327b05114be1b592d888052ba94ed91a98
5
5
  SHA512:
6
- metadata.gz: 6c3abee783f5b2c35da44231bbec2bfabd1ff25cafe45716694751f865e4cae6c71286dcf2053770f925391f8b3e0cef72daa351c43de03d0ed8b0900e4eabad
7
- data.tar.gz: ab9ef11e37f5d2610ded657483a688876402cde506af5f830dd2f60fe2a8d432c182227a5c514050e79cfb059906fca2d384b366a6e64969ad7f5b5de81b8f43
6
+ metadata.gz: 1c6509362046f64b49472b3323f154ffc509f4f6d308d32ebc889299e470cc46d07cbe7864a5b8021a4e59d922d530cedf1e83852402305dab50f8c297ac9d8a
7
+ data.tar.gz: f8143722bd3dbc9b0b431732fc717d15dcd70c73e08bb2b76025f6e919976f6a62bb244f7525445387169e1ef7cea8e048084818255ea413458de5981e8fa0af
data/lib/care.rb CHANGED
@@ -7,9 +7,9 @@ class Care
7
7
  DEFAULT_PAGE_SIZE = 128 * 1024
8
8
 
9
9
  class IOWrapper
10
- def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
10
+ def initialize(io, page_size: DEFAULT_PAGE_SIZE)
11
+ @cache = Cache.new(page_size)
11
12
  @io = io
12
- @cache = cache
13
13
  @pos = 0
14
14
  end
15
15
 
@@ -109,6 +109,21 @@ class Care
109
109
  @pages[page_i] ||= read_page(io, page_i)
110
110
  end
111
111
 
112
+ def inspect
113
+ # To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
114
+
115
+ # Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
116
+ oid_str = (object_id << 1).to_s(16).rjust(16, '0')
117
+
118
+ ivars = instance_variables
119
+ ivars.delete(:@pages)
120
+ ivars_str = ivars.map do |ivar|
121
+ "#{ivar}=#{instance_variable_get(ivar).inspect}"
122
+ end.join(' ')
123
+ synthetic_vars = 'num_hydrated_pages=%d' % @pages.length
124
+ '#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
125
+ end
126
+
112
127
  def read_page(io, page_i)
113
128
  io.seek(page_i * @page_size)
114
129
  read_result = io.read(@page_size)
data/lib/format_parser.rb CHANGED
@@ -9,16 +9,13 @@ module FormatParser
9
9
  require_relative 'archive'
10
10
  require_relative 'io_utils'
11
11
  require_relative 'read_limiter'
12
+ require_relative 'read_limits_config'
12
13
  require_relative 'remote_io'
13
14
  require_relative 'io_constraint'
14
15
  require_relative 'care'
15
16
 
16
17
  PARSER_MUX = Mutex.new
17
-
18
- MAX_BYTES_READ_PER_PARSER = 512 * 1024
19
- MAX_READS_PER_PARSER = 64 * 1024
20
- MAX_SEEKS_PER_PARSER = 64 * 1024
21
- MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
18
+ MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
22
19
 
23
20
  def self.register_parser(callable_or_responding_to_new, formats:, natures:)
24
21
  parser_provided_formats = Array(formats)
@@ -54,12 +51,20 @@ module FormatParser
54
51
 
55
52
  # Return all by default
56
53
  def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
54
+ # We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
55
+ # requests to be dispatched and so on. These should be _balanced_ with one another. For example,
56
+ # we cannot tell a parser that it is limited to reading 1024 bytes while at the same time
57
+ # limiting the size of the cache pages it may slurp in to less than that amount, since
58
+ # it can quickly become frustrating. The limits configurator computes these limits
59
+ # for us, in a fairly balanced way, based on one setting.
60
+ limit_config = FormatParser::ReadLimitsConfig.new(MAX_BYTES_READ_PER_PARSER)
61
+
57
62
  # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
58
63
  # of page faults (page cache misses) a parser may incur
59
- read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
64
+ read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limit_config.max_pagefaults_per_parser)
60
65
 
61
66
  # Then configure a layer of caching on top of that
62
- cached_io = Care::IOWrapper.new(read_limiter_under_cache)
67
+ cached_io = Care::IOWrapper.new(read_limiter_under_cache, page_size: limit_config.cache_page_size)
63
68
 
64
69
  # How many results has the user asked for? Used to determine whether an array
65
70
  # is returned or not.
@@ -77,16 +82,16 @@ module FormatParser
77
82
  # between invocations, and would complicate threading situations
78
83
  parsers = parsers_for(natures, formats)
79
84
 
80
- results = parsers.lazy.map do |parser|
81
- # We need to rewind for each parser, anew
82
- cached_io.seek(0)
85
+ # Limit how many operations the parser can perform
86
+ limited_io = ReadLimiter.new(cached_io, max_bytes: limit_config.max_read_bytes_per_parser, max_reads: limit_config.max_reads_per_parser, max_seeks: limit_config.max_seeks_per_parser)
83
87
 
84
- # ...and we have to reset the cache page fault limit, each parser is allowed to cause N page faults
85
- # - i.e. this is not a shared limit
88
+ results = parsers.lazy.map do |parser|
89
+ # Reset all the read limits, per parser
90
+ limited_io.reset_limits!
86
91
  read_limiter_under_cache.reset_limits!
87
92
 
88
- # Limit how many operations the parser can perform
89
- limited_io = ReadLimiter.new(cached_io, max_bytes: MAX_BYTES_READ_PER_PARSER, max_reads: MAX_READS_PER_PARSER, max_seeks: MAX_SEEKS_PER_PARSER)
93
+ # We need to rewind for each parser, anew
94
+ limited_io.seek(0)
90
95
 
91
96
  begin
92
97
  parser.call(limited_io)
@@ -107,6 +112,7 @@ module FormatParser
107
112
  end.reject(&:nil?).take(amount)
108
113
 
109
114
  return results.first if amount == 1
115
+
110
116
  # Convert the results from a lazy enumerator to an Array.
111
117
  results.to_a
112
118
  end
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.6.0'
2
+ VERSION = '0.7.0'
3
3
  end
@@ -0,0 +1,29 @@
1
+ class FormatParser::ReadLimitsConfig
2
+ MAX_PAGE_FAULTS = 8
3
+
4
+ def initialize(total_bytes_available_per_parser)
5
+ @max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
6
+ end
7
+
8
+ def max_read_bytes_per_parser
9
+ @max_read_bytes_per_parser
10
+ end
11
+
12
+ def cache_page_size
13
+ @max_read_bytes_per_parser / 4
14
+ end
15
+
16
+ def max_pagefaults_per_parser
17
+ MAX_PAGE_FAULTS
18
+ end
19
+
20
+ def max_reads_per_parser
21
+ # Imagine we read per single byte
22
+ @max_read_bytes_per_parser / 2
23
+ end
24
+
25
+ def max_seeks_per_parser
26
+ # Imagine we have to seek once per byte
27
+ @max_read_bytes_per_parser / 2
28
+ end
29
+ end
data/spec/care_spec.rb CHANGED
@@ -4,6 +4,11 @@ describe Care do
4
4
  describe Care::Cache do
5
5
  let(:source) { StringIO.new('Hello there, this is our little caching reader') }
6
6
 
7
+ it 'provides #inspect but does not output the actual @pages instance variable with it' do
8
+ cache = Care::Cache.new(3)
9
+ expect(cache.inspect).not_to include('@pages')
10
+ end
11
+
7
12
  it 'performs correct reads at various offsets' do
8
13
  cache = Care::Cache.new(3)
9
14
  expect(cache.byteslice(source, 0, 3)).to eq('Hel')
@@ -72,9 +77,10 @@ describe Care do
72
77
  end
73
78
 
74
79
  cache_double = fake_cache_class.new
80
+ expect(Care::Cache).to receive(:new).and_return(cache_double)
75
81
  io_double = double('IO')
76
82
 
77
- subject = Care::IOWrapper.new(io_double, cache_double)
83
+ subject = Care::IOWrapper.new(io_double)
78
84
 
79
85
  expect(subject.pos).to eq(0)
80
86
  subject.read(2)
@@ -31,9 +31,9 @@ describe FormatParser do
31
31
  }
32
32
  FormatParser.register_parser exploit, natures: :document, formats: :exploit
33
33
 
34
- sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 4))
34
+ sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 8))
35
35
 
36
- expect(sample_io).to receive(:read).at_most(4).times.and_call_original
36
+ expect(sample_io).to receive(:read).at_most(8).times.and_call_original
37
37
 
38
38
  result = FormatParser.parse(sample_io, formats: [:exploit])
39
39
  expect(result).to be_nil
@@ -0,0 +1,22 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::ReadLimitsConfig do
4
+ it 'provides balanced values based on the initial byte read limit per parser' do
5
+ config = FormatParser::ReadLimitsConfig.new(1024)
6
+
7
+ expect(config.max_read_bytes_per_parser).to be_kind_of(Integer)
8
+ expect(config.max_read_bytes_per_parser).to be > 0
9
+
10
+ expect(config.cache_page_size).to be_kind_of(Integer)
11
+ expect(config.cache_page_size).to be > 0
12
+
13
+ expect(config.max_pagefaults_per_parser).to be_kind_of(Integer)
14
+ expect(config.max_pagefaults_per_parser).to be > 0
15
+
16
+ expect(config.max_reads_per_parser).to be_kind_of(Integer)
17
+ expect(config.max_reads_per_parser).to be > 0
18
+
19
+ expect(config.max_seeks_per_parser).to be_kind_of(Integer)
20
+ expect(config.max_seeks_per_parser).to be > 0
21
+ end
22
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -191,6 +191,7 @@ files:
191
191
  - lib/parsers/zip_parser/file_reader.rb
192
192
  - lib/parsers/zip_parser/office_formats.rb
193
193
  - lib/read_limiter.rb
194
+ - lib/read_limits_config.rb
194
195
  - lib/remote_io.rb
195
196
  - lib/video.rb
196
197
  - spec/attributes_json_spec.rb
@@ -216,6 +217,7 @@ files:
216
217
  - spec/parsers/wav_parser_spec.rb
217
218
  - spec/parsers/zip_parser_spec.rb
218
219
  - spec/read_limiter_spec.rb
220
+ - spec/read_limits_config_spec.rb
219
221
  - spec/remote_fetching_spec.rb
220
222
  - spec/remote_io_spec.rb
221
223
  - spec/spec_helper.rb