format_parser 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/care.rb +17 -2
- data/lib/format_parser.rb +20 -14
- data/lib/format_parser/version.rb +1 -1
- data/lib/read_limits_config.rb +29 -0
- data/spec/care_spec.rb +7 -1
- data/spec/format_parser_spec.rb +2 -2
- data/spec/read_limits_config_spec.rb +22 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebde95ad53411a3d4ae4db175bab7b90d568476302df66842c83710f88ad2e15
|
4
|
+
data.tar.gz: cfb0e6a00c9ca1f8447e71e0800643327b05114be1b592d888052ba94ed91a98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c6509362046f64b49472b3323f154ffc509f4f6d308d32ebc889299e470cc46d07cbe7864a5b8021a4e59d922d530cedf1e83852402305dab50f8c297ac9d8a
|
7
|
+
data.tar.gz: f8143722bd3dbc9b0b431732fc717d15dcd70c73e08bb2b76025f6e919976f6a62bb244f7525445387169e1ef7cea8e048084818255ea413458de5981e8fa0af
|
data/lib/care.rb
CHANGED
@@ -7,9 +7,9 @@ class Care
|
|
7
7
|
DEFAULT_PAGE_SIZE = 128 * 1024
|
8
8
|
|
9
9
|
class IOWrapper
|
10
|
-
def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
|
10
|
+
def initialize(io, page_size: DEFAULT_PAGE_SIZE)
|
11
|
+
@cache = Cache.new(page_size)
|
11
12
|
@io = io
|
12
|
-
@cache = cache
|
13
13
|
@pos = 0
|
14
14
|
end
|
15
15
|
|
@@ -109,6 +109,21 @@ class Care
|
|
109
109
|
@pages[page_i] ||= read_page(io, page_i)
|
110
110
|
end
|
111
111
|
|
112
|
+
def inspect
|
113
|
+
# To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
|
114
|
+
|
115
|
+
# Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
|
116
|
+
oid_str = (object_id << 1).to_s(16).rjust(16, '0')
|
117
|
+
|
118
|
+
ivars = instance_variables
|
119
|
+
ivars.delete(:@pages)
|
120
|
+
ivars_str = ivars.map do |ivar|
|
121
|
+
"#{ivar}=#{instance_variable_get(ivar).inspect}"
|
122
|
+
end.join(' ')
|
123
|
+
synthetic_vars = 'num_hydrated_pages=%d' % @pages.length
|
124
|
+
'#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
|
125
|
+
end
|
126
|
+
|
112
127
|
def read_page(io, page_i)
|
113
128
|
io.seek(page_i * @page_size)
|
114
129
|
read_result = io.read(@page_size)
|
data/lib/format_parser.rb
CHANGED
@@ -9,16 +9,13 @@ module FormatParser
|
|
9
9
|
require_relative 'archive'
|
10
10
|
require_relative 'io_utils'
|
11
11
|
require_relative 'read_limiter'
|
12
|
+
require_relative 'read_limits_config'
|
12
13
|
require_relative 'remote_io'
|
13
14
|
require_relative 'io_constraint'
|
14
15
|
require_relative 'care'
|
15
16
|
|
16
17
|
PARSER_MUX = Mutex.new
|
17
|
-
|
18
|
-
MAX_BYTES_READ_PER_PARSER = 512 * 1024
|
19
|
-
MAX_READS_PER_PARSER = 64 * 1024
|
20
|
-
MAX_SEEKS_PER_PARSER = 64 * 1024
|
21
|
-
MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
|
18
|
+
MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
|
22
19
|
|
23
20
|
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
24
21
|
parser_provided_formats = Array(formats)
|
@@ -54,12 +51,20 @@ module FormatParser
|
|
54
51
|
|
55
52
|
# Return all by default
|
56
53
|
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
|
54
|
+
# We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
|
55
|
+
# requests to be dispatched and so on. These should be _balanced_ with one another- for example,
|
56
|
+
# we cannot tell a parser that it is limited to reading 1024 bytes while at the same time
|
57
|
+
# limiting the size of the cache pages it may slurp in to less than that amount, since
|
58
|
+
# it can quickly become frustrating. The limits configurator computes these limits
|
59
|
+
# for us, in a fairly balanced way, based on one setting.
|
60
|
+
limit_config = FormatParser::ReadLimitsConfig.new(MAX_BYTES_READ_PER_PARSER)
|
61
|
+
|
57
62
|
# Limit the number of cached _pages_ we may fetch. This allows us to limit the number
|
58
63
|
# of page faults (page cache misses) a parser may incur
|
59
|
-
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
|
64
|
+
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limit_config.max_pagefaults_per_parser)
|
60
65
|
|
61
66
|
# Then configure a layer of caching on top of that
|
62
|
-
cached_io = Care::IOWrapper.new(read_limiter_under_cache)
|
67
|
+
cached_io = Care::IOWrapper.new(read_limiter_under_cache, page_size: limit_config.cache_page_size)
|
63
68
|
|
64
69
|
# How many results has the user asked for? Used to determine whether an array
|
65
70
|
# is returned or not.
|
@@ -77,16 +82,16 @@ module FormatParser
|
|
77
82
|
# between invocations, and would complicate threading situations
|
78
83
|
parsers = parsers_for(natures, formats)
|
79
84
|
|
80
|
-
|
81
|
-
|
82
|
-
cached_io.seek(0)
|
85
|
+
# Limit how many operations the parser can perform
|
86
|
+
limited_io = ReadLimiter.new(cached_io, max_bytes: limit_config.max_read_bytes_per_parser, max_reads: limit_config.max_reads_per_parser, max_seeks: limit_config.max_seeks_per_parser)
|
83
87
|
|
84
|
-
|
85
|
-
#
|
88
|
+
results = parsers.lazy.map do |parser|
|
89
|
+
# Reset all the read limits, per parser
|
90
|
+
limited_io.reset_limits!
|
86
91
|
read_limiter_under_cache.reset_limits!
|
87
92
|
|
88
|
-
#
|
89
|
-
limited_io
|
93
|
+
# We need to rewind for each parser, anew
|
94
|
+
limited_io.seek(0)
|
90
95
|
|
91
96
|
begin
|
92
97
|
parser.call(limited_io)
|
@@ -107,6 +112,7 @@ module FormatParser
|
|
107
112
|
end.reject(&:nil?).take(amount)
|
108
113
|
|
109
114
|
return results.first if amount == 1
|
115
|
+
|
110
116
|
# Convert the results from a lazy enumerator to an Array.
|
111
117
|
results.to_a
|
112
118
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
class FormatParser::ReadLimitsConfig
|
2
|
+
MAX_PAGE_FAULTS = 8
|
3
|
+
|
4
|
+
def initialize(total_bytes_available_per_parser)
|
5
|
+
@max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
|
6
|
+
end
|
7
|
+
|
8
|
+
def max_read_bytes_per_parser
|
9
|
+
@max_read_bytes_per_parser
|
10
|
+
end
|
11
|
+
|
12
|
+
def cache_page_size
|
13
|
+
@max_read_bytes_per_parser / 4
|
14
|
+
end
|
15
|
+
|
16
|
+
def max_pagefaults_per_parser
|
17
|
+
MAX_PAGE_FAULTS
|
18
|
+
end
|
19
|
+
|
20
|
+
def max_reads_per_parser
|
21
|
+
# Imagine we read per single byte
|
22
|
+
@max_read_bytes_per_parser / 2
|
23
|
+
end
|
24
|
+
|
25
|
+
def max_seeks_per_parser
|
26
|
+
# Imagine we have to seek once per byte
|
27
|
+
@max_read_bytes_per_parser / 2
|
28
|
+
end
|
29
|
+
end
|
data/spec/care_spec.rb
CHANGED
@@ -4,6 +4,11 @@ describe Care do
|
|
4
4
|
describe Care::Cache do
|
5
5
|
let(:source) { StringIO.new('Hello there, this is our little caching reader') }
|
6
6
|
|
7
|
+
it 'provides #inspect but does not output the actual @pages instance variable with it' do
|
8
|
+
cache = Care::Cache.new(3)
|
9
|
+
expect(cache.inspect).not_to include('@pages')
|
10
|
+
end
|
11
|
+
|
7
12
|
it 'performs correct reads at various offsets' do
|
8
13
|
cache = Care::Cache.new(3)
|
9
14
|
expect(cache.byteslice(source, 0, 3)).to eq('Hel')
|
@@ -72,9 +77,10 @@ describe Care do
|
|
72
77
|
end
|
73
78
|
|
74
79
|
cache_double = fake_cache_class.new
|
80
|
+
expect(Care::Cache).to receive(:new).and_return(cache_double)
|
75
81
|
io_double = double('IO')
|
76
82
|
|
77
|
-
subject = Care::IOWrapper.new(io_double, cache_double)
|
83
|
+
subject = Care::IOWrapper.new(io_double)
|
78
84
|
|
79
85
|
expect(subject.pos).to eq(0)
|
80
86
|
subject.read(2)
|
data/spec/format_parser_spec.rb
CHANGED
@@ -31,9 +31,9 @@ describe FormatParser do
|
|
31
31
|
}
|
32
32
|
FormatParser.register_parser exploit, natures: :document, formats: :exploit
|
33
33
|
|
34
|
-
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 *
|
34
|
+
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 8))
|
35
35
|
|
36
|
-
expect(sample_io).to receive(:read).at_most(
|
36
|
+
expect(sample_io).to receive(:read).at_most(8).times.and_call_original
|
37
37
|
|
38
38
|
result = FormatParser.parse(sample_io, formats: [:exploit])
|
39
39
|
expect(result).to be_nil
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::ReadLimitsConfig do
|
4
|
+
it 'provides balanced values based on the initial byte read limit per parser' do
|
5
|
+
config = FormatParser::ReadLimitsConfig.new(1024)
|
6
|
+
|
7
|
+
expect(config.max_read_bytes_per_parser).to be_kind_of(Integer)
|
8
|
+
expect(config.max_read_bytes_per_parser).to be > 0
|
9
|
+
|
10
|
+
expect(config.cache_page_size).to be_kind_of(Integer)
|
11
|
+
expect(config.cache_page_size).to be > 0
|
12
|
+
|
13
|
+
expect(config.max_pagefaults_per_parser).to be_kind_of(Integer)
|
14
|
+
expect(config.max_pagefaults_per_parser).to be > 0
|
15
|
+
|
16
|
+
expect(config.max_reads_per_parser).to be_kind_of(Integer)
|
17
|
+
expect(config.max_reads_per_parser).to be > 0
|
18
|
+
|
19
|
+
expect(config.max_seeks_per_parser).to be_kind_of(Integer)
|
20
|
+
expect(config.max_seeks_per_parser).to be > 0
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -191,6 +191,7 @@ files:
|
|
191
191
|
- lib/parsers/zip_parser/file_reader.rb
|
192
192
|
- lib/parsers/zip_parser/office_formats.rb
|
193
193
|
- lib/read_limiter.rb
|
194
|
+
- lib/read_limits_config.rb
|
194
195
|
- lib/remote_io.rb
|
195
196
|
- lib/video.rb
|
196
197
|
- spec/attributes_json_spec.rb
|
@@ -216,6 +217,7 @@ files:
|
|
216
217
|
- spec/parsers/wav_parser_spec.rb
|
217
218
|
- spec/parsers/zip_parser_spec.rb
|
218
219
|
- spec/read_limiter_spec.rb
|
220
|
+
- spec/read_limits_config_spec.rb
|
219
221
|
- spec/remote_fetching_spec.rb
|
220
222
|
- spec/remote_io_spec.rb
|
221
223
|
- spec/spec_helper.rb
|