format_parser 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/care.rb +17 -2
- data/lib/format_parser.rb +20 -14
- data/lib/format_parser/version.rb +1 -1
- data/lib/read_limits_config.rb +29 -0
- data/spec/care_spec.rb +7 -1
- data/spec/format_parser_spec.rb +2 -2
- data/spec/read_limits_config_spec.rb +22 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebde95ad53411a3d4ae4db175bab7b90d568476302df66842c83710f88ad2e15
|
4
|
+
data.tar.gz: cfb0e6a00c9ca1f8447e71e0800643327b05114be1b592d888052ba94ed91a98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c6509362046f64b49472b3323f154ffc509f4f6d308d32ebc889299e470cc46d07cbe7864a5b8021a4e59d922d530cedf1e83852402305dab50f8c297ac9d8a
|
7
|
+
data.tar.gz: f8143722bd3dbc9b0b431732fc717d15dcd70c73e08bb2b76025f6e919976f6a62bb244f7525445387169e1ef7cea8e048084818255ea413458de5981e8fa0af
|
data/lib/care.rb
CHANGED
@@ -7,9 +7,9 @@ class Care
|
|
7
7
|
DEFAULT_PAGE_SIZE = 128 * 1024
|
8
8
|
|
9
9
|
class IOWrapper
|
10
|
-
def initialize(io, cache = Cache.new(DEFAULT_PAGE_SIZE))
|
10
|
+
def initialize(io, page_size: DEFAULT_PAGE_SIZE)
|
11
|
+
@cache = Cache.new(page_size)
|
11
12
|
@io = io
|
12
|
-
@cache = cache
|
13
13
|
@pos = 0
|
14
14
|
end
|
15
15
|
|
@@ -109,6 +109,21 @@ class Care
|
|
109
109
|
@pages[page_i] ||= read_page(io, page_i)
|
110
110
|
end
|
111
111
|
|
112
|
+
def inspect
|
113
|
+
# To avoid page _contents_ in the inspect outputs we need to implement our own inspect.
|
114
|
+
|
115
|
+
# Simulate the builtin object ID output https://stackoverflow.com/a/11765495/153886
|
116
|
+
oid_str = (object_id << 1).to_s(16).rjust(16, '0')
|
117
|
+
|
118
|
+
ivars = instance_variables
|
119
|
+
ivars.delete(:@pages)
|
120
|
+
ivars_str = ivars.map do |ivar|
|
121
|
+
"#{ivar}=#{instance_variable_get(ivar).inspect}"
|
122
|
+
end.join(' ')
|
123
|
+
synthetic_vars = 'num_hydrated_pages=%d' % @pages.length
|
124
|
+
'#<%s:%s %s %s>' % [self.class, oid_str, synthetic_vars, ivars_str]
|
125
|
+
end
|
126
|
+
|
112
127
|
def read_page(io, page_i)
|
113
128
|
io.seek(page_i * @page_size)
|
114
129
|
read_result = io.read(@page_size)
|
data/lib/format_parser.rb
CHANGED
@@ -9,16 +9,13 @@ module FormatParser
|
|
9
9
|
require_relative 'archive'
|
10
10
|
require_relative 'io_utils'
|
11
11
|
require_relative 'read_limiter'
|
12
|
+
require_relative 'read_limits_config'
|
12
13
|
require_relative 'remote_io'
|
13
14
|
require_relative 'io_constraint'
|
14
15
|
require_relative 'care'
|
15
16
|
|
16
17
|
PARSER_MUX = Mutex.new
|
17
|
-
|
18
|
-
MAX_BYTES_READ_PER_PARSER = 512 * 1024
|
19
|
-
MAX_READS_PER_PARSER = 64 * 1024
|
20
|
-
MAX_SEEKS_PER_PARSER = 64 * 1024
|
21
|
-
MAX_CACHE_PAGE_FAULTS_PER_PARSER = 4
|
18
|
+
MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
|
22
19
|
|
23
20
|
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
24
21
|
parser_provided_formats = Array(formats)
|
@@ -54,12 +51,20 @@ module FormatParser
|
|
54
51
|
|
55
52
|
# Return all by default
|
56
53
|
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first)
|
54
|
+
# We need to apply various limits so that parsers do not over-read, do not cause too many HTTP
|
55
|
+
# requests to be dispatched and so on. These should be _balanced_ with one another- for example,
|
56
|
+
# we cannot tell a parser that it is limited to reading 1024 bytes while at the same time
|
57
|
+
# limiting the size of the cache pages it may slurp in to less than that amount, since
|
58
|
+
# it can quickly become frustrating. The limits configurator computes these limits
|
59
|
+
# for us, in a fairly balanced way, based on one setting.
|
60
|
+
limit_config = FormatParser::ReadLimitsConfig.new(MAX_BYTES_READ_PER_PARSER)
|
61
|
+
|
57
62
|
# Limit the number of cached _pages_ we may fetch. This allows us to limit the number
|
58
63
|
# of page faults (page cache misses) a parser may incur
|
59
|
-
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: MAX_CACHE_PAGE_FAULTS_PER_PARSER)
|
64
|
+
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limit_config.max_pagefaults_per_parser)
|
60
65
|
|
61
66
|
# Then configure a layer of caching on top of that
|
62
|
-
cached_io = Care::IOWrapper.new(read_limiter_under_cache)
|
67
|
+
cached_io = Care::IOWrapper.new(read_limiter_under_cache, page_size: limit_config.cache_page_size)
|
63
68
|
|
64
69
|
# How many results has the user asked for? Used to determinate whether an array
|
65
70
|
# is returned or not.
|
@@ -77,16 +82,16 @@ module FormatParser
|
|
77
82
|
# between invocations, and would complicate threading situations
|
78
83
|
parsers = parsers_for(natures, formats)
|
79
84
|
|
80
|
-
|
81
|
-
|
82
|
-
cached_io.seek(0)
|
85
|
+
# Limit how many operations the parser can perform
|
86
|
+
limited_io = ReadLimiter.new(cached_io, max_bytes: limit_config.max_read_bytes_per_parser, max_reads: limit_config.max_reads_per_parser, max_seeks: limit_config.max_seeks_per_parser)
|
83
87
|
|
84
|
-
|
85
|
-
#
|
88
|
+
results = parsers.lazy.map do |parser|
|
89
|
+
# Reset all the read limits, per parser
|
90
|
+
limited_io.reset_limits!
|
86
91
|
read_limiter_under_cache.reset_limits!
|
87
92
|
|
88
|
-
#
|
89
|
-
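limited_io = ReadLimiter.new(cached_io, max_bytes: MAX_BYTES_READ_PER_PARSER, max_reads: MAX_READS_PER_PARSER, max_seeks: MAX_SEEKS_PER_PARSER)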
|
93
|
+
# We need to rewind for each parser, anew
|
94
|
+
limited_io.seek(0)
|
90
95
|
|
91
96
|
begin
|
92
97
|
parser.call(limited_io)
|
@@ -107,6 +112,7 @@ module FormatParser
|
|
107
112
|
end.reject(&:nil?).take(amount)
|
108
113
|
|
109
114
|
return results.first if amount == 1
|
115
|
+
|
110
116
|
# Convert the results from a lazy enumerator to an Array.
|
111
117
|
results.to_a
|
112
118
|
end
|
data/lib/read_limits_config.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
class FormatParser::ReadLimitsConfig
|
2
|
+
MAX_PAGE_FAULTS = 8
|
3
|
+
|
4
|
+
def initialize(total_bytes_available_per_parser)
|
5
|
+
@max_read_bytes_per_parser = total_bytes_available_per_parser.to_i
|
6
|
+
end
|
7
|
+
|
8
|
+
def max_read_bytes_per_parser
|
9
|
+
@max_read_bytes_per_parser
|
10
|
+
end
|
11
|
+
|
12
|
+
def cache_page_size
|
13
|
+
@max_read_bytes_per_parser / 4
|
14
|
+
end
|
15
|
+
|
16
|
+
def max_pagefaults_per_parser
|
17
|
+
MAX_PAGE_FAULTS
|
18
|
+
end
|
19
|
+
|
20
|
+
def max_reads_per_parser
|
21
|
+
# Imagine we read per single byte
|
22
|
+
@max_read_bytes_per_parser / 2
|
23
|
+
end
|
24
|
+
|
25
|
+
def max_seeks_per_parser
|
26
|
+
# Imagine we have to seek once per byte
|
27
|
+
@max_read_bytes_per_parser / 2
|
28
|
+
end
|
29
|
+
end
|
data/spec/care_spec.rb
CHANGED
@@ -4,6 +4,11 @@ describe Care do
|
|
4
4
|
describe Care::Cache do
|
5
5
|
let(:source) { StringIO.new('Hello there, this is our little caching reader') }
|
6
6
|
|
7
|
+
it 'provides #inspect but does not output the actual @pages instance variable with it' do
|
8
|
+
cache = Care::Cache.new(3)
|
9
|
+
expect(cache.inspect).not_to include('@pages')
|
10
|
+
end
|
11
|
+
|
7
12
|
it 'performs correct reads at various offsets' do
|
8
13
|
cache = Care::Cache.new(3)
|
9
14
|
expect(cache.byteslice(source, 0, 3)).to eq('Hel')
|
@@ -72,9 +77,10 @@ describe Care do
|
|
72
77
|
end
|
73
78
|
|
74
79
|
cache_double = fake_cache_class.new
|
80
|
+
expect(Care::Cache).to receive(:new).and_return(cache_double)
|
75
81
|
io_double = double('IO')
|
76
82
|
|
77
|
-
subject = Care::IOWrapper.new(io_double, cache_double)
|
83
|
+
subject = Care::IOWrapper.new(io_double)
|
78
84
|
|
79
85
|
expect(subject.pos).to eq(0)
|
80
86
|
subject.read(2)
|
data/spec/format_parser_spec.rb
CHANGED
@@ -31,9 +31,9 @@ describe FormatParser do
|
|
31
31
|
}
|
32
32
|
FormatParser.register_parser exploit, natures: :document, formats: :exploit
|
33
33
|
|
34
|
-
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 *
|
34
|
+
sample_io = StringIO.new(Random.new.bytes(1024 * 1024 * 8))
|
35
35
|
|
36
|
-
expect(sample_io).to receive(:read).at_most(
|
36
|
+
expect(sample_io).to receive(:read).at_most(8).times.and_call_original
|
37
37
|
|
38
38
|
result = FormatParser.parse(sample_io, formats: [:exploit])
|
39
39
|
expect(result).to be_nil
|
data/spec/read_limits_config_spec.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::ReadLimitsConfig do
|
4
|
+
it 'provides balanced values based on the initial byte read limit per parser' do
|
5
|
+
config = FormatParser::ReadLimitsConfig.new(1024)
|
6
|
+
|
7
|
+
expect(config.max_read_bytes_per_parser).to be_kind_of(Integer)
|
8
|
+
expect(config.max_read_bytes_per_parser).to be > 0
|
9
|
+
|
10
|
+
expect(config.cache_page_size).to be_kind_of(Integer)
|
11
|
+
expect(config.cache_page_size).to be > 0
|
12
|
+
|
13
|
+
expect(config.max_pagefaults_per_parser).to be_kind_of(Integer)
|
14
|
+
expect(config.max_pagefaults_per_parser).to be > 0
|
15
|
+
|
16
|
+
expect(config.max_reads_per_parser).to be_kind_of(Integer)
|
17
|
+
expect(config.max_reads_per_parser).to be > 0
|
18
|
+
|
19
|
+
expect(config.max_seeks_per_parser).to be_kind_of(Integer)
|
20
|
+
expect(config.max_seeks_per_parser).to be > 0
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -191,6 +191,7 @@ files:
|
|
191
191
|
- lib/parsers/zip_parser/file_reader.rb
|
192
192
|
- lib/parsers/zip_parser/office_formats.rb
|
193
193
|
- lib/read_limiter.rb
|
194
|
+
- lib/read_limits_config.rb
|
194
195
|
- lib/remote_io.rb
|
195
196
|
- lib/video.rb
|
196
197
|
- spec/attributes_json_spec.rb
|
@@ -216,6 +217,7 @@ files:
|
|
216
217
|
- spec/parsers/wav_parser_spec.rb
|
217
218
|
- spec/parsers/zip_parser_spec.rb
|
218
219
|
- spec/read_limiter_spec.rb
|
220
|
+
- spec/read_limits_config_spec.rb
|
219
221
|
- spec/remote_fetching_spec.rb
|
220
222
|
- spec/remote_io_spec.rb
|
221
223
|
- spec/spec_helper.rb
|