format_parser 0.14.1 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -0
- data/lib/format_parser.rb +19 -2
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/gif_parser.rb +1 -1
- data/lib/parsers/jpeg_parser.rb +1 -1
- data/lib/parsers/moov_parser.rb +1 -1
- data/lib/parsers/mp3_parser.rb +1 -1
- data/lib/parsers/pdf_parser.rb +1 -1
- data/lib/parsers/png_parser.rb +2 -1
- data/lib/parsers/zip_parser.rb +1 -1
- data/spec/format_parser_spec.rb +38 -0
- data/spec/remote_fetching_spec.rb +7 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32b5a18eb53b1c5197a828757056354c655225fe7047418a615bfb8f950a6805
|
4
|
+
data.tar.gz: df289b5f4039f89d78b35e481efe50ea95f654497a7783bbaf0a35b997d630ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 358c318bf9f5262edb1acde1854e5caebaac3e945502d1d164f982f0d8fd26eb717ed5c43b3311f130d6ca2804b8c2bb1c0f1a596f462b01c59cd5a79267b0fb
|
7
|
+
data.tar.gz: 9482a5702896f47ba6da0d99d0e5ef31d51981bdd9271a764872fd06f0a210e19202c1c4fbd111111645d894703646648e8a7f003df7ea2419deef3254ea32da
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
## 0.15.0
|
2
|
+
* Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
|
3
|
+
on detection confidence and file format popularity at WT.
|
4
|
+
|
1
5
|
## 0.14.1
|
2
6
|
* Care caching: Clear pages more deliberately instead of relegating them to GC
|
3
7
|
* JPEG: Clear the EXIF buffer explicitly
|
data/README.md
CHANGED
@@ -157,6 +157,9 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
157
157
|
### M4A
|
158
158
|
- fixture.m4a was created by one of the project maintainers and is MIT licensed
|
159
159
|
|
160
|
+
### PNG
|
161
|
+
- `simulator_screenie.png` provided by [Rens Verhoeven](https://github.com/renssies)
|
162
|
+
|
160
163
|
### TIFF
|
161
164
|
- `Shinbutsureijoushuincho.tiff` is obtained from Wikimedia Commons and is Creative Commons licensed
|
162
165
|
- `IMG_9266_*.tif` and all its variations were created by the project maintainers
|
data/lib/format_parser.rb
CHANGED
@@ -28,14 +28,22 @@ module FormatParser
|
|
28
28
|
PARSER_MUX = Mutex.new
|
29
29
|
MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
|
30
30
|
|
31
|
+
# The value will ensure the parser having it will be applied to the file last.
|
32
|
+
LEAST_PRIORITY = 99
|
33
|
+
|
31
34
|
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
32
35
|
# provides out of the box registers itself using this method.
|
33
36
|
#
|
34
37
|
# @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
|
35
38
|
# @param formats[Array<Symbol>] file formats that the parser provides
|
36
39
|
# @param natures[Array<Symbol>] file natures that the parser provides
|
40
|
+
# @param priority[Integer] whether the parser has to be applied first or later. Parsers that offer the safest
|
41
|
+
# detection and have the most popular file formats should get a lower priority (0 or 1), the default
|
42
|
+
# priority is 99. Before parsing parsers get sorted according to their priority value ascending, so parsers
|
43
|
+
# with a lower priority value will be applied first, and if a single result is requested, will also return
|
44
|
+
# first.
|
37
45
|
# @return void
|
38
|
-
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
46
|
+
def self.register_parser(callable_or_responding_to_new, formats:, natures:, priority: LEAST_PRIORITY)
|
39
47
|
parser_provided_formats = Array(formats)
|
40
48
|
parser_provided_natures = Array(natures)
|
41
49
|
PARSER_MUX.synchronize do
|
@@ -51,6 +59,8 @@ module FormatParser
|
|
51
59
|
@parsers_per_format[provided_format] ||= Set.new
|
52
60
|
@parsers_per_format[provided_format] << callable_or_responding_to_new
|
53
61
|
end
|
62
|
+
@parser_priorities ||= {}
|
63
|
+
@parser_priorities[callable_or_responding_to_new] = priority
|
54
64
|
end
|
55
65
|
end
|
56
66
|
|
@@ -65,6 +75,7 @@ module FormatParser
|
|
65
75
|
(@parsers || []).delete(callable_or_responding_to_new)
|
66
76
|
(@parsers_per_nature || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
|
67
77
|
(@parsers_per_format || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
|
78
|
+
(@parser_priorities || {}).delete(callable_or_responding_to_new)
|
68
79
|
end
|
69
80
|
end
|
70
81
|
|
@@ -229,7 +240,13 @@ module FormatParser
|
|
229
240
|
raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}"
|
230
241
|
end
|
231
242
|
|
232
|
-
|
243
|
+
# Order the parsers according to their priority value. The ones having a lower
|
244
|
+
# value will sort higher and will be applied sooner
|
245
|
+
factories_in_order_of_priority = factories.to_a.sort do |parser_factory_a, parser_factory_b|
|
246
|
+
@parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
|
247
|
+
end
|
248
|
+
|
249
|
+
factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
|
233
250
|
end
|
234
251
|
|
235
252
|
# Instantiates a parser object (an object that responds to `#call`) from a given class
|
data/lib/parsers/gif_parser.rb
CHANGED
data/lib/parsers/jpeg_parser.rb
CHANGED
data/lib/parsers/moov_parser.rb
CHANGED
@@ -83,5 +83,5 @@ class FormatParser::MOOVParser
|
|
83
83
|
maybe_atom_size >= minimum_ftyp_atom_size && maybe_ftyp_atom_signature == 'ftyp'
|
84
84
|
end
|
85
85
|
|
86
|
-
FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values
|
86
|
+
FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values, priority: 1
|
87
87
|
end
|
data/lib/parsers/mp3_parser.rb
CHANGED
data/lib/parsers/pdf_parser.rb
CHANGED
data/lib/parsers/png_parser.rb
CHANGED
@@ -70,5 +70,6 @@ class FormatParser::PNGParser
|
|
70
70
|
)
|
71
71
|
end
|
72
72
|
|
73
|
-
|
73
|
+
# Give it priority 1 since priority 0 is reserved for JPEG, our most popular format
|
74
|
+
FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
|
74
75
|
end
|
data/lib/parsers/zip_parser.rb
CHANGED
data/spec/format_parser_spec.rb
CHANGED
@@ -34,6 +34,36 @@ describe FormatParser do
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
it 'triggers parsers in a certain order that corresponds to the parser priorities' do
|
38
|
+
file_contents = StringIO.new('a' * 4096)
|
39
|
+
|
40
|
+
parsers_called_order = []
|
41
|
+
expect_any_instance_of(FormatParser::PNGParser).to receive(:call) { |instance|
|
42
|
+
parsers_called_order << instance.class
|
43
|
+
nil
|
44
|
+
}
|
45
|
+
expect_any_instance_of(FormatParser::MP3Parser).to receive(:call) { |instance|
|
46
|
+
parsers_called_order << instance.class
|
47
|
+
nil
|
48
|
+
}
|
49
|
+
expect_any_instance_of(FormatParser::ZIPParser).to receive(:call) { |instance|
|
50
|
+
parsers_called_order << instance.class
|
51
|
+
nil
|
52
|
+
}
|
53
|
+
|
54
|
+
FormatParser.parse(file_contents)
|
55
|
+
|
56
|
+
png_parser_idx = parsers_called_order.index(FormatParser::PNGParser)
|
57
|
+
mp3_parser_idx = parsers_called_order.index(FormatParser::MP3Parser)
|
58
|
+
zip_parser_idx = parsers_called_order.index(FormatParser::ZIPParser)
|
59
|
+
|
60
|
+
# The PNG parser should have been applied first
|
61
|
+
expect(png_parser_idx).to be < zip_parser_idx
|
62
|
+
# ...and the ZIP parser second (MP3 is the most omnivorous since there
|
63
|
+
# is no clear header or footer in the file)
|
64
|
+
expect(mp3_parser_idx).to be > zip_parser_idx
|
65
|
+
end
|
66
|
+
|
37
67
|
it "returns either a valid result or a nil for all fuzzed inputs at seed #{RSpec.configuration.seed}" do
|
38
68
|
r = Random.new(RSpec.configuration.seed)
|
39
69
|
1024.times do
|
@@ -63,6 +93,14 @@ describe FormatParser do
|
|
63
93
|
FormatParser.deregister_parser(exploit)
|
64
94
|
end
|
65
95
|
|
96
|
+
it 'correctly detects a PNG as a PNG without falling back to another filetype' do
|
97
|
+
File.open(fixtures_dir + '/PNG/simulator_screenie.png', 'rb') do |file|
|
98
|
+
file_information = FormatParser.parse(file)
|
99
|
+
expect(file_information).not_to be_nil
|
100
|
+
expect(file_information.format).to eq(:png)
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
66
104
|
describe 'when multiple results are requested' do
|
67
105
|
let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
|
68
106
|
let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
|
@@ -72,6 +72,13 @@ describe 'Fetching data from HTTP remotes' do
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
|
75
|
+
it 'correctly detects a PNG as a PNG without falling back to another filetype' do
|
76
|
+
remote_png_url = 'http://localhost:9399/PNG/simulator_screenie.png'
|
77
|
+
file_information = FormatParser.parse_http(remote_png_url)
|
78
|
+
expect(file_information).not_to be_nil
|
79
|
+
expect(file_information.format).to eq(:png)
|
80
|
+
end
|
81
|
+
|
75
82
|
describe 'when parsing remote fixtures' do
|
76
83
|
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
77
84
|
filename = File.basename(fixture_path)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.1
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-08-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|