format_parser 0.14.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -0
- data/lib/format_parser.rb +19 -2
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/gif_parser.rb +1 -1
- data/lib/parsers/jpeg_parser.rb +1 -1
- data/lib/parsers/moov_parser.rb +1 -1
- data/lib/parsers/mp3_parser.rb +1 -1
- data/lib/parsers/pdf_parser.rb +1 -1
- data/lib/parsers/png_parser.rb +2 -1
- data/lib/parsers/zip_parser.rb +1 -1
- data/spec/format_parser_spec.rb +38 -0
- data/spec/remote_fetching_spec.rb +7 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32b5a18eb53b1c5197a828757056354c655225fe7047418a615bfb8f950a6805
|
4
|
+
data.tar.gz: df289b5f4039f89d78b35e481efe50ea95f654497a7783bbaf0a35b997d630ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 358c318bf9f5262edb1acde1854e5caebaac3e945502d1d164f982f0d8fd26eb717ed5c43b3311f130d6ca2804b8c2bb1c0f1a596f462b01c59cd5a79267b0fb
|
7
|
+
data.tar.gz: 9482a5702896f47ba6da0d99d0e5ef31d51981bdd9271a764872fd06f0a210e19202c1c4fbd111111645d894703646648e8a7f003df7ea2419deef3254ea32da
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
## 0.15.0
|
2
|
+
* Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
|
3
|
+
on detection confidence and file format popularity at WT.
|
4
|
+
|
1
5
|
## 0.14.1
|
2
6
|
* Care caching: Clear pages more deliberately instead of relegating them to GC
|
3
7
|
* JPEG: Clear the EXIF buffer explicitly
|
data/README.md
CHANGED
@@ -157,6 +157,9 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
157
157
|
### M4A
|
158
158
|
- fixture.m4a was created by one of the project maintainers and is MIT licensed
|
159
159
|
|
160
|
+
### PNG
|
161
|
+
- `simulator_screenie.png` provided by [Rens Verhoeven](https://github.com/renssies)
|
162
|
+
|
160
163
|
### TIFF
|
161
164
|
- `Shinbutsureijoushuincho.tiff` is obtained from Wikimedia Commons and is Creative Commons licensed
|
162
165
|
- `IMG_9266_*.tif` and all its variations were created by the project maintainers
|
data/lib/format_parser.rb
CHANGED
@@ -28,14 +28,22 @@ module FormatParser
|
|
28
28
|
PARSER_MUX = Mutex.new
|
29
29
|
MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
|
30
30
|
|
31
|
+
# The default priority value. A parser registered with it is applied after all parsers that have a lower value.
|
32
|
+
LEAST_PRIORITY = 99
|
33
|
+
|
31
34
|
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
32
35
|
# provides out of the box registers itself using this method.
|
33
36
|
#
|
34
37
|
# @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
|
35
38
|
# @param formats[Array<Symbol>] file formats that the parser provides
|
36
39
|
# @param natures[Array<Symbol>] file natures that the parser provides
|
40
|
+
# @param priority[Integer] whether the parser has to be applied first or later. Parsers that offer the safest
|
41
|
+
# detection and have the most popular file formats should get a lower priority (0 or 1), the default
|
42
|
+
# priority is 99. Before parsing parsers get sorted according to their priority value ascending, so parsers
|
43
|
+
# with a lower priority value will be applied first, and if a single result is requested, will also return
|
44
|
+
# first.
|
37
45
|
# @return void
|
38
|
-
def self.register_parser(callable_or_responding_to_new, formats:, natures:)
|
46
|
+
def self.register_parser(callable_or_responding_to_new, formats:, natures:, priority: LEAST_PRIORITY)
|
39
47
|
parser_provided_formats = Array(formats)
|
40
48
|
parser_provided_natures = Array(natures)
|
41
49
|
PARSER_MUX.synchronize do
|
@@ -51,6 +59,8 @@ module FormatParser
|
|
51
59
|
@parsers_per_format[provided_format] ||= Set.new
|
52
60
|
@parsers_per_format[provided_format] << callable_or_responding_to_new
|
53
61
|
end
|
62
|
+
@parser_priorities ||= {}
|
63
|
+
@parser_priorities[callable_or_responding_to_new] = priority
|
54
64
|
end
|
55
65
|
end
|
56
66
|
|
@@ -65,6 +75,7 @@ module FormatParser
|
|
65
75
|
(@parsers || []).delete(callable_or_responding_to_new)
|
66
76
|
(@parsers_per_nature || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
|
67
77
|
(@parsers_per_format || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
|
78
|
+
(@parser_priorities || {}).delete(callable_or_responding_to_new)
|
68
79
|
end
|
69
80
|
end
|
70
81
|
|
@@ -229,7 +240,13 @@ module FormatParser
|
|
229
240
|
raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}"
|
230
241
|
end
|
231
242
|
|
232
|
-
|
243
|
+
# Order the parsers according to their priority value. The ones having a lower
|
244
|
+
# value will sort higher and will be applied sooner
|
245
|
+
factories_in_order_of_priority = factories.to_a.sort do |parser_factory_a, parser_factory_b|
|
246
|
+
@parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
|
247
|
+
end
|
248
|
+
|
249
|
+
factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
|
233
250
|
end
|
234
251
|
|
235
252
|
# Instantiates a parser object (an object that responds to `#call`) from a given class
|
data/lib/parsers/gif_parser.rb
CHANGED
data/lib/parsers/jpeg_parser.rb
CHANGED
data/lib/parsers/moov_parser.rb
CHANGED
@@ -83,5 +83,5 @@ class FormatParser::MOOVParser
|
|
83
83
|
maybe_atom_size >= minimum_ftyp_atom_size && maybe_ftyp_atom_signature == 'ftyp'
|
84
84
|
end
|
85
85
|
|
86
|
-
FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values
|
86
|
+
FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values, priority: 1
|
87
87
|
end
|
data/lib/parsers/mp3_parser.rb
CHANGED
data/lib/parsers/pdf_parser.rb
CHANGED
data/lib/parsers/png_parser.rb
CHANGED
@@ -70,5 +70,6 @@ class FormatParser::PNGParser
|
|
70
70
|
)
|
71
71
|
end
|
72
72
|
|
73
|
-
|
73
|
+
# Give it priority 1 since priority 0 is reserved for JPEG, our most popular format
|
74
|
+
FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
|
74
75
|
end
|
data/lib/parsers/zip_parser.rb
CHANGED
data/spec/format_parser_spec.rb
CHANGED
@@ -34,6 +34,36 @@ describe FormatParser do
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
it 'triggers parsers in a certain order that corresponds to the parser priorities' do
|
38
|
+
file_contents = StringIO.new('a' * 4096)
|
39
|
+
|
40
|
+
parsers_called_order = []
|
41
|
+
expect_any_instance_of(FormatParser::PNGParser).to receive(:call) { |instance|
|
42
|
+
parsers_called_order << instance.class
|
43
|
+
nil
|
44
|
+
}
|
45
|
+
expect_any_instance_of(FormatParser::MP3Parser).to receive(:call) { |instance|
|
46
|
+
parsers_called_order << instance.class
|
47
|
+
nil
|
48
|
+
}
|
49
|
+
expect_any_instance_of(FormatParser::ZIPParser).to receive(:call) { |instance|
|
50
|
+
parsers_called_order << instance.class
|
51
|
+
nil
|
52
|
+
}
|
53
|
+
|
54
|
+
FormatParser.parse(file_contents)
|
55
|
+
|
56
|
+
png_parser_idx = parsers_called_order.index(FormatParser::PNGParser)
|
57
|
+
mp3_parser_idx = parsers_called_order.index(FormatParser::MP3Parser)
|
58
|
+
zip_parser_idx = parsers_called_order.index(FormatParser::ZIPParser)
|
59
|
+
|
60
|
+
# The PNG parser should have been applied first
|
61
|
+
expect(png_parser_idx).to be < zip_parser_idx
|
62
|
+
# ...and the ZIP parser second (MP3 is the most omnivorous since there
|
63
|
+
# is no clear header or footer in the file)
|
64
|
+
expect(mp3_parser_idx).to be > zip_parser_idx
|
65
|
+
end
|
66
|
+
|
37
67
|
it "returns either a valid result or a nil for all fuzzed inputs at seed #{RSpec.configuration.seed}" do
|
38
68
|
r = Random.new(RSpec.configuration.seed)
|
39
69
|
1024.times do
|
@@ -63,6 +93,14 @@ describe FormatParser do
|
|
63
93
|
FormatParser.deregister_parser(exploit)
|
64
94
|
end
|
65
95
|
|
96
|
+
it 'correctly detects a PNG as a PNG without falling back to another filetype' do
|
97
|
+
File.open(fixtures_dir + '/PNG/simulator_screenie.png', 'rb') do |file|
|
98
|
+
file_information = FormatParser.parse(file)
|
99
|
+
expect(file_information).not_to be_nil
|
100
|
+
expect(file_information.format).to eq(:png)
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
66
104
|
describe 'when multiple results are requested' do
|
67
105
|
let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
|
68
106
|
let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
|
@@ -72,6 +72,13 @@ describe 'Fetching data from HTTP remotes' do
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
|
75
|
+
it 'correctly detects a PNG as a PNG without falling back to another filetype' do
|
76
|
+
remote_png_url = 'http://localhost:9399/PNG/simulator_screenie.png'
|
77
|
+
file_information = FormatParser.parse_http(remote_png_url)
|
78
|
+
expect(file_information).not_to be_nil
|
79
|
+
expect(file_information.format).to eq(:png)
|
80
|
+
end
|
81
|
+
|
75
82
|
describe 'when parsing remote fixtures' do
|
76
83
|
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
77
84
|
filename = File.basename(fixture_path)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.1
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-08-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|