format_parser 0.14.1 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90513000619c11b5a98669f465add204ca68465065681424627adcdf66c634dd
4
- data.tar.gz: 1515277ec5edc3bedc7631a7a3e5b56975010e1d7a84b5e166b4eae6977eb90a
3
+ metadata.gz: 32b5a18eb53b1c5197a828757056354c655225fe7047418a615bfb8f950a6805
4
+ data.tar.gz: df289b5f4039f89d78b35e481efe50ea95f654497a7783bbaf0a35b997d630ed
5
5
  SHA512:
6
- metadata.gz: ec36c70f585ebc72089be82bba55b21b83d53d59a4233f33be4e78cbde5f7b12bdb99587238a00cfdf023a24daafadc8803d43e88bacd994c97cfe47c3aac4f1
7
- data.tar.gz: 1e964c45d81b43def5575331f02f0949892ddc06f006c7b83275f72cffafefef25cef6b2b8627db0edb6969a8da1e72fa612f600091630fe0822f842062e75d1
6
+ metadata.gz: 358c318bf9f5262edb1acde1854e5caebaac3e945502d1d164f982f0d8fd26eb717ed5c43b3311f130d6ca2804b8c2bb1c0f1a596f462b01c59cd5a79267b0fb
7
+ data.tar.gz: 9482a5702896f47ba6da0d99d0e5ef31d51981bdd9271a764872fd06f0a210e19202c1c4fbd111111645d894703646648e8a7f003df7ea2419deef3254ea32da
@@ -1,3 +1,7 @@
1
+ ## 0.15.0
2
+ * Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
3
+ on detection confidence and file format popularity at WT.
4
+
1
5
  ## 0.14.1
2
6
  * Care caching: Clear pages more deliberately instead of relegating them to GC
3
7
  * JPEG: Clear the EXIF buffer explicitly
data/README.md CHANGED
@@ -157,6 +157,9 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
157
157
  ### M4A
158
158
  - fixture.m4a was created by one of the project maintainers and is MIT licensed
159
159
 
160
+ ### PNG
161
+ - `simulator_screenie.png` provided by [Rens Verhoeven](https://github.com/renssies)
162
+
160
163
  ### TIFF
161
164
  - `Shinbutsureijoushuincho.tiff` is obtained from Wikimedia Commons and is Creative Commons licensed
162
165
  - `IMG_9266_*.tif` and all its variations were created by the project maintainers
@@ -28,14 +28,22 @@ module FormatParser
28
28
  PARSER_MUX = Mutex.new
29
29
  MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
30
30
 
31
+ # The value will ensure the parser having it will be applied to the file last.
32
+ LEAST_PRIORITY = 99
33
+
31
34
  # Register a parser object to be used to perform file format detection. Each parser FormatParser
32
35
  # provides out of the box registers itself using this method.
33
36
  #
34
37
  # @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
35
38
  # @param formats[Array<Symbol>] file formats that the parser provides
36
39
  # @param natures[Array<Symbol>] file natures that the parser provides
40
+ # @param priority[Integer] whether the parser has to be applied first or later. Parsers that offer the safest
41
+ # detection and have the most popular file formats should get a lower priority value (0 or 1); the default
42
+ # priority is 99. Before parsing parsers get sorted according to their priority value ascending, so parsers
43
+ # with a lower priority value will be applied first, and if a single result is requested, will also return
44
+ # first.
37
45
  # @return void
38
- def self.register_parser(callable_or_responding_to_new, formats:, natures:)
46
+ def self.register_parser(callable_or_responding_to_new, formats:, natures:, priority: LEAST_PRIORITY)
39
47
  parser_provided_formats = Array(formats)
40
48
  parser_provided_natures = Array(natures)
41
49
  PARSER_MUX.synchronize do
@@ -51,6 +59,8 @@ module FormatParser
51
59
  @parsers_per_format[provided_format] ||= Set.new
52
60
  @parsers_per_format[provided_format] << callable_or_responding_to_new
53
61
  end
62
+ @parser_priorities ||= {}
63
+ @parser_priorities[callable_or_responding_to_new] = priority
54
64
  end
55
65
  end
56
66
 
@@ -65,6 +75,7 @@ module FormatParser
65
75
  (@parsers || []).delete(callable_or_responding_to_new)
66
76
  (@parsers_per_nature || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
67
77
  (@parsers_per_format || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
78
+ (@parser_priorities || {}).delete(callable_or_responding_to_new)
68
79
  end
69
80
  end
70
81
 
@@ -229,7 +240,13 @@ module FormatParser
229
240
  raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}"
230
241
  end
231
242
 
232
- factories.map { |callable_or_class| instantiate_parser(callable_or_class) }
243
+ # Order the parsers according to their priority value. The ones having a lower
244
+ # value will sort higher and will be applied sooner
245
+ factories_in_order_of_priority = factories.to_a.sort do |parser_factory_a, parser_factory_b|
246
+ @parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
247
+ end
248
+
249
+ factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
233
250
  end
234
251
 
235
252
  # Instantiates a parser object (an object that responds to `#call`) from a given class
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.14.1'
2
+ VERSION = '0.15.0'
3
3
  end
@@ -44,5 +44,5 @@ class FormatParser::GIFParser
44
44
  )
45
45
  end
46
46
 
47
- FormatParser.register_parser self, natures: :image, formats: :gif
47
+ FormatParser.register_parser self, natures: :image, formats: :gif, priority: 0
48
48
  end
@@ -163,5 +163,5 @@ class FormatParser::JPEGParser
163
163
  safe_skip(@buf, length)
164
164
  end
165
165
 
166
- FormatParser.register_parser self, natures: :image, formats: :jpg
166
+ FormatParser.register_parser self, natures: :image, formats: :jpg, priority: 0
167
167
  end
@@ -83,5 +83,5 @@ class FormatParser::MOOVParser
83
83
  maybe_atom_size >= minimum_ftyp_atom_size && maybe_ftyp_atom_signature == 'ftyp'
84
84
  end
85
85
 
86
- FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values
86
+ FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values, priority: 1
87
87
  end
@@ -275,5 +275,5 @@ class FormatParser::MP3Parser
275
275
  end
276
276
  end
277
277
 
278
- FormatParser.register_parser self, natures: :audio, formats: :mp3
278
+ FormatParser.register_parser self, natures: :audio, formats: :mp3, priority: 99
279
279
  end
@@ -17,5 +17,5 @@ class FormatParser::PDFParser
17
17
  FormatParser::Document.new(format: :pdf)
18
18
  end
19
19
 
20
- FormatParser.register_parser self, natures: :document, formats: :pdf
20
+ FormatParser.register_parser self, natures: :document, formats: :pdf, priority: 1
21
21
  end
@@ -70,5 +70,6 @@ class FormatParser::PNGParser
70
70
  )
71
71
  end
72
72
 
73
- FormatParser.register_parser self, natures: :image, formats: :png
73
+ # Give it priority 1 since priority 0 is reserved for JPEG, our most popular format
74
+ FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
74
75
  end
@@ -54,5 +54,5 @@ class FormatParser::ZIPParser
54
54
  end
55
55
  end
56
56
 
57
- FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
57
+ FormatParser.register_parser self, natures: [:archive, :document], formats: :zip, priority: 2
58
58
  end
@@ -34,6 +34,36 @@ describe FormatParser do
34
34
  end
35
35
  end
36
36
 
37
+ it 'triggers parsers in a certain order that corresponds to the parser priorities' do
38
+ file_contents = StringIO.new('a' * 4096)
39
+
40
+ parsers_called_order = []
41
+ expect_any_instance_of(FormatParser::PNGParser).to receive(:call) { |instance|
42
+ parsers_called_order << instance.class
43
+ nil
44
+ }
45
+ expect_any_instance_of(FormatParser::MP3Parser).to receive(:call) { |instance|
46
+ parsers_called_order << instance.class
47
+ nil
48
+ }
49
+ expect_any_instance_of(FormatParser::ZIPParser).to receive(:call) { |instance|
50
+ parsers_called_order << instance.class
51
+ nil
52
+ }
53
+
54
+ FormatParser.parse(file_contents)
55
+
56
+ png_parser_idx = parsers_called_order.index(FormatParser::PNGParser)
57
+ mp3_parser_idx = parsers_called_order.index(FormatParser::MP3Parser)
58
+ zip_parser_idx = parsers_called_order.index(FormatParser::ZIPParser)
59
+
60
+ # The PNG parser should have been applied first
61
+ expect(png_parser_idx).to be < zip_parser_idx
62
+ # ...and the ZIP parser second (MP3 is the most omnivorous since there
63
+ # is no clear header or footer in the file
64
+ expect(mp3_parser_idx).to be > zip_parser_idx
65
+ end
66
+
37
67
  it "returns either a valid result or a nil for all fuzzed inputs at seed #{RSpec.configuration.seed}" do
38
68
  r = Random.new(RSpec.configuration.seed)
39
69
  1024.times do
@@ -63,6 +93,14 @@ describe FormatParser do
63
93
  FormatParser.deregister_parser(exploit)
64
94
  end
65
95
 
96
+ it 'correctly detects a PNG as a PNG without falling back to another filetype' do
97
+ File.open(fixtures_dir + '/PNG/simulator_screenie.png', 'rb') do |file|
98
+ file_information = FormatParser.parse(file)
99
+ expect(file_information).not_to be_nil
100
+ expect(file_information.format).to eq(:png)
101
+ end
102
+ end
103
+
66
104
  describe 'when multiple results are requested' do
67
105
  let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
68
106
  let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
@@ -72,6 +72,13 @@ describe 'Fetching data from HTTP remotes' do
72
72
  end
73
73
  end
74
74
 
75
+ it 'correctly detects a PNG as a PNG without falling back to another filetype' do
76
+ remote_png_url = 'http://localhost:9399/PNG/simulator_screenie.png'
77
+ file_information = FormatParser.parse_http(remote_png_url)
78
+ expect(file_information).not_to be_nil
79
+ expect(file_information.format).to eq(:png)
80
+ end
81
+
75
82
  describe 'when parsing remote fixtures' do
76
83
  Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
77
84
  filename = File.basename(fixture_path)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-07-27 00:00:00.000000000 Z
12
+ date: 2018-08-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks