format_parser 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90513000619c11b5a98669f465add204ca68465065681424627adcdf66c634dd
4
- data.tar.gz: 1515277ec5edc3bedc7631a7a3e5b56975010e1d7a84b5e166b4eae6977eb90a
3
+ metadata.gz: 32b5a18eb53b1c5197a828757056354c655225fe7047418a615bfb8f950a6805
4
+ data.tar.gz: df289b5f4039f89d78b35e481efe50ea95f654497a7783bbaf0a35b997d630ed
5
5
  SHA512:
6
- metadata.gz: ec36c70f585ebc72089be82bba55b21b83d53d59a4233f33be4e78cbde5f7b12bdb99587238a00cfdf023a24daafadc8803d43e88bacd994c97cfe47c3aac4f1
7
- data.tar.gz: 1e964c45d81b43def5575331f02f0949892ddc06f006c7b83275f72cffafefef25cef6b2b8627db0edb6969a8da1e72fa612f600091630fe0822f842062e75d1
6
+ metadata.gz: 358c318bf9f5262edb1acde1854e5caebaac3e945502d1d164f982f0d8fd26eb717ed5c43b3311f130d6ca2804b8c2bb1c0f1a596f462b01c59cd5a79267b0fb
7
+ data.tar.gz: 9482a5702896f47ba6da0d99d0e5ef31d51981bdd9271a764872fd06f0a210e19202c1c4fbd111111645d894703646648e8a7f003df7ea2419deef3254ea32da
@@ -1,3 +1,7 @@
1
+ ## 0.15.0
2
+ * Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
3
+ on detection confidence and file format popularity at WT.
4
+
1
5
  ## 0.14.1
2
6
  * Care caching: Clear pages more deliberately instead of relegating them to GC
3
7
  * JPEG: Clear the EXIF buffer explicitly
data/README.md CHANGED
@@ -157,6 +157,9 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
157
157
  ### M4A
158
158
  - fixture.m4a was created by one of the project maintainers and is MIT licensed
159
159
 
160
+ ### PNG
161
+ - `simulator_screenie.png` provided by [Rens Verhoeven](https://github.com/renssies)
162
+
160
163
  ### TIFF
161
164
  - `Shinbutsureijoushuincho.tiff` is obtained from Wikimedia Commons and is Creative Commons licensed
162
165
  - `IMG_9266_*.tif` and all its variations were created by the project maintainers
@@ -28,14 +28,22 @@ module FormatParser
28
28
  PARSER_MUX = Mutex.new
29
29
  MAX_BYTES_READ_PER_PARSER = 1024 * 1024 * 2
30
30
 
31
+ # The value will ensure the parser having it will be applied to the file last.
32
+ LEAST_PRIORITY = 99
33
+
31
34
  # Register a parser object to be used to perform file format detection. Each parser FormatParser
32
35
  # provides out of the box registers itself using this method.
33
36
  #
34
37
  # @param callable_or_responding_to_new[#call, #new] an object that either responds to #new or to #call
35
38
  # @param formats[Array<Symbol>] file formats that the parser provides
36
39
  # @param natures[Array<Symbol>] file natures that the parser provides
40
+ # @param priority[Integer] whether the parser has to be applied first or later. Parsers that offer the safest
41
+ # detection and have the most popular file formats should get a lower priority value (0 or 1); the default
42
+ # priority is 99. Before parsing, parsers are sorted by ascending priority value, so parsers
43
+ # with a lower priority value will be applied first, and if a single result is requested, will also return
44
+ # first.
37
45
  # @return void
38
- def self.register_parser(callable_or_responding_to_new, formats:, natures:)
46
+ def self.register_parser(callable_or_responding_to_new, formats:, natures:, priority: LEAST_PRIORITY)
39
47
  parser_provided_formats = Array(formats)
40
48
  parser_provided_natures = Array(natures)
41
49
  PARSER_MUX.synchronize do
@@ -51,6 +59,8 @@ module FormatParser
51
59
  @parsers_per_format[provided_format] ||= Set.new
52
60
  @parsers_per_format[provided_format] << callable_or_responding_to_new
53
61
  end
62
+ @parser_priorities ||= {}
63
+ @parser_priorities[callable_or_responding_to_new] = priority
54
64
  end
55
65
  end
56
66
 
@@ -65,6 +75,7 @@ module FormatParser
65
75
  (@parsers || []).delete(callable_or_responding_to_new)
66
76
  (@parsers_per_nature || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
67
77
  (@parsers_per_format || {}).values.map { |e| e.delete(callable_or_responding_to_new) }
78
+ (@parser_priorities || {}).delete(callable_or_responding_to_new)
68
79
  end
69
80
  end
70
81
 
@@ -229,7 +240,13 @@ module FormatParser
229
240
  raise ArgumentError, "No parsers provide both natures #{desired_natures.inspect} and formats #{desired_formats.inspect}"
230
241
  end
231
242
 
232
- factories.map { |callable_or_class| instantiate_parser(callable_or_class) }
243
+ # Order the parsers according to their priority value. The ones having a lower
244
+ # value will sort higher and will be applied sooner
245
+ factories_in_order_of_priority = factories.to_a.sort do |parser_factory_a, parser_factory_b|
246
+ @parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
247
+ end
248
+
249
+ factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
233
250
  end
234
251
 
235
252
  # Instantiates a parser object (an object that responds to `#call`) from a given class
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.14.1'
2
+ VERSION = '0.15.0'
3
3
  end
@@ -44,5 +44,5 @@ class FormatParser::GIFParser
44
44
  )
45
45
  end
46
46
 
47
- FormatParser.register_parser self, natures: :image, formats: :gif
47
+ FormatParser.register_parser self, natures: :image, formats: :gif, priority: 0
48
48
  end
@@ -163,5 +163,5 @@ class FormatParser::JPEGParser
163
163
  safe_skip(@buf, length)
164
164
  end
165
165
 
166
- FormatParser.register_parser self, natures: :image, formats: :jpg
166
+ FormatParser.register_parser self, natures: :image, formats: :jpg, priority: 0
167
167
  end
@@ -83,5 +83,5 @@ class FormatParser::MOOVParser
83
83
  maybe_atom_size >= minimum_ftyp_atom_size && maybe_ftyp_atom_signature == 'ftyp'
84
84
  end
85
85
 
86
- FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values
86
+ FormatParser.register_parser self, natures: :video, formats: FTYP_MAP.values, priority: 1
87
87
  end
@@ -275,5 +275,5 @@ class FormatParser::MP3Parser
275
275
  end
276
276
  end
277
277
 
278
- FormatParser.register_parser self, natures: :audio, formats: :mp3
278
+ FormatParser.register_parser self, natures: :audio, formats: :mp3, priority: 99
279
279
  end
@@ -17,5 +17,5 @@ class FormatParser::PDFParser
17
17
  FormatParser::Document.new(format: :pdf)
18
18
  end
19
19
 
20
- FormatParser.register_parser self, natures: :document, formats: :pdf
20
+ FormatParser.register_parser self, natures: :document, formats: :pdf, priority: 1
21
21
  end
@@ -70,5 +70,6 @@ class FormatParser::PNGParser
70
70
  )
71
71
  end
72
72
 
73
- FormatParser.register_parser self, natures: :image, formats: :png
73
+ # Give it priority 1 since priority 0 is reserved for JPEG, our most popular format
74
+ FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
74
75
  end
@@ -54,5 +54,5 @@ class FormatParser::ZIPParser
54
54
  end
55
55
  end
56
56
 
57
- FormatParser.register_parser self, natures: [:archive, :document], formats: :zip
57
+ FormatParser.register_parser self, natures: [:archive, :document], formats: :zip, priority: 2
58
58
  end
@@ -34,6 +34,36 @@ describe FormatParser do
34
34
  end
35
35
  end
36
36
 
37
+ it 'triggers parsers in a certain order that corresponds to the parser priorities' do
38
+ file_contents = StringIO.new('a' * 4096)
39
+
40
+ parsers_called_order = []
41
+ expect_any_instance_of(FormatParser::PNGParser).to receive(:call) { |instance|
42
+ parsers_called_order << instance.class
43
+ nil
44
+ }
45
+ expect_any_instance_of(FormatParser::MP3Parser).to receive(:call) { |instance|
46
+ parsers_called_order << instance.class
47
+ nil
48
+ }
49
+ expect_any_instance_of(FormatParser::ZIPParser).to receive(:call) { |instance|
50
+ parsers_called_order << instance.class
51
+ nil
52
+ }
53
+
54
+ FormatParser.parse(file_contents)
55
+
56
+ png_parser_idx = parsers_called_order.index(FormatParser::PNGParser)
57
+ mp3_parser_idx = parsers_called_order.index(FormatParser::MP3Parser)
58
+ zip_parser_idx = parsers_called_order.index(FormatParser::ZIPParser)
59
+
60
+ # The PNG parser should have been applied first
61
+ expect(png_parser_idx).to be < zip_parser_idx
62
+ # ...and the ZIP parser second (MP3 is the most omnivorous since there
63
+ # is no clear header or footer in the file)
64
+ expect(mp3_parser_idx).to be > zip_parser_idx
65
+ end
66
+
37
67
  it "returns either a valid result or a nil for all fuzzed inputs at seed #{RSpec.configuration.seed}" do
38
68
  r = Random.new(RSpec.configuration.seed)
39
69
  1024.times do
@@ -63,6 +93,14 @@ describe FormatParser do
63
93
  FormatParser.deregister_parser(exploit)
64
94
  end
65
95
 
96
+ it 'correctly detects a PNG as a PNG without falling back to another filetype' do
97
+ File.open(fixtures_dir + '/PNG/simulator_screenie.png', 'rb') do |file|
98
+ file_information = FormatParser.parse(file)
99
+ expect(file_information).not_to be_nil
100
+ expect(file_information.format).to eq(:png)
101
+ end
102
+ end
103
+
66
104
  describe 'when multiple results are requested' do
67
105
  let(:blob) { StringIO.new(Random.new.bytes(512 * 1024)) }
68
106
  let(:audio) { FormatParser::Audio.new(format: :aiff, num_audio_channels: 1) }
@@ -72,6 +72,13 @@ describe 'Fetching data from HTTP remotes' do
72
72
  end
73
73
  end
74
74
 
75
+ it 'correctly detects a PNG as a PNG without falling back to another filetype' do
76
+ remote_png_url = 'http://localhost:9399/PNG/simulator_screenie.png'
77
+ file_information = FormatParser.parse_http(remote_png_url)
78
+ expect(file_information).not_to be_nil
79
+ expect(file_information.format).to eq(:png)
80
+ end
81
+
75
82
  describe 'when parsing remote fixtures' do
76
83
  Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
77
84
  filename = File.basename(fixture_path)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-07-27 00:00:00.000000000 Z
12
+ date: 2018-08-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks