format_parser 0.15.1 → 0.16.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b54b8e7c496c5d1e732503678bc762993bf53e9a
4
- data.tar.gz: ca7ce92f31f08a049b7475746e2017e6662b331a
3
+ metadata.gz: e49c10ea63c475b9bc61781ea8e76f42ef9e4307
4
+ data.tar.gz: c625105933c8654f5aac93bae09476e32917655b
5
5
  SHA512:
6
- metadata.gz: d3dcc4792c6097dc701beb1b124ae17ad345305b4e7e4ac46a132646ff2aefea13cdc350087f8582b9b2653b75181c6c00ffb12d6a5180716538a732cfe60c46
7
- data.tar.gz: 898ea93a39709415452b5256fdcee7acb69d9a1decc14c50b52695990ae399848c3f984787ab7374efb9b04b3bb758f0487a4b9da67c7fa83396ab5a5c114218
6
+ metadata.gz: cf72d2b021f6fc407b29cbc5b336fd71e022f6a40b67a79646ea58e605c33d34b44404339e7f15cf1c3f63729712a784ac75296fde54525f51a9c6276069619e
7
+ data.tar.gz: c68eadacfae8062e6fb1532d3e6976c349f1d3f7a950cafb7ea0bab714a125e81d3d32bccb154ed96c2118c142f984637ed17c6a53d1ee755778836ad008a7a0
@@ -1,9 +1,9 @@
1
1
  rvm:
2
2
  - 2.2.0
3
- - 2.3.0
4
- - 2.4.2
5
- - 2.5.0
6
- - jruby-9.0
3
+ - 2.4.6
4
+ - 2.5.5
5
+ - 2.6.3
6
+ - jruby
7
7
  sudo: false
8
8
  cache: bundler
9
9
  script:
@@ -1,3 +1,14 @@
1
+ ## 0.16.0
2
+ * Add `filename_hint` keyword argument to `FormatParser.parse`. This can hint the library to apply
3
+ the parser that will likely match for this filename first, and the other parsers later. This helps
4
+ avoid extra work when parsing less-popular file formats, and can optionally be used if the caller
5
+ knows the filename of the original file. Note that the filename is only that: a **hint.** It helps
6
+ apply parsers more efficiently but does not specify the actual format of the file that is going to
7
+ be detected.
8
+
9
+ ## 0.15.1
10
+ * Relax the "ks" dependency version since we do not need the constraint to be so strict
11
+
1
12
  ## 0.15.0
2
13
  * Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
3
14
  on detection confidence and file format popularity at WT.
@@ -86,18 +86,22 @@ module FormatParser
86
86
  # @param kwargs the keyword arguments to be delegated to `.parse`
87
87
  # @see {.parse}
88
88
  def self.parse_http(url, **kwargs)
89
+ # Do not extract the filename, since the URL
90
+ # can really be "anything". But if the caller
91
+ # provides filename_hint it will be carried over
89
92
  parse(RemoteIO.new(url), **kwargs)
90
93
  end
91
94
 
92
95
  # Parses the file at the given `path` and returns the results as if it were any IO
93
96
  # given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
97
+ # The basename of the file path will be used as the `filename_hint` for `.parse()`.
94
98
  #
95
99
  # @param path[String] the path to the file to parse on the local filesystem
96
100
  # @param kwargs the keyword arguments to be delegated to `.parse`
97
101
  # @see {.parse}
98
102
  def self.parse_file_at(path, **kwargs)
99
103
  File.open(path, 'rb') do |io|
100
- parse(io, **kwargs)
104
+ parse(io, filename_hint: File.basename(path), **kwargs)
101
105
  end
102
106
  end
103
107
 
@@ -116,9 +120,13 @@ module FormatParser
116
120
  # When using `:first` parsing will stop at the first successful match and other parsers won't run.
117
121
  # @param limits_config[ReadLimitsConfig] the configuration object for various read/cache limits. The default
118
122
  # one should be good for most cases.
123
+ # @param filename_hint[String?] the filename. If provided, the first parser applied will be the
124
+ # one that returns a truthy value from `likely_match?` with that filename as an argument. This way
125
+ # we can optimize the order of application of parsers and start with the one that is more likely
126
+ # to match.
119
127
  # @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
120
128
  # no useful metadata could be recovered from the file
121
- def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config)
129
+ def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config, filename_hint: nil)
122
130
  # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
123
131
  # of page faults (page cache misses) a parser may incur
124
132
  read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limits_config.max_pagefaults_per_parser)
@@ -140,7 +148,7 @@ module FormatParser
140
148
  # Always instantiate parsers fresh for each input, since they might
141
149
  # contain instance variables which otherwise would have to be reset
142
150
  # between invocations, and would complicate threading situations
143
- parsers = parsers_for(natures, formats)
151
+ parsers = parsers_for(natures, formats, filename_hint)
144
152
 
145
153
  # Limit how many operations the parser can perform
146
154
  limited_io = ReadLimiter.new(
@@ -225,9 +233,11 @@ module FormatParser
225
233
  #
226
234
  # @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
227
235
  # @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
236
+ # @param filename_hint[String?] the filename hint for the file. If provided,
237
+ # the parser that likely matches this filename will be applied first.
228
238
  # @return [Array<#call>] an array of callable parsers
229
239
  # @raise ArgumentError when there are no parsers satisfying the constraint
230
- def self.parsers_for(desired_natures, desired_formats)
240
+ def self.parsers_for(desired_natures, desired_formats, filename_hint = nil)
231
241
  assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
232
242
  hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
233
243
  }
@@ -246,6 +256,12 @@ module FormatParser
246
256
  @parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
247
257
  end
248
258
 
259
+ # If there is one parser that is more likely to match, place it first
260
+ if first_match = factories_in_order_of_priority.find { |f| f.respond_to?(:likely_match?) && f.likely_match?(filename_hint) }
261
+ factories_in_order_of_priority.delete(first_match)
262
+ factories_in_order_of_priority.unshift(first_match)
263
+ end
264
+
249
265
  factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
250
266
  end
251
267
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.15.1'
2
+ VERSION = '0.16.0'
3
3
  end
@@ -18,6 +18,10 @@ class FormatParser::AIFFParser
18
18
  'ANNO',
19
19
  ]
20
20
 
21
+ def self.likely_match?(filename)
22
+ filename =~ /\.aiff?$/i
23
+ end
24
+
21
25
  def call(io)
22
26
  io = FormatParser::IOConstraint.new(io)
23
27
  form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
@@ -6,6 +6,10 @@ class FormatParser::BMPParser
6
6
  VALID_BMP = 'BM'
7
7
  PERMISSIBLE_PIXEL_ARRAY_LOCATIONS = 40..512
8
8
 
9
+ def self.likely_match?(filename)
10
+ filename =~ /\.bmp$/i
11
+ end
12
+
9
13
  def call(io)
10
14
  io = FormatParser::IOConstraint.new(io)
11
15
 
@@ -7,6 +7,10 @@ class FormatParser::CR2Parser
7
7
  TIFF_HEADER = [0x49, 0x49, 0x2a, 0x00]
8
8
  CR2_HEADER = [0x43, 0x52, 0x02, 0x00]
9
9
 
10
+ def self.likely_match?(filename)
11
+ filename =~ /\.cr2$/i
12
+ end
13
+
10
14
  def call(io)
11
15
  io = FormatParser::IOConstraint.new(io)
12
16
 
@@ -19,6 +19,10 @@ class FormatParser::DPXParser
19
19
 
20
20
  private_constant :ByteOrderHintIO
21
21
 
22
+ def self.likely_match?(filename)
23
+ filename =~ /\.dpx$/i
24
+ end
25
+
22
26
  def call(io)
23
27
  io = FormatParser::IOConstraint.new(io)
24
28
  magic = safe_read(io, 4)
@@ -1,6 +1,10 @@
1
1
  class FormatParser::FDXParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ def self.likely_match?(filename)
5
+ filename =~ /\.fdx$/i
6
+ end
7
+
4
8
  def call(io)
5
9
  return unless xml_check(io)
6
10
  file_and_document_type = safe_read(io, 100)
@@ -5,8 +5,8 @@ class FormatParser::FLACParser
5
5
  MAGIC_BYTE_STRING = 'fLaC'
6
6
  BLOCK_HEADER_BYTES = 4
7
7
 
8
- def bytestring_to_int(s)
9
- s.unpack('B*')[0].to_i(2)
8
+ def self.likely_match?(filename)
9
+ filename =~ /\.flac$/i
10
10
  end
11
11
 
12
12
  def call(io)
@@ -71,5 +71,9 @@ class FormatParser::FLACParser
71
71
  )
72
72
  end
73
73
 
74
+ def bytestring_to_int(s)
75
+ s.unpack('B*')[0].to_i(2)
76
+ end
77
+
74
78
  FormatParser.register_parser self, natures: :audio, formats: :flac
75
79
  end
@@ -4,6 +4,10 @@ class FormatParser::GIFParser
4
4
  HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
5
5
  NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
6
6
 
7
+ def self.likely_match?(filename)
8
+ filename =~ /\.gif$/i
9
+ end
10
+
7
11
  def call(io)
8
12
  io = FormatParser::IOConstraint.new(io)
9
13
  header = safe_read(io, 6)
@@ -13,6 +13,10 @@ class FormatParser::JPEGParser
13
13
  EXIF_MAGIC_STRING = "Exif\0\0".b
14
14
  MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
15
15
 
16
+ def self.likely_match?(filename)
17
+ filename =~ /\.jpe?g$/i
18
+ end
19
+
16
20
  def call(io)
17
21
  @buf = FormatParser::IOConstraint.new(io)
18
22
  @width = nil
@@ -11,6 +11,10 @@ class FormatParser::MOOVParser
11
11
  'm4a ' => :m4a,
12
12
  }
13
13
 
14
+ def self.likely_match?(filename)
15
+ filename =~ /\.(mov|m4a|ma4|mp4|aac)$/i
16
+ end
17
+
14
18
  def call(io)
15
19
  return unless matches_moov_definition?(io)
16
20
 
@@ -53,6 +53,10 @@ class FormatParser::MP3Parser
53
53
  end
54
54
  end
55
55
 
56
+ def self.likely_match?(filename)
57
+ filename =~ /\.mp3$/i
58
+ end
59
+
56
60
  def call(raw_io)
57
61
  io = FormatParser::IOConstraint.new(raw_io)
58
62
 
@@ -6,6 +6,10 @@ class FormatParser::OggParser
6
6
  # Maximum size of an Ogg page
7
7
  MAX_POSSIBLE_PAGE_SIZE = 65307
8
8
 
9
+ def self.likely_match?(filename)
10
+ filename =~ /\.ogg$/i
11
+ end
12
+
9
13
  def call(io)
10
14
  # The format consists of chunks of data each called an "Ogg page". Each page
11
15
  # begins with the characters, "OggS", to identify the file as Ogg format.
@@ -9,6 +9,10 @@ class FormatParser::PDFParser
9
9
  #
10
10
  PDF_MARKER = /%PDF-1\.[0-8]{1}/
11
11
 
12
+ def self.likely_match?(filename)
13
+ filename =~ /\.(pdf|ai)$/i
14
+ end
15
+
12
16
  def call(io)
13
17
  io = FormatParser::IOConstraint.new(io)
14
18
 
@@ -15,8 +15,8 @@ class FormatParser::PNGParser
15
15
  6 => true,
16
16
  }
17
17
 
18
- def chunk_length_and_type(io)
19
- safe_read(io, 8).unpack('Na4')
18
+ def self.likely_match?(filename)
19
+ filename =~ /\.png$/i
20
20
  end
21
21
 
22
22
  def call(io)
@@ -70,6 +70,10 @@ class FormatParser::PNGParser
70
70
  )
71
71
  end
72
72
 
73
+ def chunk_length_and_type(io)
74
+ safe_read(io, 8).unpack('Na4')
75
+ end
76
+
73
77
  # Give it priority 1 since priority 0 is reserved for JPEG, our most popular
74
78
  FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
75
79
  end
@@ -3,6 +3,10 @@ class FormatParser::PSDParser
3
3
 
4
4
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
5
5
 
6
+ def self.likely_match?(filename)
7
+ filename =~ /\.psd$/i # Maybe also PSB at some point
8
+ end
9
+
6
10
  def call(io)
7
11
  io = FormatParser::IOConstraint.new(io)
8
12
  magic_bytes = safe_read(io, 4).unpack('C4')
@@ -5,6 +5,10 @@ class FormatParser::TIFFParser
5
5
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
6
6
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
7
7
 
8
+ def self.likely_match?(filename)
9
+ filename =~ /\.tiff?$/i
10
+ end
11
+
8
12
  def call(io)
9
13
  io = FormatParser::IOConstraint.new(io)
10
14
 
@@ -1,6 +1,10 @@
1
1
  class FormatParser::WAVParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ def self.likely_match?(filename)
5
+ filename =~ /\.wav$/i
6
+ end
7
+
4
8
  def call(io)
5
9
  # Read the RIFF header. Chunk descriptor should be RIFF, the size should
6
10
  # contain the size of the entire file in bytes minus 8 bytes for the
@@ -5,6 +5,10 @@ class FormatParser::ZIPParser
5
5
  include OfficeFormats
6
6
  include FormatParser::IOUtils
7
7
 
8
+ def self.likely_match?(filename)
9
+ filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
10
+ end
11
+
8
12
  def call(io)
9
13
  io = FormatParser::IOConstraint.new(io)
10
14
  safe_read(io, 1) # Ensure the file is not empty
@@ -139,7 +139,7 @@ describe FormatParser do
139
139
 
140
140
  it 'passes keyword arguments to parse()' do
141
141
  path = fixtures_dir + '/WAV/c_M1F1-Alaw-AFsp.wav'
142
- expect(FormatParser).to receive(:parse).with(an_instance_of(File), foo: :bar)
142
+ expect(FormatParser).to receive(:parse).with(an_instance_of(File), filename_hint: 'c_M1F1-Alaw-AFsp.wav', foo: :bar)
143
143
  FormatParser.parse_file_at(path, foo: :bar)
144
144
  end
145
145
  end
@@ -165,6 +165,14 @@ describe FormatParser do
165
165
  image_parsers = FormatParser.parsers_for([:image], [:tif, :jpg, :aiff, :mp3])
166
166
  expect(image_parsers.length).to eq(2)
167
167
  end
168
+
169
+ it 'returns an array with the ZIPParser first if the filename_hint is for a ZIP file' do
170
+ prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], nil)
171
+ expect(prioritized_parsers.first).not_to be_kind_of(FormatParser::ZIPParser)
172
+
173
+ prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
174
+ expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
175
+ end
168
176
  end
169
177
 
170
178
  describe '.register_parser and .deregister_parser' do
@@ -1,6 +1,11 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe FormatParser::ZIPParser do
4
+ it 'provides filename hints' do
5
+ expect(FormatParser::ZIPParser).to be_likely_match('file.zip')
6
+ expect(FormatParser::ZIPParser).not_to be_likely_match('file.tif')
7
+ end
8
+
4
9
  it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
5
10
  fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
6
11
  fi_io = File.open(fixture_path, 'rb')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.1
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2019-01-29 00:00:00.000000000 Z
12
+ date: 2019-07-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks