format_parser 0.15.1 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b54b8e7c496c5d1e732503678bc762993bf53e9a
4
- data.tar.gz: ca7ce92f31f08a049b7475746e2017e6662b331a
3
+ metadata.gz: e49c10ea63c475b9bc61781ea8e76f42ef9e4307
4
+ data.tar.gz: c625105933c8654f5aac93bae09476e32917655b
5
5
  SHA512:
6
- metadata.gz: d3dcc4792c6097dc701beb1b124ae17ad345305b4e7e4ac46a132646ff2aefea13cdc350087f8582b9b2653b75181c6c00ffb12d6a5180716538a732cfe60c46
7
- data.tar.gz: 898ea93a39709415452b5256fdcee7acb69d9a1decc14c50b52695990ae399848c3f984787ab7374efb9b04b3bb758f0487a4b9da67c7fa83396ab5a5c114218
6
+ metadata.gz: cf72d2b021f6fc407b29cbc5b336fd71e022f6a40b67a79646ea58e605c33d34b44404339e7f15cf1c3f63729712a784ac75296fde54525f51a9c6276069619e
7
+ data.tar.gz: c68eadacfae8062e6fb1532d3e6976c349f1d3f7a950cafb7ea0bab714a125e81d3d32bccb154ed96c2118c142f984637ed17c6a53d1ee755778836ad008a7a0
@@ -1,9 +1,9 @@
1
1
  rvm:
2
2
  - 2.2.0
3
- - 2.3.0
4
- - 2.4.2
5
- - 2.5.0
6
- - jruby-9.0
3
+ - 2.4.6
4
+ - 2.5.5
5
+ - 2.6.3
6
+ - jruby
7
7
  sudo: false
8
8
  cache: bundler
9
9
  script:
@@ -1,3 +1,14 @@
1
+ ## 0.16.0
2
+ * Add `filename_hint` keyword argument to `FormatParser.parse`. This can hint the library to apply
3
+ the parser that will likely match for this filename first, and the other parsers later. This helps
4
+ avoid extra work when parsing less-popular file formats, and can be optionally used if the caller
5
+ knows the filename of the original file. Note that the filename is only that: a **hint**. It helps
6
+ apply parsers more efficiently but does not specify the actual format of the file that is going to
7
+ be detected.
8
+
9
+ ## 0.15.1
10
+ * Relax the "ks" dependency version since we do not need the constraint to be so strict
11
+
1
12
  ## 0.15.0
2
13
  * Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
3
14
  on detection confidence and file format popularity at WT.
@@ -86,18 +86,22 @@ module FormatParser
86
86
  # @param kwargs the keyword arguments to be delegated to `.parse`
87
87
  # @see {.parse}
88
88
  def self.parse_http(url, **kwargs)
89
+ # Do not extract the filename, since the URL
90
+ # can really be "anything". But if the caller
91
+ # provides filename_hint it will be carried over
89
92
  parse(RemoteIO.new(url), **kwargs)
90
93
  end
91
94
 
92
95
  # Parses the file at the given `path` and returns the results as if it were any IO
93
96
  # given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
97
+ # The file path will be used to provide the `filename_hint` to `.parse()`.
94
98
  #
95
99
  # @param path[String] the path to the file to parse on the local filesystem
96
100
  # @param kwargs the keyword arguments to be delegated to `.parse`
97
101
  # @see {.parse}
98
102
  def self.parse_file_at(path, **kwargs)
99
103
  File.open(path, 'rb') do |io|
100
- parse(io, **kwargs)
104
+ parse(io, filename_hint: File.basename(path), **kwargs)
101
105
  end
102
106
  end
103
107
 
@@ -116,9 +120,13 @@ module FormatParser
116
120
  # When using `:first` parsing will stop at the first successful match and other parsers won't run.
117
121
  # @param limits_config[ReadLimitsConfig] the configuration object for various read/cache limits. The default
118
122
  # one should be good for most cases.
123
+ # @param filename_hint[String?] the filename. If provided, the first parser applied will be the
124
+ # one that returns a truthy value from `likely_match?` with that filename as an argument. This way
125
+ # we can optimize the order of application of parsers and start with the one that is more likely
126
+ # to match.
119
127
  # @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
120
128
  # no useful metadata could be recovered from the file
121
- def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config)
129
+ def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config, filename_hint: nil)
122
130
  # Limit the number of cached _pages_ we may fetch. This allows us to limit the number
123
131
  # of page faults (page cache misses) a parser may incur
124
132
  read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limits_config.max_pagefaults_per_parser)
@@ -140,7 +148,7 @@ module FormatParser
140
148
  # Always instantiate parsers fresh for each input, since they might
141
149
  # contain instance variables which otherwise would have to be reset
142
150
  # between invocations, and would complicate threading situations
143
- parsers = parsers_for(natures, formats)
151
+ parsers = parsers_for(natures, formats, filename_hint)
144
152
 
145
153
  # Limit how many operations the parser can perform
146
154
  limited_io = ReadLimiter.new(
@@ -225,9 +233,11 @@ module FormatParser
225
233
  #
226
234
  # @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
227
235
  # @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
236
+ # @param filename_hint[String?] the filename hint for the file. If provided,
237
+ # the parser that likely matches this filename will be applied first.
228
238
  # @return [Array<#call>] an array of callable parsers
229
239
  # @raise ArgumentError when there are no parsers satisfying the constraint
230
- def self.parsers_for(desired_natures, desired_formats)
240
+ def self.parsers_for(desired_natures, desired_formats, filename_hint = nil)
231
241
  assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
232
242
  hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
233
243
  }
@@ -246,6 +256,12 @@ module FormatParser
246
256
  @parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
247
257
  end
248
258
 
259
+ # If there is one parser that is more likely to match, place it first
260
+ if first_match = factories_in_order_of_priority.find { |f| f.respond_to?(:likely_match?) && f.likely_match?(filename_hint) }
261
+ factories_in_order_of_priority.delete(first_match)
262
+ factories_in_order_of_priority.unshift(first_match)
263
+ end
264
+
249
265
  factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
250
266
  end
251
267
 
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '0.15.1'
2
+ VERSION = '0.16.0'
3
3
  end
@@ -18,6 +18,10 @@ class FormatParser::AIFFParser
18
18
  'ANNO',
19
19
  ]
20
20
 
21
+ def self.likely_match?(filename)
22
+ filename =~ /\.aiff?$/i
23
+ end
24
+
21
25
  def call(io)
22
26
  io = FormatParser::IOConstraint.new(io)
23
27
  form_chunk_type, chunk_size = safe_read(io, 8).unpack('a4N')
@@ -6,6 +6,10 @@ class FormatParser::BMPParser
6
6
  VALID_BMP = 'BM'
7
7
  PERMISSIBLE_PIXEL_ARRAY_LOCATIONS = 40..512
8
8
 
9
+ def self.likely_match?(filename)
10
+ filename =~ /\.bmp$/i
11
+ end
12
+
9
13
  def call(io)
10
14
  io = FormatParser::IOConstraint.new(io)
11
15
 
@@ -7,6 +7,10 @@ class FormatParser::CR2Parser
7
7
  TIFF_HEADER = [0x49, 0x49, 0x2a, 0x00]
8
8
  CR2_HEADER = [0x43, 0x52, 0x02, 0x00]
9
9
 
10
+ def self.likely_match?(filename)
11
+ filename =~ /\.cr2$/i
12
+ end
13
+
10
14
  def call(io)
11
15
  io = FormatParser::IOConstraint.new(io)
12
16
 
@@ -19,6 +19,10 @@ class FormatParser::DPXParser
19
19
 
20
20
  private_constant :ByteOrderHintIO
21
21
 
22
+ def self.likely_match?(filename)
23
+ filename =~ /\.dpx$/i
24
+ end
25
+
22
26
  def call(io)
23
27
  io = FormatParser::IOConstraint.new(io)
24
28
  magic = safe_read(io, 4)
@@ -1,6 +1,10 @@
1
1
  class FormatParser::FDXParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ def self.likely_match?(filename)
5
+ filename =~ /\.fdx$/i
6
+ end
7
+
4
8
  def call(io)
5
9
  return unless xml_check(io)
6
10
  file_and_document_type = safe_read(io, 100)
@@ -5,8 +5,8 @@ class FormatParser::FLACParser
5
5
  MAGIC_BYTE_STRING = 'fLaC'
6
6
  BLOCK_HEADER_BYTES = 4
7
7
 
8
- def bytestring_to_int(s)
9
- s.unpack('B*')[0].to_i(2)
8
+ def self.likely_match?(filename)
9
+ filename =~ /\.flac$/i
10
10
  end
11
11
 
12
12
  def call(io)
@@ -71,5 +71,9 @@ class FormatParser::FLACParser
71
71
  )
72
72
  end
73
73
 
74
+ def bytestring_to_int(s)
75
+ s.unpack('B*')[0].to_i(2)
76
+ end
77
+
74
78
  FormatParser.register_parser self, natures: :audio, formats: :flac
75
79
  end
@@ -4,6 +4,10 @@ class FormatParser::GIFParser
4
4
  HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
5
5
  NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
6
6
 
7
+ def self.likely_match?(filename)
8
+ filename =~ /\.gif$/i
9
+ end
10
+
7
11
  def call(io)
8
12
  io = FormatParser::IOConstraint.new(io)
9
13
  header = safe_read(io, 6)
@@ -13,6 +13,10 @@ class FormatParser::JPEGParser
13
13
  EXIF_MAGIC_STRING = "Exif\0\0".b
14
14
  MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
15
15
 
16
+ def self.likely_match?(filename)
17
+ filename =~ /\.jpe?g$/i
18
+ end
19
+
16
20
  def call(io)
17
21
  @buf = FormatParser::IOConstraint.new(io)
18
22
  @width = nil
@@ -11,6 +11,10 @@ class FormatParser::MOOVParser
11
11
  'm4a ' => :m4a,
12
12
  }
13
13
 
14
+ def self.likely_match?(filename)
15
+ filename =~ /\.(mov|m4a|ma4|mp4|aac)$/i
16
+ end
17
+
14
18
  def call(io)
15
19
  return unless matches_moov_definition?(io)
16
20
 
@@ -53,6 +53,10 @@ class FormatParser::MP3Parser
53
53
  end
54
54
  end
55
55
 
56
+ def self.likely_match?(filename)
57
+ filename =~ /\.mp3$/i
58
+ end
59
+
56
60
  def call(raw_io)
57
61
  io = FormatParser::IOConstraint.new(raw_io)
58
62
 
@@ -6,6 +6,10 @@ class FormatParser::OggParser
6
6
  # Maximum size of an Ogg page
7
7
  MAX_POSSIBLE_PAGE_SIZE = 65307
8
8
 
9
+ def self.likely_match?(filename)
10
+ filename =~ /\.ogg$/i
11
+ end
12
+
9
13
  def call(io)
10
14
  # The format consists of chunks of data each called an "Ogg page". Each page
11
15
  # begins with the characters, "OggS", to identify the file as Ogg format.
@@ -9,6 +9,10 @@ class FormatParser::PDFParser
9
9
  #
10
10
  PDF_MARKER = /%PDF-1\.[0-8]{1}/
11
11
 
12
+ def self.likely_match?(filename)
13
+ filename =~ /\.(pdf|ai)$/i
14
+ end
15
+
12
16
  def call(io)
13
17
  io = FormatParser::IOConstraint.new(io)
14
18
 
@@ -15,8 +15,8 @@ class FormatParser::PNGParser
15
15
  6 => true,
16
16
  }
17
17
 
18
- def chunk_length_and_type(io)
19
- safe_read(io, 8).unpack('Na4')
18
+ def self.likely_match?(filename)
19
+ filename =~ /\.png$/i
20
20
  end
21
21
 
22
22
  def call(io)
@@ -70,6 +70,10 @@ class FormatParser::PNGParser
70
70
  )
71
71
  end
72
72
 
73
+ def chunk_length_and_type(io)
74
+ safe_read(io, 8).unpack('Na4')
75
+ end
76
+
73
77
  # Give it priority 1 since priority 0 is reserved for JPEG, our most popular
74
78
  FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
75
79
  end
@@ -3,6 +3,10 @@ class FormatParser::PSDParser
3
3
 
4
4
  PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
5
5
 
6
+ def self.likely_match?(filename)
7
+ filename =~ /\.psd$/i # Maybe also PSB at some point
8
+ end
9
+
6
10
  def call(io)
7
11
  io = FormatParser::IOConstraint.new(io)
8
12
  magic_bytes = safe_read(io, 4).unpack('C4')
@@ -5,6 +5,10 @@ class FormatParser::TIFFParser
5
5
  MAGIC_LE = [0x49, 0x49, 0x2A, 0x0].pack('C4')
6
6
  MAGIC_BE = [0x4D, 0x4D, 0x0, 0x2A].pack('C4')
7
7
 
8
+ def self.likely_match?(filename)
9
+ filename =~ /\.tiff?$/i
10
+ end
11
+
8
12
  def call(io)
9
13
  io = FormatParser::IOConstraint.new(io)
10
14
 
@@ -1,6 +1,10 @@
1
1
  class FormatParser::WAVParser
2
2
  include FormatParser::IOUtils
3
3
 
4
+ def self.likely_match?(filename)
5
+ filename =~ /\.wav$/i
6
+ end
7
+
4
8
  def call(io)
5
9
  # Read the RIFF header. Chunk descriptor should be RIFF, the size should
6
10
  # contain the size of the entire file in bytes minus 8 bytes for the
@@ -5,6 +5,10 @@ class FormatParser::ZIPParser
5
5
  include OfficeFormats
6
6
  include FormatParser::IOUtils
7
7
 
8
+ def self.likely_match?(filename)
9
+ filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
10
+ end
11
+
8
12
  def call(io)
9
13
  io = FormatParser::IOConstraint.new(io)
10
14
  safe_read(io, 1) # Ensure the file is not empty
@@ -139,7 +139,7 @@ describe FormatParser do
139
139
 
140
140
  it 'passes keyword arguments to parse()' do
141
141
  path = fixtures_dir + '/WAV/c_M1F1-Alaw-AFsp.wav'
142
- expect(FormatParser).to receive(:parse).with(an_instance_of(File), foo: :bar)
142
+ expect(FormatParser).to receive(:parse).with(an_instance_of(File), filename_hint: 'c_M1F1-Alaw-AFsp.wav', foo: :bar)
143
143
  FormatParser.parse_file_at(path, foo: :bar)
144
144
  end
145
145
  end
@@ -165,6 +165,14 @@ describe FormatParser do
165
165
  image_parsers = FormatParser.parsers_for([:image], [:tif, :jpg, :aiff, :mp3])
166
166
  expect(image_parsers.length).to eq(2)
167
167
  end
168
+
169
+ it 'returns an array with the ZIPParser first if the filename_hint is for a ZIP file' do
170
+ prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], nil)
171
+ expect(prioritized_parsers.first).not_to be_kind_of(FormatParser::ZIPParser)
172
+
173
+ prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
174
+ expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
175
+ end
168
176
  end
169
177
 
170
178
  describe '.register_parser and .deregister_parser' do
@@ -1,6 +1,11 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe FormatParser::ZIPParser do
4
+ it 'provides filename hints' do
5
+ expect(FormatParser::ZIPParser).to be_likely_match('file.zip')
6
+ expect(FormatParser::ZIPParser).not_to be_likely_match('file.tif')
7
+ end
8
+
4
9
  it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
5
10
  fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
6
11
  fi_io = File.open(fixture_path, 'rb')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.1
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2019-01-29 00:00:00.000000000 Z
12
+ date: 2019-07-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: ks