format_parser 0.15.1 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +4 -4
- data/CHANGELOG.md +11 -0
- data/lib/format_parser.rb +20 -4
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/aiff_parser.rb +4 -0
- data/lib/parsers/bmp_parser.rb +4 -0
- data/lib/parsers/cr2_parser.rb +4 -0
- data/lib/parsers/dpx_parser.rb +4 -0
- data/lib/parsers/fdx_parser.rb +4 -0
- data/lib/parsers/flac_parser.rb +6 -2
- data/lib/parsers/gif_parser.rb +4 -0
- data/lib/parsers/jpeg_parser.rb +4 -0
- data/lib/parsers/moov_parser.rb +4 -0
- data/lib/parsers/mp3_parser.rb +4 -0
- data/lib/parsers/ogg_parser.rb +4 -0
- data/lib/parsers/pdf_parser.rb +4 -0
- data/lib/parsers/png_parser.rb +6 -2
- data/lib/parsers/psd_parser.rb +4 -0
- data/lib/parsers/tiff_parser.rb +4 -0
- data/lib/parsers/wav_parser.rb +4 -0
- data/lib/parsers/zip_parser.rb +4 -0
- data/spec/format_parser_spec.rb +9 -1
- data/spec/parsers/zip_parser_spec.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e49c10ea63c475b9bc61781ea8e76f42ef9e4307
|
4
|
+
data.tar.gz: c625105933c8654f5aac93bae09476e32917655b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf72d2b021f6fc407b29cbc5b336fd71e022f6a40b67a79646ea58e605c33d34b44404339e7f15cf1c3f63729712a784ac75296fde54525f51a9c6276069619e
|
7
|
+
data.tar.gz: c68eadacfae8062e6fb1532d3e6976c349f1d3f7a950cafb7ea0bab714a125e81d3d32bccb154ed96c2118c142f984637ed17c6a53d1ee755778836ad008a7a0
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.16.0
|
2
|
+
* Add `filename_hint` keyword argument to `FormatParser.parse`. This can hint the library to apply
|
3
|
+
the parser that will likely match for this filename first, and the other parsers later. This helps
|
4
|
+
avoid extra work when parsing less-popular file formats, and can be optionally used if the caller
|
5
|
+
knows the filename of the original file. Note that the filename is only that: a **hint**. It helps
|
6
|
+
apply parsers more efficiently but does not specify the actual format of the file that is going to
|
7
|
+
be detected.
|
8
|
+
|
9
|
+
## 0.15.1
|
10
|
+
* Relax the "ks" dependency version since we do not need the constraint to be so strict
|
11
|
+
|
1
12
|
## 0.15.0
|
2
13
|
* Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
|
3
14
|
on detection confidence and file format popularity at WT.
|
data/lib/format_parser.rb
CHANGED
@@ -86,18 +86,22 @@ module FormatParser
|
|
86
86
|
# @param kwargs the keyword arguments to be delegated to `.parse`
|
87
87
|
# @see {.parse}
|
88
88
|
def self.parse_http(url, **kwargs)
|
89
|
+
# Do not extract the filename, since the URL
|
90
|
+
# can really be "anything". But if the caller
|
91
|
+
# provides filename_hint it will be carried over
|
89
92
|
parse(RemoteIO.new(url), **kwargs)
|
90
93
|
end
|
91
94
|
|
92
95
|
# Parses the file at the given `path` and returns the results as if it were any IO
|
93
96
|
# given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
|
97
|
+
# The file path will be used to provide the `filename_hint` to `.parse()`.
|
94
98
|
#
|
95
99
|
# @param path[String] the path to the file to parse on the local filesystem
|
96
100
|
# @param kwargs the keyword arguments to be delegated to `.parse`
|
97
101
|
# @see {.parse}
|
98
102
|
def self.parse_file_at(path, **kwargs)
|
99
103
|
File.open(path, 'rb') do |io|
|
100
|
-
parse(io, **kwargs)
|
104
|
+
parse(io, filename_hint: File.basename(path), **kwargs)
|
101
105
|
end
|
102
106
|
end
|
103
107
|
|
@@ -116,9 +120,13 @@ module FormatParser
|
|
116
120
|
# When using `:first` parsing will stop at the first successful match and other parsers won't run.
|
117
121
|
# @param limits_config[ReadLimitsConfig] the configuration object for various read/cache limits. The default
|
118
122
|
# one should be good for most cases.
|
123
|
+
# @param filename_hint[String?] the filename. If provided, the first parser applied will be the
|
124
|
+
# one that responds `true` to `likely_match?` with that filename as an argument. This way
|
125
|
+
# we can optimize the order of application of parsers and start with the one that is more likely
|
126
|
+
# to match.
|
119
127
|
# @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
|
120
128
|
# no useful metadata could be recovered from the file
|
121
|
-
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config)
|
129
|
+
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config, filename_hint: nil)
|
122
130
|
# Limit the number of cached _pages_ we may fetch. This allows us to limit the number
|
123
131
|
# of page faults (page cache misses) a parser may incur
|
124
132
|
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limits_config.max_pagefaults_per_parser)
|
@@ -140,7 +148,7 @@ module FormatParser
|
|
140
148
|
# Always instantiate parsers fresh for each input, since they might
|
141
149
|
# contain instance variables which otherwise would have to be reset
|
142
150
|
# between invocations, and would complicate threading situations
|
143
|
-
parsers = parsers_for(natures, formats)
|
151
|
+
parsers = parsers_for(natures, formats, filename_hint)
|
144
152
|
|
145
153
|
# Limit how many operations the parser can perform
|
146
154
|
limited_io = ReadLimiter.new(
|
@@ -225,9 +233,11 @@ module FormatParser
|
|
225
233
|
#
|
226
234
|
# @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
|
227
235
|
# @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
|
236
|
+
# @param filename_hint[String?] the filename hint for the file. If provided,
|
237
|
+
# the parser that likely matches this filename will be applied first.
|
228
238
|
# @return [Array<#call>] an array of callable parsers
|
229
239
|
# @raise ArgumentError when there are no parsers satisfying the constraint
|
230
|
-
def self.parsers_for(desired_natures, desired_formats)
|
240
|
+
def self.parsers_for(desired_natures, desired_formats, filename_hint = nil)
|
231
241
|
assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
|
232
242
|
hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
|
233
243
|
}
|
@@ -246,6 +256,12 @@ module FormatParser
|
|
246
256
|
@parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
|
247
257
|
end
|
248
258
|
|
259
|
+
# If there is one parser that is more likely to match, place it first
|
260
|
+
if first_match = factories_in_order_of_priority.find { |f| f.respond_to?(:likely_match?) && f.likely_match?(filename_hint) }
|
261
|
+
factories_in_order_of_priority.delete(first_match)
|
262
|
+
factories_in_order_of_priority.unshift(first_match)
|
263
|
+
end
|
264
|
+
|
249
265
|
factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
|
250
266
|
end
|
251
267
|
|
data/lib/parsers/aiff_parser.rb
CHANGED
data/lib/parsers/bmp_parser.rb
CHANGED
data/lib/parsers/cr2_parser.rb
CHANGED
data/lib/parsers/dpx_parser.rb
CHANGED
data/lib/parsers/fdx_parser.rb
CHANGED
data/lib/parsers/flac_parser.rb
CHANGED
@@ -5,8 +5,8 @@ class FormatParser::FLACParser
|
|
5
5
|
MAGIC_BYTE_STRING = 'fLaC'
|
6
6
|
BLOCK_HEADER_BYTES = 4
|
7
7
|
|
8
|
-
def
|
9
|
-
|
8
|
+
def self.likely_match?(filename)
|
9
|
+
filename =~ /\.flac$/i
|
10
10
|
end
|
11
11
|
|
12
12
|
def call(io)
|
@@ -71,5 +71,9 @@ class FormatParser::FLACParser
|
|
71
71
|
)
|
72
72
|
end
|
73
73
|
|
74
|
+
def bytestring_to_int(s)
|
75
|
+
s.unpack('B*')[0].to_i(2)
|
76
|
+
end
|
77
|
+
|
74
78
|
FormatParser.register_parser self, natures: :audio, formats: :flac
|
75
79
|
end
|
data/lib/parsers/gif_parser.rb
CHANGED
@@ -4,6 +4,10 @@ class FormatParser::GIFParser
|
|
4
4
|
HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
|
5
5
|
NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
|
6
6
|
|
7
|
+
def self.likely_match?(filename)
|
8
|
+
filename =~ /\.gif$/i
|
9
|
+
end
|
10
|
+
|
7
11
|
def call(io)
|
8
12
|
io = FormatParser::IOConstraint.new(io)
|
9
13
|
header = safe_read(io, 6)
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -13,6 +13,10 @@ class FormatParser::JPEGParser
|
|
13
13
|
EXIF_MAGIC_STRING = "Exif\0\0".b
|
14
14
|
MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
|
15
15
|
|
16
|
+
def self.likely_match?(filename)
|
17
|
+
filename =~ /\.jpe?g$/i
|
18
|
+
end
|
19
|
+
|
16
20
|
def call(io)
|
17
21
|
@buf = FormatParser::IOConstraint.new(io)
|
18
22
|
@width = nil
|
data/lib/parsers/moov_parser.rb
CHANGED
data/lib/parsers/mp3_parser.rb
CHANGED
data/lib/parsers/ogg_parser.rb
CHANGED
@@ -6,6 +6,10 @@ class FormatParser::OggParser
|
|
6
6
|
# Maximum size of an Ogg page
|
7
7
|
MAX_POSSIBLE_PAGE_SIZE = 65307
|
8
8
|
|
9
|
+
def self.likely_match?(filename)
|
10
|
+
filename =~ /\.ogg$/i
|
11
|
+
end
|
12
|
+
|
9
13
|
def call(io)
|
10
14
|
# The format consists of chunks of data each called an "Ogg page". Each page
|
11
15
|
# begins with the characters, "OggS", to identify the file as Ogg format.
|
data/lib/parsers/pdf_parser.rb
CHANGED
data/lib/parsers/png_parser.rb
CHANGED
@@ -15,8 +15,8 @@ class FormatParser::PNGParser
|
|
15
15
|
6 => true,
|
16
16
|
}
|
17
17
|
|
18
|
-
def
|
19
|
-
|
18
|
+
def self.likely_match?(filename)
|
19
|
+
filename =~ /\.png$/i
|
20
20
|
end
|
21
21
|
|
22
22
|
def call(io)
|
@@ -70,6 +70,10 @@ class FormatParser::PNGParser
|
|
70
70
|
)
|
71
71
|
end
|
72
72
|
|
73
|
+
def chunk_length_and_type(io)
|
74
|
+
safe_read(io, 8).unpack('Na4')
|
75
|
+
end
|
76
|
+
|
73
77
|
# Give it priority 1 since priority 0 is reserved for JPEG, our most popular
|
74
78
|
FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
|
75
79
|
end
|
data/lib/parsers/psd_parser.rb
CHANGED
@@ -3,6 +3,10 @@ class FormatParser::PSDParser
|
|
3
3
|
|
4
4
|
PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
|
5
5
|
|
6
|
+
def self.likely_match?(filename)
|
7
|
+
filename =~ /\.psd$/i # Maybe also PSB at some point
|
8
|
+
end
|
9
|
+
|
6
10
|
def call(io)
|
7
11
|
io = FormatParser::IOConstraint.new(io)
|
8
12
|
magic_bytes = safe_read(io, 4).unpack('C4')
|
data/lib/parsers/tiff_parser.rb
CHANGED
data/lib/parsers/wav_parser.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
class FormatParser::WAVParser
|
2
2
|
include FormatParser::IOUtils
|
3
3
|
|
4
|
+
def self.likely_match?(filename)
|
5
|
+
filename =~ /\.wav$/i
|
6
|
+
end
|
7
|
+
|
4
8
|
def call(io)
|
5
9
|
# Read the RIFF header. Chunk descriptor should be RIFF, the size should
|
6
10
|
# contain the size of the entire file in bytes minus 8 bytes for the
|
data/lib/parsers/zip_parser.rb
CHANGED
@@ -5,6 +5,10 @@ class FormatParser::ZIPParser
|
|
5
5
|
include OfficeFormats
|
6
6
|
include FormatParser::IOUtils
|
7
7
|
|
8
|
+
def self.likely_match?(filename)
|
9
|
+
filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
|
10
|
+
end
|
11
|
+
|
8
12
|
def call(io)
|
9
13
|
io = FormatParser::IOConstraint.new(io)
|
10
14
|
safe_read(io, 1) # Ensure the file is not empty
|
data/spec/format_parser_spec.rb
CHANGED
@@ -139,7 +139,7 @@ describe FormatParser do
|
|
139
139
|
|
140
140
|
it 'passes keyword arguments to parse()' do
|
141
141
|
path = fixtures_dir + '/WAV/c_M1F1-Alaw-AFsp.wav'
|
142
|
-
expect(FormatParser).to receive(:parse).with(an_instance_of(File), foo: :bar)
|
142
|
+
expect(FormatParser).to receive(:parse).with(an_instance_of(File), filename_hint: 'c_M1F1-Alaw-AFsp.wav', foo: :bar)
|
143
143
|
FormatParser.parse_file_at(path, foo: :bar)
|
144
144
|
end
|
145
145
|
end
|
@@ -165,6 +165,14 @@ describe FormatParser do
|
|
165
165
|
image_parsers = FormatParser.parsers_for([:image], [:tif, :jpg, :aiff, :mp3])
|
166
166
|
expect(image_parsers.length).to eq(2)
|
167
167
|
end
|
168
|
+
|
169
|
+
it 'returns an array with the ZIPParser first if the filename_hint is for a ZIP file' do
|
170
|
+
prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], nil)
|
171
|
+
expect(prioritized_parsers.first).not_to be_kind_of(FormatParser::ZIPParser)
|
172
|
+
|
173
|
+
prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
|
174
|
+
expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
|
175
|
+
end
|
168
176
|
end
|
169
177
|
|
170
178
|
describe '.register_parser and .deregister_parser' do
|
data/spec/parsers/zip_parser_spec.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe FormatParser::ZIPParser do
|
4
|
+
it 'provides filename hints' do
|
5
|
+
expect(FormatParser::ZIPParser).to be_likely_match('file.zip')
|
6
|
+
expect(FormatParser::ZIPParser).not_to be_likely_match('file.tif')
|
7
|
+
end
|
8
|
+
|
4
9
|
it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
|
5
10
|
fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
|
6
11
|
fi_io = File.open(fixture_path, 'rb')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.1
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-07-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|