format_parser 0.15.1 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +4 -4
- data/CHANGELOG.md +11 -0
- data/lib/format_parser.rb +20 -4
- data/lib/format_parser/version.rb +1 -1
- data/lib/parsers/aiff_parser.rb +4 -0
- data/lib/parsers/bmp_parser.rb +4 -0
- data/lib/parsers/cr2_parser.rb +4 -0
- data/lib/parsers/dpx_parser.rb +4 -0
- data/lib/parsers/fdx_parser.rb +4 -0
- data/lib/parsers/flac_parser.rb +6 -2
- data/lib/parsers/gif_parser.rb +4 -0
- data/lib/parsers/jpeg_parser.rb +4 -0
- data/lib/parsers/moov_parser.rb +4 -0
- data/lib/parsers/mp3_parser.rb +4 -0
- data/lib/parsers/ogg_parser.rb +4 -0
- data/lib/parsers/pdf_parser.rb +4 -0
- data/lib/parsers/png_parser.rb +6 -2
- data/lib/parsers/psd_parser.rb +4 -0
- data/lib/parsers/tiff_parser.rb +4 -0
- data/lib/parsers/wav_parser.rb +4 -0
- data/lib/parsers/zip_parser.rb +4 -0
- data/spec/format_parser_spec.rb +9 -1
- data/spec/parsers/zip_parser_spec.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e49c10ea63c475b9bc61781ea8e76f42ef9e4307
|
4
|
+
data.tar.gz: c625105933c8654f5aac93bae09476e32917655b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf72d2b021f6fc407b29cbc5b336fd71e022f6a40b67a79646ea58e605c33d34b44404339e7f15cf1c3f63729712a784ac75296fde54525f51a9c6276069619e
|
7
|
+
data.tar.gz: c68eadacfae8062e6fb1532d3e6976c349f1d3f7a950cafb7ea0bab714a125e81d3d32bccb154ed96c2118c142f984637ed17c6a53d1ee755778836ad008a7a0
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.16.0
|
2
|
+
* Add `filename_hint` keyword argument to `FormatParser.parse`. This can hint the library to apply
|
3
|
+
the parser that will likely match for this filename first, and the other parsers later. This helps
|
4
|
+
avoid extra work when parsing less-popular file formats, and can be optionally used if the caller
|
5
|
+
knows the filename of the original file. Note that the filename is only that: a **hint**; it helps
|
6
|
+
apply parsers more efficiently but does not specify the actual format of the file that is going to
|
7
|
+
be detected.
|
8
|
+
|
9
|
+
## 0.15.1
|
10
|
+
* Relax the "ks" dependency version since we do not need the constraint to be so strict
|
11
|
+
|
1
12
|
## 0.15.0
|
2
13
|
* Allow setting `:priority` when registering a parser, to make sure certain parsers are applied earlier - depending
|
3
14
|
on detection confidence and file format popularity at WT.
|
data/lib/format_parser.rb
CHANGED
@@ -86,18 +86,22 @@ module FormatParser
|
|
86
86
|
# @param kwargs the keyword arguments to be delegated to `.parse`
|
87
87
|
# @see {.parse}
|
88
88
|
def self.parse_http(url, **kwargs)
|
89
|
+
# Do not extract the filename, since the URL
|
90
|
+
# can really be "anything". But if the caller
|
91
|
+
# provides filename_hint it will be carried over
|
89
92
|
parse(RemoteIO.new(url), **kwargs)
|
90
93
|
end
|
91
94
|
|
92
95
|
# Parses the file at the given `path` and returns the results as if it were any IO
|
93
96
|
# given to `.parse`. The accepted keyword arguments are the same as the ones for `parse`.
|
97
|
+
# The file path will be used to provide the `filename_hint` to `.parse()`.
|
94
98
|
#
|
95
99
|
# @param path[String] the path to the file to parse on the local filesystem
|
96
100
|
# @param kwargs the keyword arguments to be delegated to `.parse`
|
97
101
|
# @see {.parse}
|
98
102
|
def self.parse_file_at(path, **kwargs)
|
99
103
|
File.open(path, 'rb') do |io|
|
100
|
-
parse(io, **kwargs)
|
104
|
+
parse(io, filename_hint: File.basename(path), **kwargs)
|
101
105
|
end
|
102
106
|
end
|
103
107
|
|
@@ -116,9 +120,13 @@ module FormatParser
|
|
116
120
|
# When using `:first` parsing will stop at the first successful match and other parsers won't run.
|
117
121
|
# @param limits_config[ReadLimitsConfig] the configuration object for various read/cache limits. The default
|
118
122
|
# one should be good for most cases.
|
123
|
+
# @param filename_hint[String?] the filename. If provided, the first parser applied will be the
|
124
|
+
# one that responds `true` to `likely_match?` with that filename as an argument. This way
|
125
|
+
# we can optimize the order of application of parsers and start with the one that is more likely
|
126
|
+
# to match.
|
119
127
|
# @return [Array<Result>, Result, nil] either an Array of results, a single parsing result or `nil` if
|
120
128
|
# no useful metadata could be recovered from the file
|
121
|
-
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config)
|
129
|
+
def self.parse(io, natures: @parsers_per_nature.keys, formats: @parsers_per_format.keys, results: :first, limits_config: default_limits_config, filename_hint: nil)
|
122
130
|
# Limit the number of cached _pages_ we may fetch. This allows us to limit the number
|
123
131
|
# of page faults (page cache misses) a parser may incur
|
124
132
|
read_limiter_under_cache = FormatParser::ReadLimiter.new(io, max_reads: limits_config.max_pagefaults_per_parser)
|
@@ -140,7 +148,7 @@ module FormatParser
|
|
140
148
|
# Always instantiate parsers fresh for each input, since they might
|
141
149
|
# contain instance variables which otherwise would have to be reset
|
142
150
|
# between invocations, and would complicate threading situations
|
143
|
-
parsers = parsers_for(natures, formats)
|
151
|
+
parsers = parsers_for(natures, formats, filename_hint)
|
144
152
|
|
145
153
|
# Limit how many operations the parser can perform
|
146
154
|
limited_io = ReadLimiter.new(
|
@@ -225,9 +233,11 @@ module FormatParser
|
|
225
233
|
#
|
226
234
|
# @param desired_natures[Array] which natures should be considered (like `[:image, :archive]`)
|
227
235
|
# @param desired_formats[Array] which formats should be considered (like `[:tif, :jpg]`)
|
236
|
+
# @param filename_hint[String?] the filename hint for the file. If provided,
|
237
|
+
# the parser that likely matches this filename will be applied first.
|
228
238
|
# @return [Array<#call>] an array of callable parsers
|
229
239
|
# @raise ArgumentError when there are no parsers satisfying the constraint
|
230
|
-
def self.parsers_for(desired_natures, desired_formats)
|
240
|
+
def self.parsers_for(desired_natures, desired_formats, filename_hint = nil)
|
231
241
|
assemble_parser_set = ->(hash_of_sets, keys_of_interest) {
|
232
242
|
hash_of_sets.values_at(*keys_of_interest).compact.inject(&:+) || Set.new
|
233
243
|
}
|
@@ -246,6 +256,12 @@ module FormatParser
|
|
246
256
|
@parser_priorities[parser_factory_a] <=> @parser_priorities[parser_factory_b]
|
247
257
|
end
|
248
258
|
|
259
|
+
# If there is one parser that is more likely to match, place it first
|
260
|
+
if first_match = factories_in_order_of_priority.find { |f| f.respond_to?(:likely_match?) && f.likely_match?(filename_hint) }
|
261
|
+
factories_in_order_of_priority.delete(first_match)
|
262
|
+
factories_in_order_of_priority.unshift(first_match)
|
263
|
+
end
|
264
|
+
|
249
265
|
factories_in_order_of_priority.map { |callable_or_class| instantiate_parser(callable_or_class) }
|
250
266
|
end
|
251
267
|
|
data/lib/parsers/aiff_parser.rb
CHANGED
data/lib/parsers/bmp_parser.rb
CHANGED
data/lib/parsers/cr2_parser.rb
CHANGED
data/lib/parsers/dpx_parser.rb
CHANGED
data/lib/parsers/fdx_parser.rb
CHANGED
data/lib/parsers/flac_parser.rb
CHANGED
@@ -5,8 +5,8 @@ class FormatParser::FLACParser
|
|
5
5
|
MAGIC_BYTE_STRING = 'fLaC'
|
6
6
|
BLOCK_HEADER_BYTES = 4
|
7
7
|
|
8
|
-
def
|
9
|
-
|
8
|
+
def self.likely_match?(filename)
|
9
|
+
filename =~ /\.flac$/i
|
10
10
|
end
|
11
11
|
|
12
12
|
def call(io)
|
@@ -71,5 +71,9 @@ class FormatParser::FLACParser
|
|
71
71
|
)
|
72
72
|
end
|
73
73
|
|
74
|
+
def bytestring_to_int(s)
|
75
|
+
s.unpack('B*')[0].to_i(2)
|
76
|
+
end
|
77
|
+
|
74
78
|
FormatParser.register_parser self, natures: :audio, formats: :flac
|
75
79
|
end
|
data/lib/parsers/gif_parser.rb
CHANGED
@@ -4,6 +4,10 @@ class FormatParser::GIFParser
|
|
4
4
|
HEADERS = ['GIF87a', 'GIF89a'].map(&:b)
|
5
5
|
NETSCAPE_AND_AUTHENTICATION_CODE = 'NETSCAPE2.0'
|
6
6
|
|
7
|
+
def self.likely_match?(filename)
|
8
|
+
filename =~ /\.gif$/i
|
9
|
+
end
|
10
|
+
|
7
11
|
def call(io)
|
8
12
|
io = FormatParser::IOConstraint.new(io)
|
9
13
|
header = safe_read(io, 6)
|
data/lib/parsers/jpeg_parser.rb
CHANGED
@@ -13,6 +13,10 @@ class FormatParser::JPEGParser
|
|
13
13
|
EXIF_MAGIC_STRING = "Exif\0\0".b
|
14
14
|
MUST_FIND_NEXT_MARKER_WITHIN_BYTES = 1024
|
15
15
|
|
16
|
+
def self.likely_match?(filename)
|
17
|
+
filename =~ /\.jpe?g$/i
|
18
|
+
end
|
19
|
+
|
16
20
|
def call(io)
|
17
21
|
@buf = FormatParser::IOConstraint.new(io)
|
18
22
|
@width = nil
|
data/lib/parsers/moov_parser.rb
CHANGED
data/lib/parsers/mp3_parser.rb
CHANGED
data/lib/parsers/ogg_parser.rb
CHANGED
@@ -6,6 +6,10 @@ class FormatParser::OggParser
|
|
6
6
|
# Maximum size of an Ogg page
|
7
7
|
MAX_POSSIBLE_PAGE_SIZE = 65307
|
8
8
|
|
9
|
+
def self.likely_match?(filename)
|
10
|
+
filename =~ /\.ogg$/i
|
11
|
+
end
|
12
|
+
|
9
13
|
def call(io)
|
10
14
|
# The format consists of chunks of data each called an "Ogg page". Each page
|
11
15
|
# begins with the characters, "OggS", to identify the file as Ogg format.
|
data/lib/parsers/pdf_parser.rb
CHANGED
data/lib/parsers/png_parser.rb
CHANGED
@@ -15,8 +15,8 @@ class FormatParser::PNGParser
|
|
15
15
|
6 => true,
|
16
16
|
}
|
17
17
|
|
18
|
-
def
|
19
|
-
|
18
|
+
def self.likely_match?(filename)
|
19
|
+
filename =~ /\.png$/i
|
20
20
|
end
|
21
21
|
|
22
22
|
def call(io)
|
@@ -70,6 +70,10 @@ class FormatParser::PNGParser
|
|
70
70
|
)
|
71
71
|
end
|
72
72
|
|
73
|
+
def chunk_length_and_type(io)
|
74
|
+
safe_read(io, 8).unpack('Na4')
|
75
|
+
end
|
76
|
+
|
73
77
|
# Give it priority 1 since priority 0 is reserved for JPEG, our most popular
|
74
78
|
FormatParser.register_parser self, natures: :image, formats: :png, priority: 1
|
75
79
|
end
|
data/lib/parsers/psd_parser.rb
CHANGED
@@ -3,6 +3,10 @@ class FormatParser::PSDParser
|
|
3
3
|
|
4
4
|
PSD_HEADER = [0x38, 0x42, 0x50, 0x53]
|
5
5
|
|
6
|
+
def self.likely_match?(filename)
|
7
|
+
filename =~ /\.psd$/i # Maybe also PSB at some point
|
8
|
+
end
|
9
|
+
|
6
10
|
def call(io)
|
7
11
|
io = FormatParser::IOConstraint.new(io)
|
8
12
|
magic_bytes = safe_read(io, 4).unpack('C4')
|
data/lib/parsers/tiff_parser.rb
CHANGED
data/lib/parsers/wav_parser.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
class FormatParser::WAVParser
|
2
2
|
include FormatParser::IOUtils
|
3
3
|
|
4
|
+
def self.likely_match?(filename)
|
5
|
+
filename =~ /\.wav$/i
|
6
|
+
end
|
7
|
+
|
4
8
|
def call(io)
|
5
9
|
# Read the RIFF header. Chunk descriptor should be RIFF, the size should
|
6
10
|
# contain the size of the entire file in bytes minus 8 bytes for the
|
data/lib/parsers/zip_parser.rb
CHANGED
@@ -5,6 +5,10 @@ class FormatParser::ZIPParser
|
|
5
5
|
include OfficeFormats
|
6
6
|
include FormatParser::IOUtils
|
7
7
|
|
8
|
+
def self.likely_match?(filename)
|
9
|
+
filename =~ /\.(zip|docx|keynote|numbers|pptx|xlsx)$/i
|
10
|
+
end
|
11
|
+
|
8
12
|
def call(io)
|
9
13
|
io = FormatParser::IOConstraint.new(io)
|
10
14
|
safe_read(io, 1) # Ensure the file is not empty
|
data/spec/format_parser_spec.rb
CHANGED
@@ -139,7 +139,7 @@ describe FormatParser do
|
|
139
139
|
|
140
140
|
it 'passes keyword arguments to parse()' do
|
141
141
|
path = fixtures_dir + '/WAV/c_M1F1-Alaw-AFsp.wav'
|
142
|
-
expect(FormatParser).to receive(:parse).with(an_instance_of(File), foo: :bar)
|
142
|
+
expect(FormatParser).to receive(:parse).with(an_instance_of(File), filename_hint: 'c_M1F1-Alaw-AFsp.wav', foo: :bar)
|
143
143
|
FormatParser.parse_file_at(path, foo: :bar)
|
144
144
|
end
|
145
145
|
end
|
@@ -165,6 +165,14 @@ describe FormatParser do
|
|
165
165
|
image_parsers = FormatParser.parsers_for([:image], [:tif, :jpg, :aiff, :mp3])
|
166
166
|
expect(image_parsers.length).to eq(2)
|
167
167
|
end
|
168
|
+
|
169
|
+
it 'returns an array with the ZIPParser first if the filename_hint is for a ZIP file' do
|
170
|
+
prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], nil)
|
171
|
+
expect(prioritized_parsers.first).not_to be_kind_of(FormatParser::ZIPParser)
|
172
|
+
|
173
|
+
prioritized_parsers = FormatParser.parsers_for([:archive, :document, :image, :audio], [:tif, :jpg, :zip, :docx, :mp3, :aiff], 'a-file.zip')
|
174
|
+
expect(prioritized_parsers.first).to be_kind_of(FormatParser::ZIPParser)
|
175
|
+
end
|
168
176
|
end
|
169
177
|
|
170
178
|
describe '.register_parser and .deregister_parser' do
|
data/spec/parsers/zip_parser_spec.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe FormatParser::ZIPParser do
|
4
|
+
it 'provides filename hints' do
|
5
|
+
expect(FormatParser::ZIPParser).to be_likely_match('file.zip')
|
6
|
+
expect(FormatParser::ZIPParser).not_to be_likely_match('file.tif')
|
7
|
+
end
|
8
|
+
|
4
9
|
it 'parses a ZIP archive with Zip64 extra fields (due to the number of files)' do
|
5
10
|
fixture_path = fixtures_dir + '/ZIP/arch_many_entries.zip'
|
6
11
|
fi_io = File.open(fixture_path, 'rb')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.1
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-07-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ks
|