format_parser 2.7.0 → 2.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +14 -0
- data/lib/parsers/mp3_parser.rb +6 -1
- data/spec/format_parser_spec.rb +30 -2
- data/spec/parsers/flac_parser_spec.rb +1 -1
- data/spec/parsers/json_parser_spec.rb +2 -2
- data/spec/parsers/m3u_parser_spec.rb +1 -1
- data/spec/parsers/mp3_parser_spec.rb +6 -0
- data/spec/parsers/ogg_parser_spec.rb +1 -1
- data/spec/parsers/pdf_parser_spec.rb +3 -3
- data/spec/parsers/wav_parser_spec.rb +1 -1
- data/spec/parsers/webp_parser_spec.rb +1 -1
- data/spec/remote_fetching_spec.rb +37 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d2679a365f7c735d2b8c962765c4783b6336bf23461ffde1705ab141d250591
|
4
|
+
data.tar.gz: e1e4b4caa2956cbf1653d39498a9db84589cd0c9c979c6d84e9f3b3027427274
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52e775ae2a4ced22d2879fff48108974bfe4087333b70d74c5c0910229aa7298826664bbd474dd9b4221777637cc51bf969735a830df976f58f0ff7302dd69c5
|
7
|
+
data.tar.gz: 31b8f95ff8fbcba01d60fe2377699a9abebc61a28d74afc05aa8456d6660f786ee268c2a621c0dc83490e3f9b04dc5e1a60319ae6f4c54503b3dfab9d39d520e
|
data/README.md
CHANGED
@@ -217,7 +217,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
217
217
|
- NEF examples are downloaded from http://www.rawsamples.ch/ and are Creative Commons licensed.
|
218
218
|
|
219
219
|
### OGG
|
220
|
-
- `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `
|
220
|
+
- `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `invalid_with_garbage_at_the_end.ogg` have been generated by the project contributors
|
221
221
|
|
222
222
|
### PDF
|
223
223
|
- PDF 2.0 files downloaded from the [PDF Association public Github repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.
|
@@ -236,7 +236,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
236
236
|
### WAV
|
237
237
|
- c_11k16bitpcm.wav and c_8kmp316.wav are from [Wikipedia WAV](https://en.wikipedia.org/wiki/WAV#Comparison_of_coding_schemes), retrieved January 7, 2018
|
238
238
|
- c_39064__alienbomb__atmo-truck.wav is from [freesound](https://freesound.org/people/alienbomb/sounds/39064/) and is CC0 licensed
|
239
|
-
- c_M1F1-Alaw-AFsp.wav and
|
239
|
+
- c_M1F1-Alaw-AFsp.wav and invalid_d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
|
240
240
|
|
241
241
|
### WEBP
|
242
242
|
- With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
|
data/lib/format_parser.rb
CHANGED
@@ -36,6 +36,9 @@ module FormatParser
|
|
36
36
|
# The value will ensure the parser having it will be applied to the file last.
|
37
37
|
LEAST_PRIORITY = 99
|
38
38
|
|
39
|
+
@registered_natures = []
|
40
|
+
@registered_formats = []
|
41
|
+
|
39
42
|
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
40
43
|
# provides out of the box registers itself using this method.
|
41
44
|
#
|
@@ -68,9 +71,20 @@ module FormatParser
|
|
68
71
|
end
|
69
72
|
@parser_priorities ||= {}
|
70
73
|
@parser_priorities[callable_parser] = priority
|
74
|
+
|
75
|
+
@registered_natures |= parser_provided_natures
|
76
|
+
@registered_formats |= parser_provided_formats
|
71
77
|
end
|
72
78
|
end
|
73
79
|
|
80
|
+
def self.registered_natures
|
81
|
+
@registered_natures
|
82
|
+
end
|
83
|
+
|
84
|
+
def self.registered_formats
|
85
|
+
@registered_formats
|
86
|
+
end
|
87
|
+
|
74
88
|
# Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
|
75
89
|
# tests, but can also be used to forcibly disable some formats completely.
|
76
90
|
#
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -76,6 +76,11 @@ class FormatParser::MP3Parser
|
|
76
76
|
io.seek(0)
|
77
77
|
return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
|
78
78
|
|
79
|
+
# Prevention against parsing WAV files.
|
80
|
+
io.seek(0)
|
81
|
+
wav_chunk_id, _wav_size, wav_riff_type = safe_read(io, 12).unpack('a4la4')
|
82
|
+
return if wav_chunk_id == 'RIFF' || wav_riff_type == 'WAVE'
|
83
|
+
|
79
84
|
# Read all the ID3 tags (or at least attempt to)
|
80
85
|
io.seek(0)
|
81
86
|
id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
|
@@ -315,5 +320,5 @@ class FormatParser::MP3Parser
|
|
315
320
|
end
|
316
321
|
end
|
317
322
|
|
318
|
-
FormatParser.register_parser new, natures: :audio, formats: :mp3, priority:
|
323
|
+
FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 101
|
319
324
|
end
|
data/spec/format_parser_spec.rb
CHANGED
@@ -34,6 +34,26 @@ describe FormatParser do
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
it "fixtures with 'invalid' in the filename should fail to parse" do
|
38
|
+
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
39
|
+
file_name = File.basename(fixture_path)
|
40
|
+
next unless file_name.include? "invalid"
|
41
|
+
File.open(fixture_path, 'rb') do |file|
|
42
|
+
FormatParser.parse(file)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
it "fixtures without 'invalid' in the filename should be parsed successfully" do
|
48
|
+
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
49
|
+
file_name = File.basename(fixture_path)
|
50
|
+
next if file_name.include? "invalid"
|
51
|
+
File.open(fixture_path, 'rb') do |file|
|
52
|
+
FormatParser.parse(file)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
37
57
|
it 'triggers parsers in a certain order that corresponds to the parser priorities' do
|
38
58
|
file_contents = StringIO.new('a' * 4096)
|
39
59
|
|
@@ -189,12 +209,20 @@ describe FormatParser do
|
|
189
209
|
'FormatParser::CR3Parser',
|
190
210
|
'FormatParser::DPXParser',
|
191
211
|
'FormatParser::FLACParser',
|
192
|
-
'FormatParser::MP3Parser',
|
193
212
|
'FormatParser::OggParser',
|
194
213
|
'FormatParser::TIFFParser',
|
195
|
-
'FormatParser::WAVParser'
|
214
|
+
'FormatParser::WAVParser',
|
215
|
+
'FormatParser::MP3Parser'
|
196
216
|
])
|
197
217
|
end
|
218
|
+
|
219
|
+
it 'ensures that MP3 parser is the last one among all' do
|
220
|
+
natures = FormatParser.registered_natures
|
221
|
+
formats = FormatParser.registered_formats
|
222
|
+
prioritised_parsers = FormatParser.parsers_for(natures, formats)
|
223
|
+
parser_class_names = prioritised_parsers.map { |parser| parser.class.name }
|
224
|
+
expect(parser_class_names.last).to eq 'FormatParser::MP3Parser'
|
225
|
+
end
|
198
226
|
end
|
199
227
|
|
200
228
|
describe '.register_parser and .deregister_parser' do
|
@@ -55,7 +55,7 @@ describe FormatParser::FLACParser do
|
|
55
55
|
end
|
56
56
|
|
57
57
|
it 'raises an error when sample rate is 0' do
|
58
|
-
fpath = fixtures_dir + 'FLAC/
|
58
|
+
fpath = fixtures_dir + 'FLAC/invalid_sample_rate_0.flac'
|
59
59
|
|
60
60
|
expect {
|
61
61
|
subject.call(File.open(fpath, 'rb'))
|
@@ -99,7 +99,7 @@ describe FormatParser::JSONParser do
|
|
99
99
|
|
100
100
|
describe 'When reading objects invalid JSON files' do
|
101
101
|
it "rejects files with corrupted JSON data" do
|
102
|
-
io = load_file '
|
102
|
+
io = load_file 'invalid_malformed.json'
|
103
103
|
|
104
104
|
parsed = subject.call(io)
|
105
105
|
|
@@ -107,7 +107,7 @@ describe FormatParser::JSONParser do
|
|
107
107
|
end
|
108
108
|
|
109
109
|
it "rejects invalid files early without reading the whole content" do
|
110
|
-
io = load_file '
|
110
|
+
io = load_file 'invalid_lorem_ipsum.json'
|
111
111
|
|
112
112
|
parsed = subject.call(io)
|
113
113
|
|
@@ -11,7 +11,7 @@ describe FormatParser::M3UParser do
|
|
11
11
|
end
|
12
12
|
|
13
13
|
describe 'an m3u file with missing header' do
|
14
|
-
let(:m3u_file) { '
|
14
|
+
let(:m3u_file) { 'invalid_plain_text.m3u' }
|
15
15
|
|
16
16
|
it 'does not parse the file successfully' do
|
17
17
|
expect(parsed_m3u).to be_nil
|
@@ -36,6 +36,12 @@ describe FormatParser::MP3Parser do
|
|
36
36
|
expect(parsed).to be_nil
|
37
37
|
end
|
38
38
|
|
39
|
+
it 'does not misdetect a WAV' do
|
40
|
+
fpath = fixtures_dir + '/WAV/c_SCAM_MIC_SOL001_RUN001.wav'
|
41
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
42
|
+
expect(parsed).to be_nil
|
43
|
+
end
|
44
|
+
|
39
45
|
describe 'title/artist/album attributes' do
|
40
46
|
let(:parsed) { subject.call(File.open(fpath, 'rb')) }
|
41
47
|
|
@@ -13,7 +13,7 @@ describe FormatParser::OggParser do
|
|
13
13
|
end
|
14
14
|
|
15
15
|
it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do
|
16
|
-
parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/
|
16
|
+
parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/invalid_with_garbage_at_the_end.ogg', 'rb'))
|
17
17
|
expect(parse_result).to be_nil
|
18
18
|
end
|
19
19
|
|
@@ -46,17 +46,17 @@ describe FormatParser::PDFParser do
|
|
46
46
|
|
47
47
|
describe 'broken PDF files should not parse' do
|
48
48
|
it 'PDF with missing version header' do
|
49
|
-
parsed_pdf = parse_pdf '
|
49
|
+
parsed_pdf = parse_pdf 'invalid_not_a.pdf'
|
50
50
|
expect(parsed_pdf).to be_nil
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'PDF 2.0 with offset start' do
|
54
|
-
parsed_pdf = parse_pdf 'PDF 2.0 with offset start.pdf'
|
54
|
+
parsed_pdf = parse_pdf 'invalid PDF 2.0 with offset start.pdf'
|
55
55
|
expect(parsed_pdf).to be_nil
|
56
56
|
end
|
57
57
|
|
58
58
|
it 'exceeds the PDF read limit' do
|
59
|
-
parsed_pdf = parse_pdf '
|
59
|
+
parsed_pdf = parse_pdf 'invalid_exceed_PDF_read_limit.pdf'
|
60
60
|
expect(parsed_pdf).to be_nil
|
61
61
|
end
|
62
62
|
end
|
@@ -48,7 +48,7 @@ describe FormatParser::WAVParser do
|
|
48
48
|
|
49
49
|
it "cannot parse file with audio format different from 1 and no 'fact' chunk" do
|
50
50
|
expect {
|
51
|
-
subject.call(File.open(__dir__ + '/../fixtures/WAV/
|
51
|
+
subject.call(File.open(__dir__ + '/../fixtures/WAV/invalid_d_6_Channel_ID.wav', 'rb'))
|
52
52
|
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
53
53
|
end
|
54
54
|
end
|
@@ -7,7 +7,7 @@ describe FormatParser::WebpParser do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
it 'does not parse files with an unrecognised variant' do
|
10
|
-
result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
|
10
|
+
result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-unrecognised-variant.webp', 'rb'))
|
11
11
|
expect(result).to be_nil
|
12
12
|
end
|
13
13
|
|
@@ -104,6 +104,43 @@ describe 'Fetching data from HTTP remotes' do
|
|
104
104
|
expect(file_information.format).to eq(:png)
|
105
105
|
end
|
106
106
|
|
107
|
+
describe 'correctly parses WAV files without falling back to another filetype' do
|
108
|
+
['c_8kmp316.wav', 'c_SCAM_MIC_SOL001_RUN001.wav'].each do |filename|
|
109
|
+
it "parses WAV file #{filename}" do
|
110
|
+
remote_url = 'http://localhost:9399/WAV/' + filename
|
111
|
+
file_information = FormatParser.parse_http(remote_url)
|
112
|
+
expect(file_information).not_to be_nil
|
113
|
+
expect(file_information.format).to eq(:wav)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
describe "correctly parses files over HTTP without filename hint" do
|
119
|
+
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
120
|
+
file_name = File.basename(fixture_path)
|
121
|
+
next if file_name.include? "invalid"
|
122
|
+
|
123
|
+
file_type_dir = fixture_path.delete_prefix(fixtures_dir).delete_suffix(file_name)
|
124
|
+
file_type_dir.delete_prefix!('/').delete_suffix!('/')
|
125
|
+
next if file_type_dir.empty?
|
126
|
+
|
127
|
+
# skipping this one because it's a special case
|
128
|
+
next if file_name == "arch_many_entries.zip"
|
129
|
+
|
130
|
+
it "parses #{file_type_dir} file: #{file_name}" do
|
131
|
+
url = "http://localhost:9399/#{file_type_dir}/#{file_name}?some_param=test".gsub(' ', '%20')
|
132
|
+
result_with_hint = FormatParser.parse_http(url, filename_hint: file_name)
|
133
|
+
result_no_hint = FormatParser.parse_http(url)
|
134
|
+
|
135
|
+
expect(result_with_hint).not_to be_nil
|
136
|
+
expect(result_no_hint).not_to be_nil
|
137
|
+
|
138
|
+
expect(result_no_hint.nature).to eq(result_with_hint.nature)
|
139
|
+
expect(result_no_hint.format).to eq(result_with_hint.format)
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
107
144
|
describe 'when parsing remote fixtures' do
|
108
145
|
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
109
146
|
filename = File.basename(fixture_path)
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.7.
|
4
|
+
version: 2.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
8
8
|
- Julik Tarkhanov
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-08-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: exifr
|
@@ -319,7 +319,7 @@ licenses:
|
|
319
319
|
- MIT (Hippocratic)
|
320
320
|
metadata:
|
321
321
|
allowed_push_host: https://rubygems.org
|
322
|
-
post_install_message:
|
322
|
+
post_install_message:
|
323
323
|
rdoc_options: []
|
324
324
|
require_paths:
|
325
325
|
- lib
|
@@ -334,8 +334,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.
|
338
|
-
signing_key:
|
337
|
+
rubygems_version: 3.1.6
|
338
|
+
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: A library for efficient parsing of file metadata
|
341
341
|
test_files: []
|