format_parser 2.7.0 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +14 -0
- data/lib/parsers/mp3_parser.rb +6 -1
- data/spec/format_parser_spec.rb +30 -2
- data/spec/parsers/flac_parser_spec.rb +1 -1
- data/spec/parsers/json_parser_spec.rb +2 -2
- data/spec/parsers/m3u_parser_spec.rb +1 -1
- data/spec/parsers/mp3_parser_spec.rb +6 -0
- data/spec/parsers/ogg_parser_spec.rb +1 -1
- data/spec/parsers/pdf_parser_spec.rb +3 -3
- data/spec/parsers/wav_parser_spec.rb +1 -1
- data/spec/parsers/webp_parser_spec.rb +1 -1
- data/spec/remote_fetching_spec.rb +37 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d2679a365f7c735d2b8c962765c4783b6336bf23461ffde1705ab141d250591
|
4
|
+
data.tar.gz: e1e4b4caa2956cbf1653d39498a9db84589cd0c9c979c6d84e9f3b3027427274
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52e775ae2a4ced22d2879fff48108974bfe4087333b70d74c5c0910229aa7298826664bbd474dd9b4221777637cc51bf969735a830df976f58f0ff7302dd69c5
|
7
|
+
data.tar.gz: 31b8f95ff8fbcba01d60fe2377699a9abebc61a28d74afc05aa8456d6660f786ee268c2a621c0dc83490e3f9b04dc5e1a60319ae6f4c54503b3dfab9d39d520e
|
data/README.md
CHANGED
@@ -217,7 +217,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
217
217
|
- NEF examples are downloaded from http://www.rawsamples.ch/ and are Creative Common Licensed.
|
218
218
|
|
219
219
|
### OGG
|
220
|
-
- `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `
|
220
|
+
- `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `invalid_with_garbage_at_the_end.ogg` have been generated by the project contributors
|
221
221
|
|
222
222
|
### PDF
|
223
223
|
- PDF 2.0 files downloaded from the [PDF Association public Github repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.
|
@@ -236,7 +236,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
|
|
236
236
|
### WAV
|
237
237
|
- c_11k16bitpcm.wav and c_8kmp316.wav are from [Wikipedia WAV](https://en.wikipedia.org/wiki/WAV#Comparison_of_coding_schemes), retrieved January 7, 2018
|
238
238
|
- c_39064__alienbomb__atmo-truck.wav is from [freesound](https://freesound.org/people/alienbomb/sounds/39064/) and is CC0 licensed
|
239
|
-
- c_M1F1-Alaw-AFsp.wav and
|
239
|
+
- c_M1F1-Alaw-AFsp.wav and invalid_d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
|
240
240
|
|
241
241
|
### WEBP
|
242
242
|
- With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
|
data/lib/format_parser.rb
CHANGED
@@ -36,6 +36,9 @@ module FormatParser
|
|
36
36
|
# The value will ensure the parser having it will be applied to the file last.
|
37
37
|
LEAST_PRIORITY = 99
|
38
38
|
|
39
|
+
@registered_natures = []
|
40
|
+
@registered_formats = []
|
41
|
+
|
39
42
|
# Register a parser object to be used to perform file format detection. Each parser FormatParser
|
40
43
|
# provides out of the box registers itself using this method.
|
41
44
|
#
|
@@ -68,9 +71,20 @@ module FormatParser
|
|
68
71
|
end
|
69
72
|
@parser_priorities ||= {}
|
70
73
|
@parser_priorities[callable_parser] = priority
|
74
|
+
|
75
|
+
@registered_natures |= parser_provided_natures
|
76
|
+
@registered_formats |= parser_provided_formats
|
71
77
|
end
|
72
78
|
end
|
73
79
|
|
80
|
+
def self.registered_natures
|
81
|
+
@registered_natures
|
82
|
+
end
|
83
|
+
|
84
|
+
def self.registered_formats
|
85
|
+
@registered_formats
|
86
|
+
end
|
87
|
+
|
74
88
|
# Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
|
75
89
|
# tests, but can also be used to forcibly disable some formats completely.
|
76
90
|
#
|
data/lib/parsers/mp3_parser.rb
CHANGED
@@ -76,6 +76,11 @@ class FormatParser::MP3Parser
|
|
76
76
|
io.seek(0)
|
77
77
|
return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
|
78
78
|
|
79
|
+
# Prevention against parsing WAV files.
|
80
|
+
io.seek(0)
|
81
|
+
wav_chunk_id, _wav_size, wav_riff_type = safe_read(io, 12).unpack('a4la4')
|
82
|
+
return if wav_chunk_id == 'RIFF' || wav_riff_type == 'WAVE'
|
83
|
+
|
79
84
|
# Read all the ID3 tags (or at least attempt to)
|
80
85
|
io.seek(0)
|
81
86
|
id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
|
@@ -315,5 +320,5 @@ class FormatParser::MP3Parser
|
|
315
320
|
end
|
316
321
|
end
|
317
322
|
|
318
|
-
FormatParser.register_parser new, natures: :audio, formats: :mp3, priority:
|
323
|
+
FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 101
|
319
324
|
end
|
data/spec/format_parser_spec.rb
CHANGED
@@ -34,6 +34,26 @@ describe FormatParser do
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
it "fixtures with 'invalid' in the filename should fail to parse" do
|
38
|
+
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
39
|
+
file_name = File.basename(fixture_path)
|
40
|
+
next unless file_name.include? "invalid"
|
41
|
+
File.open(fixture_path, 'rb') do |file|
|
42
|
+
FormatParser.parse(file)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
it "fixtures without 'invalid' in the filename should be parsed successfully" do
|
48
|
+
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
49
|
+
file_name = File.basename(fixture_path)
|
50
|
+
next if file_name.include? "invalid"
|
51
|
+
File.open(fixture_path, 'rb') do |file|
|
52
|
+
FormatParser.parse(file)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
37
57
|
it 'triggers parsers in a certain order that corresponds to the parser priorities' do
|
38
58
|
file_contents = StringIO.new('a' * 4096)
|
39
59
|
|
@@ -189,12 +209,20 @@ describe FormatParser do
|
|
189
209
|
'FormatParser::CR3Parser',
|
190
210
|
'FormatParser::DPXParser',
|
191
211
|
'FormatParser::FLACParser',
|
192
|
-
'FormatParser::MP3Parser',
|
193
212
|
'FormatParser::OggParser',
|
194
213
|
'FormatParser::TIFFParser',
|
195
|
-
'FormatParser::WAVParser'
|
214
|
+
'FormatParser::WAVParser',
|
215
|
+
'FormatParser::MP3Parser'
|
196
216
|
])
|
197
217
|
end
|
218
|
+
|
219
|
+
it 'ensures that MP3 parser is the last one among all' do
|
220
|
+
natures = FormatParser.registered_natures
|
221
|
+
formats = FormatParser.registered_formats
|
222
|
+
prioritised_parsers = FormatParser.parsers_for(natures, formats)
|
223
|
+
parser_class_names = prioritised_parsers.map { |parser| parser.class.name }
|
224
|
+
expect(parser_class_names.last).to eq 'FormatParser::MP3Parser'
|
225
|
+
end
|
198
226
|
end
|
199
227
|
|
200
228
|
describe '.register_parser and .deregister_parser' do
|
@@ -55,7 +55,7 @@ describe FormatParser::FLACParser do
|
|
55
55
|
end
|
56
56
|
|
57
57
|
it 'raises an error when sample rate is 0' do
|
58
|
-
fpath = fixtures_dir + 'FLAC/
|
58
|
+
fpath = fixtures_dir + 'FLAC/invalid_sample_rate_0.flac'
|
59
59
|
|
60
60
|
expect {
|
61
61
|
subject.call(File.open(fpath, 'rb'))
|
@@ -99,7 +99,7 @@ describe FormatParser::JSONParser do
|
|
99
99
|
|
100
100
|
describe 'When reading objects invalid JSON files' do
|
101
101
|
it "rejects files with corrupted JSON data" do
|
102
|
-
io = load_file '
|
102
|
+
io = load_file 'invalid_malformed.json'
|
103
103
|
|
104
104
|
parsed = subject.call(io)
|
105
105
|
|
@@ -107,7 +107,7 @@ describe FormatParser::JSONParser do
|
|
107
107
|
end
|
108
108
|
|
109
109
|
it "rejects invalid files early without reading the whole content" do
|
110
|
-
io = load_file '
|
110
|
+
io = load_file 'invalid_lorem_ipsum.json'
|
111
111
|
|
112
112
|
parsed = subject.call(io)
|
113
113
|
|
@@ -11,7 +11,7 @@ describe FormatParser::M3UParser do
|
|
11
11
|
end
|
12
12
|
|
13
13
|
describe 'an m3u file with missing header' do
|
14
|
-
let(:m3u_file) { '
|
14
|
+
let(:m3u_file) { 'invalid_plain_text.m3u' }
|
15
15
|
|
16
16
|
it 'does not parse the file successfully' do
|
17
17
|
expect(parsed_m3u).to be_nil
|
@@ -36,6 +36,12 @@ describe FormatParser::MP3Parser do
|
|
36
36
|
expect(parsed).to be_nil
|
37
37
|
end
|
38
38
|
|
39
|
+
it 'does not misdetect a WAV' do
|
40
|
+
fpath = fixtures_dir + '/WAV/c_SCAM_MIC_SOL001_RUN001.wav'
|
41
|
+
parsed = subject.call(File.open(fpath, 'rb'))
|
42
|
+
expect(parsed).to be_nil
|
43
|
+
end
|
44
|
+
|
39
45
|
describe 'title/artist/album attributes' do
|
40
46
|
let(:parsed) { subject.call(File.open(fpath, 'rb')) }
|
41
47
|
|
@@ -13,7 +13,7 @@ describe FormatParser::OggParser do
|
|
13
13
|
end
|
14
14
|
|
15
15
|
it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do
|
16
|
-
parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/
|
16
|
+
parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/invalid_with_garbage_at_the_end.ogg', 'rb'))
|
17
17
|
expect(parse_result).to be_nil
|
18
18
|
end
|
19
19
|
|
@@ -46,17 +46,17 @@ describe FormatParser::PDFParser do
|
|
46
46
|
|
47
47
|
describe 'broken PDF files should not parse' do
|
48
48
|
it 'PDF with missing version header' do
|
49
|
-
parsed_pdf = parse_pdf '
|
49
|
+
parsed_pdf = parse_pdf 'invalid_not_a.pdf'
|
50
50
|
expect(parsed_pdf).to be_nil
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'PDF 2.0 with offset start' do
|
54
|
-
parsed_pdf = parse_pdf 'PDF 2.0 with offset start.pdf'
|
54
|
+
parsed_pdf = parse_pdf 'invalid PDF 2.0 with offset start.pdf'
|
55
55
|
expect(parsed_pdf).to be_nil
|
56
56
|
end
|
57
57
|
|
58
58
|
it 'exceeds the PDF read limit' do
|
59
|
-
parsed_pdf = parse_pdf '
|
59
|
+
parsed_pdf = parse_pdf 'invalid_exceed_PDF_read_limit.pdf'
|
60
60
|
expect(parsed_pdf).to be_nil
|
61
61
|
end
|
62
62
|
end
|
@@ -48,7 +48,7 @@ describe FormatParser::WAVParser do
|
|
48
48
|
|
49
49
|
it "cannot parse file with audio format different from 1 and no 'fact' chunk" do
|
50
50
|
expect {
|
51
|
-
subject.call(File.open(__dir__ + '/../fixtures/WAV/
|
51
|
+
subject.call(File.open(__dir__ + '/../fixtures/WAV/invalid_d_6_Channel_ID.wav', 'rb'))
|
52
52
|
}.to raise_error(FormatParser::IOUtils::InvalidRead)
|
53
53
|
end
|
54
54
|
end
|
@@ -7,7 +7,7 @@ describe FormatParser::WebpParser do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
it 'does not parse files with an unrecognised variant' do
|
10
|
-
result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
|
10
|
+
result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-unrecognised-variant.webp', 'rb'))
|
11
11
|
expect(result).to be_nil
|
12
12
|
end
|
13
13
|
|
@@ -104,6 +104,43 @@ describe 'Fetching data from HTTP remotes' do
|
|
104
104
|
expect(file_information.format).to eq(:png)
|
105
105
|
end
|
106
106
|
|
107
|
+
describe 'correctly parses WAV files without falling back to another filetype' do
|
108
|
+
['c_8kmp316.wav', 'c_SCAM_MIC_SOL001_RUN001.wav'].each do |filename|
|
109
|
+
it "parses WAV file #{filename}" do
|
110
|
+
remote_url = 'http://localhost:9399/WAV/' + filename
|
111
|
+
file_information = FormatParser.parse_http(remote_url)
|
112
|
+
expect(file_information).not_to be_nil
|
113
|
+
expect(file_information.format).to eq(:wav)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
describe "correctly parses files over HTTP without filename hint" do
|
119
|
+
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
120
|
+
file_name = File.basename(fixture_path)
|
121
|
+
next if file_name.include? "invalid"
|
122
|
+
|
123
|
+
file_type_dir = fixture_path.delete_prefix(fixtures_dir).delete_suffix(file_name)
|
124
|
+
file_type_dir.delete_prefix!('/').delete_suffix!('/')
|
125
|
+
next if file_type_dir.empty?
|
126
|
+
|
127
|
+
# skipping this one because it's a special case
|
128
|
+
next if file_name == "arch_many_entries.zip"
|
129
|
+
|
130
|
+
it "parses #{file_type_dir} file: #{file_name}" do
|
131
|
+
url = "http://localhost:9399/#{file_type_dir}/#{file_name}?some_param=test".gsub(' ', '%20')
|
132
|
+
result_with_hint = FormatParser.parse_http(url, filename_hint: file_name)
|
133
|
+
result_no_hint = FormatParser.parse_http(url)
|
134
|
+
|
135
|
+
expect(result_with_hint).not_to be_nil
|
136
|
+
expect(result_no_hint).not_to be_nil
|
137
|
+
|
138
|
+
expect(result_no_hint.nature).to eq(result_with_hint.nature)
|
139
|
+
expect(result_no_hint.format).to eq(result_with_hint.format)
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
107
144
|
describe 'when parsing remote fixtures' do
|
108
145
|
Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
|
109
146
|
filename = File.basename(fixture_path)
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.7.
|
4
|
+
version: 2.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
8
8
|
- Julik Tarkhanov
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-08-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: exifr
|
@@ -319,7 +319,7 @@ licenses:
|
|
319
319
|
- MIT (Hippocratic)
|
320
320
|
metadata:
|
321
321
|
allowed_push_host: https://rubygems.org
|
322
|
-
post_install_message:
|
322
|
+
post_install_message:
|
323
323
|
rdoc_options: []
|
324
324
|
require_paths:
|
325
325
|
- lib
|
@@ -334,8 +334,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.
|
338
|
-
signing_key:
|
337
|
+
rubygems_version: 3.1.6
|
338
|
+
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: A library for efficient parsing of file metadata
|
341
341
|
test_files: []
|