format_parser 2.7.0 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f7b8b37e26143e2ea941db88183475e70c90ab658adcdad362b32b457a7d19ac
4
- data.tar.gz: 50758e107065e1a2ab4fbca7775359d928cd1a62942277a94fcbabfefa1bfe10
3
+ metadata.gz: 5d2679a365f7c735d2b8c962765c4783b6336bf23461ffde1705ab141d250591
4
+ data.tar.gz: e1e4b4caa2956cbf1653d39498a9db84589cd0c9c979c6d84e9f3b3027427274
5
5
  SHA512:
6
- metadata.gz: d34bcd7b0162fe6f911bdd8c3b626dd9ce35b98139bca6ec1b54e653bc7e50453af734c74134745a46e91fbc3528d88f1663fabdbd35e06ae908a41f8e81dcd7
7
- data.tar.gz: 2ebbc65f373a3e34a2e8300d40bb239df6d2003dab54b9c461a8cc75f651800d93273aa73bfb4f1a14f195e65f348c218b90b4c1bae860c8269e024ebb2e890c
6
+ metadata.gz: 52e775ae2a4ced22d2879fff48108974bfe4087333b70d74c5c0910229aa7298826664bbd474dd9b4221777637cc51bf969735a830df976f58f0ff7302dd69c5
7
+ data.tar.gz: 31b8f95ff8fbcba01d60fe2377699a9abebc61a28d74afc05aa8456d6660f786ee268c2a621c0dc83490e3f9b04dc5e1a60319ae6f4c54503b3dfab9d39d520e
data/README.md CHANGED
@@ -217,7 +217,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
217
217
  - NEF examples are downloaded from http://www.rawsamples.ch/ and are Creative Common Licensed.
218
218
 
219
219
  ### OGG
220
- - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `with_garbage_at_the_end.ogg` have been generated by the project contributors
220
+ - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `invalid_with_garbage_at_the_end.ogg` have been generated by the project contributors
221
221
 
222
222
  ### PDF
223
223
  - PDF 2.0 files downloaded from the [PDF Association public Github repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.
@@ -236,7 +236,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
236
236
  ### WAV
237
237
  - c_11k16bitpcm.wav and c_8kmp316.wav are from [Wikipedia WAV](https://en.wikipedia.org/wiki/WAV#Comparison_of_coding_schemes), retrieved January 7, 2018
238
238
  - c_39064__alienbomb__atmo-truck.wav is from [freesound](https://freesound.org/people/alienbomb/sounds/39064/) and is CC0 licensed
239
- - c_M1F1-Alaw-AFsp.wav and d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
239
+ - c_M1F1-Alaw-AFsp.wav and invalid_d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
240
240
 
241
241
  ### WEBP
242
242
  - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '2.7.0'
2
+ VERSION = '2.7.1'
3
3
  end
data/lib/format_parser.rb CHANGED
@@ -36,6 +36,9 @@ module FormatParser
36
36
  # The value will ensure the parser having it will be applied to the file last.
37
37
  LEAST_PRIORITY = 99
38
38
 
39
+ @registered_natures = []
40
+ @registered_formats = []
41
+
39
42
  # Register a parser object to be used to perform file format detection. Each parser FormatParser
40
43
  # provides out of the box registers itself using this method.
41
44
  #
@@ -68,9 +71,20 @@ module FormatParser
68
71
  end
69
72
  @parser_priorities ||= {}
70
73
  @parser_priorities[callable_parser] = priority
74
+
75
+ @registered_natures |= parser_provided_natures
76
+ @registered_formats |= parser_provided_formats
71
77
  end
72
78
  end
73
79
 
80
+ def self.registered_natures
81
+ @registered_natures
82
+ end
83
+
84
+ def self.registered_formats
85
+ @registered_formats
86
+ end
87
+
74
88
  # Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
75
89
  # tests, but can also be used to forcibly disable some formats completely.
76
90
  #
@@ -76,6 +76,11 @@ class FormatParser::MP3Parser
76
76
  io.seek(0)
77
77
  return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
78
78
 
79
+ # Prevention against parsing WAV files.
80
+ io.seek(0)
81
+ wav_chunk_id, _wav_size, wav_riff_type = safe_read(io, 12).unpack('a4la4')
82
+ return if wav_chunk_id == 'RIFF' || wav_riff_type == 'WAVE'
83
+
79
84
  # Read all the ID3 tags (or at least attempt to)
80
85
  io.seek(0)
81
86
  id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
@@ -315,5 +320,5 @@ class FormatParser::MP3Parser
315
320
  end
316
321
  end
317
322
 
318
- FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 99
323
+ FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 101
319
324
  end
@@ -34,6 +34,26 @@ describe FormatParser do
34
34
  end
35
35
  end
36
36
 
37
+ it "fixtures with 'invalid' in the filename should fail to parse" do
38
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
39
+ file_name = File.basename(fixture_path)
40
+ next unless file_name.include? "invalid"
41
+ File.open(fixture_path, 'rb') do |file|
42
+ FormatParser.parse(file)
43
+ end
44
+ end
45
+ end
46
+
47
+ it "fixtures without 'invalid' in the filename should be parsed successfully" do
48
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
49
+ file_name = File.basename(fixture_path)
50
+ next if file_name.include? "invalid"
51
+ File.open(fixture_path, 'rb') do |file|
52
+ FormatParser.parse(file)
53
+ end
54
+ end
55
+ end
56
+
37
57
  it 'triggers parsers in a certain order that corresponds to the parser priorities' do
38
58
  file_contents = StringIO.new('a' * 4096)
39
59
 
@@ -189,12 +209,20 @@ describe FormatParser do
189
209
  'FormatParser::CR3Parser',
190
210
  'FormatParser::DPXParser',
191
211
  'FormatParser::FLACParser',
192
- 'FormatParser::MP3Parser',
193
212
  'FormatParser::OggParser',
194
213
  'FormatParser::TIFFParser',
195
- 'FormatParser::WAVParser'
214
+ 'FormatParser::WAVParser',
215
+ 'FormatParser::MP3Parser'
196
216
  ])
197
217
  end
218
+
219
+ it 'ensures that MP3 parser is the last one among all' do
220
+ natures = FormatParser.registered_natures
221
+ formats = FormatParser.registered_formats
222
+ prioritised_parsers = FormatParser.parsers_for(natures, formats)
223
+ parser_class_names = prioritised_parsers.map { |parser| parser.class.name }
224
+ expect(parser_class_names.last).to eq 'FormatParser::MP3Parser'
225
+ end
198
226
  end
199
227
 
200
228
  describe '.register_parser and .deregister_parser' do
@@ -55,7 +55,7 @@ describe FormatParser::FLACParser do
55
55
  end
56
56
 
57
57
  it 'raises an error when sample rate is 0' do
58
- fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
58
+ fpath = fixtures_dir + 'FLAC/invalid_sample_rate_0.flac'
59
59
 
60
60
  expect {
61
61
  subject.call(File.open(fpath, 'rb'))
@@ -99,7 +99,7 @@ describe FormatParser::JSONParser do
99
99
 
100
100
  describe 'When reading objects invalid JSON files' do
101
101
  it "rejects files with corrupted JSON data" do
102
- io = load_file 'malformed.json'
102
+ io = load_file 'invalid_malformed.json'
103
103
 
104
104
  parsed = subject.call(io)
105
105
 
@@ -107,7 +107,7 @@ describe FormatParser::JSONParser do
107
107
  end
108
108
 
109
109
  it "rejects invalid files early without reading the whole content" do
110
- io = load_file 'lorem_ipsum.json'
110
+ io = load_file 'invalid_lorem_ipsum.json'
111
111
 
112
112
  parsed = subject.call(io)
113
113
 
@@ -11,7 +11,7 @@ describe FormatParser::M3UParser do
11
11
  end
12
12
 
13
13
  describe 'an m3u file with missing header' do
14
- let(:m3u_file) { 'plain_text.m3u' }
14
+ let(:m3u_file) { 'invalid_plain_text.m3u' }
15
15
 
16
16
  it 'does not parse the file successfully' do
17
17
  expect(parsed_m3u).to be_nil
@@ -36,6 +36,12 @@ describe FormatParser::MP3Parser do
36
36
  expect(parsed).to be_nil
37
37
  end
38
38
 
39
+ it 'does not misdetect a WAV' do
40
+ fpath = fixtures_dir + '/WAV/c_SCAM_MIC_SOL001_RUN001.wav'
41
+ parsed = subject.call(File.open(fpath, 'rb'))
42
+ expect(parsed).to be_nil
43
+ end
44
+
39
45
  describe 'title/artist/album attributes' do
40
46
  let(:parsed) { subject.call(File.open(fpath, 'rb')) }
41
47
 
@@ -13,7 +13,7 @@ describe FormatParser::OggParser do
13
13
  end
14
14
 
15
15
  it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do
16
- parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/with_garbage_at_the_end.ogg', 'rb'))
16
+ parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/invalid_with_garbage_at_the_end.ogg', 'rb'))
17
17
  expect(parse_result).to be_nil
18
18
  end
19
19
 
@@ -46,17 +46,17 @@ describe FormatParser::PDFParser do
46
46
 
47
47
  describe 'broken PDF files should not parse' do
48
48
  it 'PDF with missing version header' do
49
- parsed_pdf = parse_pdf 'not_a.pdf'
49
+ parsed_pdf = parse_pdf 'invalid_not_a.pdf'
50
50
  expect(parsed_pdf).to be_nil
51
51
  end
52
52
 
53
53
  it 'PDF 2.0 with offset start' do
54
- parsed_pdf = parse_pdf 'PDF 2.0 with offset start.pdf'
54
+ parsed_pdf = parse_pdf 'invalid PDF 2.0 with offset start.pdf'
55
55
  expect(parsed_pdf).to be_nil
56
56
  end
57
57
 
58
58
  it 'exceeds the PDF read limit' do
59
- parsed_pdf = parse_pdf 'exceed_PDF_read_limit.pdf'
59
+ parsed_pdf = parse_pdf 'invalid_exceed_PDF_read_limit.pdf'
60
60
  expect(parsed_pdf).to be_nil
61
61
  end
62
62
  end
@@ -48,7 +48,7 @@ describe FormatParser::WAVParser do
48
48
 
49
49
  it "cannot parse file with audio format different from 1 and no 'fact' chunk" do
50
50
  expect {
51
- subject.call(File.open(__dir__ + '/../fixtures/WAV/d_6_Channel_ID.wav', 'rb'))
51
+ subject.call(File.open(__dir__ + '/../fixtures/WAV/invalid_d_6_Channel_ID.wav', 'rb'))
52
52
  }.to raise_error(FormatParser::IOUtils::InvalidRead)
53
53
  end
54
54
  end
@@ -7,7 +7,7 @@ describe FormatParser::WebpParser do
7
7
  end
8
8
 
9
9
  it 'does not parse files with an unrecognised variant' do
10
- result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
10
+ result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-unrecognised-variant.webp', 'rb'))
11
11
  expect(result).to be_nil
12
12
  end
13
13
 
@@ -104,6 +104,43 @@ describe 'Fetching data from HTTP remotes' do
104
104
  expect(file_information.format).to eq(:png)
105
105
  end
106
106
 
107
+ describe 'correctly parses WAV files without falling back to another filetype' do
108
+ ['c_8kmp316.wav', 'c_SCAM_MIC_SOL001_RUN001.wav'].each do |filename|
109
+ it "parses WAV file #{filename}" do
110
+ remote_url = 'http://localhost:9399/WAV/' + filename
111
+ file_information = FormatParser.parse_http(remote_url)
112
+ expect(file_information).not_to be_nil
113
+ expect(file_information.format).to eq(:wav)
114
+ end
115
+ end
116
+ end
117
+
118
+ describe "correctly parses files over HTTP without filename hint" do
119
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
120
+ file_name = File.basename(fixture_path)
121
+ next if file_name.include? "invalid"
122
+
123
+ file_type_dir = fixture_path.delete_prefix(fixtures_dir).delete_suffix(file_name)
124
+ file_type_dir.delete_prefix!('/').delete_suffix!('/')
125
+ next if file_type_dir.empty?
126
+
127
+ # skipping this one because it's a special case
128
+ next if file_name == "arch_many_entries.zip"
129
+
130
+ it "parses #{file_type_dir} file: #{file_name}" do
131
+ url = "http://localhost:9399/#{file_type_dir}/#{file_name}?some_param=test".gsub(' ', '%20')
132
+ result_with_hint = FormatParser.parse_http(url, filename_hint: file_name)
133
+ result_no_hint = FormatParser.parse_http(url)
134
+
135
+ expect(result_with_hint).not_to be_nil
136
+ expect(result_no_hint).not_to be_nil
137
+
138
+ expect(result_no_hint.nature).to eq(result_with_hint.nature)
139
+ expect(result_no_hint.format).to eq(result_with_hint.format)
140
+ end
141
+ end
142
+ end
143
+
107
144
  describe 'when parsing remote fixtures' do
108
145
  Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
109
146
  filename = File.basename(fixture_path)
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.7.0
4
+ version: 2.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-06-27 00:00:00.000000000 Z
12
+ date: 2023-08-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -319,7 +319,7 @@ licenses:
319
319
  - MIT (Hippocratic)
320
320
  metadata:
321
321
  allowed_push_host: https://rubygems.org
322
- post_install_message:
322
+ post_install_message:
323
323
  rdoc_options: []
324
324
  require_paths:
325
325
  - lib
@@ -334,8 +334,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.3.7
338
- signing_key:
337
+ rubygems_version: 3.1.6
338
+ signing_key:
339
339
  specification_version: 4
340
340
  summary: A library for efficient parsing of file metadata
341
341
  test_files: []