format_parser 2.6.0 → 2.7.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ce8a7c3fd258ccf2abcee6ea8731c0337e7255dfb189ccfb5fdfc02e9dd9b36
4
- data.tar.gz: a6ff6ea6e771f2636e30cfee70b3c22584cb85e7020a97483f66e2c482048444
3
+ metadata.gz: 5d2679a365f7c735d2b8c962765c4783b6336bf23461ffde1705ab141d250591
4
+ data.tar.gz: e1e4b4caa2956cbf1653d39498a9db84589cd0c9c979c6d84e9f3b3027427274
5
5
  SHA512:
6
- metadata.gz: e27bd51913d3a3b3d061ec27379acc4a15fa19ee2e16cccdbf5a0aec4daf8fe26cdfc0062115119616384ded92c645e4a88a6d0244b47b33c5fc79903ccd3906
7
- data.tar.gz: 024756cdb460347f36cd1aa247b04f5468982b4c502cf049e8c8abd5cb1020f960006f552c7659770c6149ff6ed3a61def2beb44160236f35fe35533579f6c31
6
+ metadata.gz: 52e775ae2a4ced22d2879fff48108974bfe4087333b70d74c5c0910229aa7298826664bbd474dd9b4221777637cc51bf969735a830df976f58f0ff7302dd69c5
7
+ data.tar.gz: 31b8f95ff8fbcba01d60fe2377699a9abebc61a28d74afc05aa8456d6660f786ee268c2a621c0dc83490e3f9b04dc5e1a60319ae6f4c54503b3dfab9d39d520e
data/README.md CHANGED
@@ -26,6 +26,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
26
26
  * HEIC
27
27
  * HEIF
28
28
  * JPEG
29
+ * JSON
29
30
  * M3U
30
31
  * M4A
31
32
  * M4B
@@ -216,7 +217,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
216
217
  - NEF examples are downloaded from http://www.rawsamples.ch/ and are Creative Commons licensed.
217
218
 
218
219
  ### OGG
219
- - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `with_garbage_at_the_end.ogg` have been generated by the project contributors
220
+ - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `invalid_with_garbage_at_the_end.ogg` have been generated by the project contributors
220
221
 
221
222
  ### PDF
222
223
  - PDF 2.0 files downloaded from the [PDF Association public Github repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.
@@ -235,7 +236,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
235
236
  ### WAV
236
237
  - c_11k16bitpcm.wav and c_8kmp316.wav are from [Wikipedia WAV](https://en.wikipedia.org/wiki/WAV#Comparison_of_coding_schemes), retrieved January 7, 2018
237
238
  - c_39064__alienbomb__atmo-truck.wav is from [freesound](https://freesound.org/people/alienbomb/sounds/39064/) and is CC0 licensed
238
- - c_M1F1-Alaw-AFsp.wav and d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
239
+ - c_M1F1-Alaw-AFsp.wav and invalid_d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
239
240
 
240
241
  ### WEBP
241
242
  - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '2.6.0'
2
+ VERSION = '2.7.1'
3
3
  end
data/lib/format_parser.rb CHANGED
@@ -17,6 +17,7 @@ module FormatParser
17
17
  require_relative 'read_limits_config'
18
18
  require_relative 'remote_io'
19
19
  require_relative 'io_constraint'
20
+ require_relative 'utf8_reader'
20
21
  require_relative 'care'
21
22
  require_relative 'active_storage/blob_analyzer'
22
23
  require_relative 'text'
@@ -35,6 +36,9 @@ module FormatParser
35
36
  # The value will ensure the parser having it will be applied to the file last.
36
37
  LEAST_PRIORITY = 99
37
38
 
39
+ @registered_natures = []
40
+ @registered_formats = []
41
+
38
42
  # Register a parser object to be used to perform file format detection. Each parser FormatParser
39
43
  # provides out of the box registers itself using this method.
40
44
  #
@@ -67,9 +71,20 @@ module FormatParser
67
71
  end
68
72
  @parser_priorities ||= {}
69
73
  @parser_priorities[callable_parser] = priority
74
+
75
+ @registered_natures |= parser_provided_natures
76
+ @registered_formats |= parser_provided_formats
70
77
  end
71
78
  end
72
79
 
80
+ def self.registered_natures
81
+ @registered_natures
82
+ end
83
+
84
+ def self.registered_formats
85
+ @registered_formats
86
+ end
87
+
73
88
  # Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
74
89
  # tests, but can also be used to forcibly disable some formats completely.
75
90
  #
@@ -0,0 +1,319 @@
1
+ ##
2
+ # This class checks whether a given file is a valid JSON file.
3
+ # The validation process DOES NOT assemble an object with the contents of the JSON file in memory.
4
+ # Instead, it implements a simple state-machine-like process that digests the contents of the file while traversing
5
+ # the hierarchy of nodes in the document.
6
+ #
7
+ # Although this is based on the IETF standard (https://www.rfc-editor.org/rfc/rfc8259),
8
+ # it does cut a few corners for the sake of simplicity. For instance, instead of validating
9
+ # Numbers, "true", "false" and "null" tokens, it supports a type called Literal to hold generic sequences of characters.
10
+ # This decision makes the implementation simpler while being a good-enough approach to identify JSON files.
11
+ #
12
+ # There is also a cap. Large files are not read all the way through. Instead, if the beginning of the file is
13
+ # JSON-compliant, it is assumed that the file is a JSON file.
14
+
15
+ class FormatParser::JSONParser::Validator
16
+ class JSONParserError < StandardError
17
+ end
18
+
19
+ MAX_SAMPLE_SIZE = 1024
20
+ MAX_LITERAL_SIZE = 30 # much larger than necessary.
21
+ ESCAPE_CHAR = "\\"
22
+ WHITESPACE_CHARS = [" ", "\t", "\n", "\r"]
23
+ ENDING_VALUE_CHARS = [",", "]", "}"]
24
+ LITERALS_CHAR_TEMPLATE = /\w|[+\-.]/ # any alphanumeric, "+", "-" and "."
25
+
26
+ def initialize(io)
27
+ @io = io
28
+ @current_node = nil # :object, :array, :string, :literal
29
+ @parent_nodes = []
30
+ @current_state = :awaiting_root_node
31
+ @escape_next = false
32
+ @current_literal_size = 0
33
+ @pos = 0
34
+
35
+ @all_parsers = {}
36
+
37
+ @execution_stats = {
38
+ array: 0,
39
+ object: 0,
40
+ literal: 0,
41
+ string: 0
42
+ }
43
+
44
+ setup_transitions
45
+ end
46
+
47
+ def validate
48
+ char_reader = FormatParser::UTF8Reader.new(@io)
49
+
50
+ while (c = char_reader.read_char)
51
+ @pos += 1
52
+ parse_char c
53
+
54
+ # Halt validation if the sampling limit is reached.
55
+ if @pos >= MAX_SAMPLE_SIZE
56
+ raise JSONParserError, "Invalid JSON file" if @current_state == :awaiting_root_node
57
+ return false
58
+ end
59
+ end
60
+
61
+ # Raising error in case the EOF is reached earlier than expected
62
+ raise JSONParserError, "Incomplete JSON file" if @current_state != :closed
63
+ true
64
+ rescue FormatParser::UTF8Reader::UTF8CharReaderError
65
+ raise JSONParserError, "Invalid UTF-8 character"
66
+ end
67
+
68
+ def stats(node_type)
69
+ @execution_stats[node_type]
70
+ end
71
+
72
+ private
73
+
74
+ def setup_transitions
75
+ when_its :awaiting_root_node, ->(c) do
76
+ read_whitespace(c) or
77
+ start_object(c) or
78
+ start_array(c)
79
+ end
80
+
81
+ when_its :awaiting_object_attribute_key, ->(c) do
82
+ read_whitespace(c) or
83
+ start_attribute_key(c) or
84
+ close_object(c)
85
+ end
86
+
87
+ when_its :reading_object_attribute_key, ->(c) do
88
+ close_attribute_key(c) or
89
+ read_valid_string_char(c)
90
+ end
91
+
92
+ when_its :awaiting_object_colon_separator, ->(c) do
93
+ read_whitespace(c) or
94
+ read_colon(c)
95
+ end
96
+
97
+ when_its :awaiting_object_attribute_value, ->(c) do
98
+ read_whitespace(c) or
99
+ start_object(c) or
100
+ start_array(c) or
101
+ start_string(c) or
102
+ start_literal(c)
103
+ end
104
+
105
+ when_its :awaiting_array_value, ->(c) do
106
+ read_whitespace(c) or
107
+ start_object(c) or
108
+ start_array(c) or
109
+ start_string(c) or
110
+ start_literal(c) or
111
+ close_array(c)
112
+ end
113
+
114
+ when_its :reading_string, ->(c) do
115
+ close_string(c) or
116
+ read_valid_string_char(c)
117
+ end
118
+
119
+ when_its :awaiting_next_or_close, ->(c) do
120
+ read_whitespace(c) or
121
+ read_comma_separator(c) or
122
+ close_object(c) or
123
+ close_array(c)
124
+ end
125
+
126
+ when_its :reading_literal, ->(c) do
127
+ read_valid_literal_char(c) or (
128
+ close_literal(c) and (
129
+ read_whitespace(c) or
130
+ read_comma_separator(c) or
131
+ close_array(c) or
132
+ close_object(c)))
133
+ end
134
+
135
+ when_its :closed, ->(c) do
136
+ read_whitespace(c)
137
+ end
138
+ end
139
+
140
+ def when_its(state, act)
141
+ @all_parsers[state] = act
142
+ end
143
+
144
+ def parse_char(c)
145
+ next_step = @all_parsers[@current_state]
146
+ accepted = next_step.call(c)
147
+ reject_char(c) unless accepted
148
+ end
149
+
150
+ def read_whitespace(c)
151
+ whitespace?(c)
152
+ end
153
+
154
+ def read_colon(c)
155
+ if c == ":"
156
+ @current_state = :awaiting_object_attribute_value
157
+ return true
158
+ end
159
+ false
160
+ end
161
+
162
+ def read_valid_string_char(c)
163
+ if @escape_next
164
+ @escape_next = false
165
+ return true
166
+ end
167
+
168
+ if c == ESCAPE_CHAR
169
+ @escape_next = true
170
+ return true
171
+ end
172
+ !control_char?(c) and c != "\""
173
+ end
174
+
175
+ def read_valid_literal_char(c)
176
+ if valid_literal_char?(c)
177
+ @current_literal_size += 1
178
+ return true
179
+ end
180
+
181
+ false
182
+ end
183
+
184
+ def read_comma_separator(c)
185
+ if c == ","
186
+ @current_state = :awaiting_object_attribute_key if @current_node == :object
187
+ @current_state = :awaiting_array_value if @current_node == :array
188
+ return true
189
+ end
190
+ false
191
+ end
192
+
193
+ # Object: {"k1":"val", "k2":[1,2,3], "k4": undefined, "k5": {"l1": 6}}
194
+ def start_object(c)
195
+ return false if whitespace?(c)
196
+ return false unless c == "{"
197
+
198
+ begin_node(:object)
199
+ @current_state = :awaiting_object_attribute_key
200
+ true
201
+ end
202
+
203
+ def close_object(c)
204
+ return false if whitespace?(c)
205
+ return false unless @current_node == :object and c == "}"
206
+
207
+ end_node
208
+ @current_state = :awaiting_next_or_close unless @current_node.nil?
209
+ true
210
+ end
211
+
212
+ # Array: [1, "two", true, undefined, {}, []]
213
+ def start_array(c)
214
+ return false unless c == "["
215
+
216
+ begin_node(:array)
217
+ @current_state = :awaiting_array_value
218
+ true
219
+ end
220
+
221
+ def close_array(c)
222
+ return false if whitespace?(c)
223
+ return false unless @current_node == :array and c == "]"
224
+
225
+ end_node
226
+ @current_state = :awaiting_next_or_close unless @current_node.nil?
227
+ true
228
+ end
229
+
230
+ def start_attribute_key(c)
231
+ return false unless c == "\""
232
+
233
+ begin_node(:string)
234
+ @current_state = :reading_object_attribute_key
235
+ true
236
+ end
237
+
238
+ def close_attribute_key(c)
239
+ return false if @escape_next
240
+ return false unless c == "\""
241
+ end_node
242
+ @current_state = :awaiting_object_colon_separator
243
+ true
244
+ end
245
+
246
+ # Strings: "Foo"
247
+ def start_string(c)
248
+ return false unless c == "\""
249
+
250
+ begin_node(:string)
251
+ @current_state = :reading_string
252
+ true
253
+ end
254
+
255
+ def close_string(c)
256
+ return false if @escape_next
257
+ return false unless c == "\""
258
+ end_node
259
+ @current_state = :awaiting_next_or_close
260
+ true
261
+ end
262
+
263
+ # literals: null, undefined, true, false, NaN, infinity, -123.456e10 -123,456e10
264
+ def start_literal(c)
265
+ return false unless valid_literal_char?(c)
266
+
267
+ begin_node(:literal)
268
+ @current_state = :reading_literal
269
+ @current_literal_size = 1
270
+ true
271
+ end
272
+
273
+ def close_literal(c)
274
+ raise JSONParserError, "Literal to large at #{@pos}" if @current_literal_size > MAX_LITERAL_SIZE
275
+
276
+ if whitespace?(c) || ENDING_VALUE_CHARS.include?(c)
277
+ end_node
278
+ @current_state = :awaiting_next_or_close
279
+ return true
280
+ end
281
+
282
+ false
283
+ end
284
+
285
+ # Marks the creation of a node (object, array, string or literal)
286
+ def begin_node(node_type)
287
+ # Accounting for the new node
288
+ @execution_stats[node_type] ||= 0
289
+ @execution_stats[node_type] += 1
290
+
291
+ # Managing the node execution stack
292
+ @parent_nodes.push(@current_node)
293
+ @current_node = node_type
294
+ end
295
+
296
+ # Marks the closure of a node (object, array, string or literal)
297
+ def end_node
298
+ @current_node = @parent_nodes.pop
299
+ @current_state = :closed if @current_node.nil?
300
+ end
301
+
302
+ def reject_char(char)
303
+ raise JSONParserError, "Unexpected char #{char} in position #{@pos}"
304
+ end
305
+
306
+ def whitespace?(c)
307
+ WHITESPACE_CHARS.include?(c)
308
+ end
309
+
310
+ def control_char?(c)
311
+ # control characters: (U+0000 through U+001F)
312
+ utf8_code = c.unpack('U*')[0]
313
+ utf8_code <= 31
314
+ end
315
+
316
+ def valid_literal_char?(c)
317
+ LITERALS_CHAR_TEMPLATE === c
318
+ end
319
+ end
@@ -0,0 +1,25 @@
1
+ class FormatParser::JSONParser
2
+ include FormatParser::IOUtils
3
+ require_relative 'json_parser/validator'
4
+
5
+ JSON_MIME_TYPE = 'application/json'
6
+
7
+ def likely_match?(filename)
8
+ filename =~ /\.json$/i
9
+ end
10
+
11
+ def call(io)
12
+ io = FormatParser::IOConstraint.new(io)
13
+ validator = Validator.new(io)
14
+
15
+ validator.validate
16
+
17
+ FormatParser::Text.new(
18
+ format: :json,
19
+ content_type: JSON_MIME_TYPE,
20
+ )
21
+ rescue Validator::JSONParserError
22
+ nil
23
+ end
24
+ FormatParser.register_parser new, natures: :text, formats: :json
25
+ end
@@ -76,6 +76,11 @@ class FormatParser::MP3Parser
76
76
  io.seek(0)
77
77
  return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
78
78
 
79
+ # Prevention against parsing WAV files.
80
+ io.seek(0)
81
+ wav_chunk_id, _wav_size, wav_riff_type = safe_read(io, 12).unpack('a4la4')
82
+ return if wav_chunk_id == 'RIFF' || wav_riff_type == 'WAVE'
83
+
79
84
  # Read all the ID3 tags (or at least attempt to)
80
85
  io.seek(0)
81
86
  id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
@@ -315,5 +320,5 @@ class FormatParser::MP3Parser
315
320
  end
316
321
  end
317
322
 
318
- FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 99
323
+ FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 101
319
324
  end
@@ -0,0 +1,68 @@
1
+ ##
2
+ # This class reads individual characters from files using UTF-8 encoding.
3
+ # This deals with two main concerns:
4
+ # - Variable byte length of characters
5
+ # - Reducing the number of read operations by loading bytes in chunks
6
+
7
+ class FormatParser::UTF8Reader
8
+ READ_CHUNK_SIZE = 128
9
+
10
+ class UTF8CharReaderError < StandardError
11
+ end
12
+
13
+ def initialize(io)
14
+ @io = io
15
+ @chunk = ""
16
+ @index = 0
17
+ @eof = false
18
+ end
19
+
20
+ def read_char
21
+ first_byte = read_byte
22
+ return if first_byte.nil?
23
+
24
+ char_length = assess_char_length(first_byte)
25
+ as_bytes = Array.new(char_length) do |i|
26
+ next first_byte if i == 0
27
+ read_byte
28
+ end
29
+
30
+ char = as_bytes.pack('c*').force_encoding('UTF-8')
31
+ raise UTF8CharReaderError, "Invalid UTF-8 character" unless char.valid_encoding?
32
+
33
+ char
34
+ rescue TypeError
35
+ raise UTF8CharReaderError, "Invalid UTF-8 character"
36
+ end
37
+
38
+ private
39
+
40
+ def read_byte
41
+ manage_data_chunk
42
+ return if @chunk.nil?
43
+ byte = @chunk.bytes[@index]
44
+ @index += 1 unless byte.nil?
45
+ byte
46
+ end
47
+
48
+ def manage_data_chunk
49
+ return if @index < @chunk.length
50
+ @chunk = @io.read(READ_CHUNK_SIZE)
51
+ @chunk ||= ""
52
+ @index = 0
53
+ @eof = true if @chunk.nil? or @chunk.length < READ_CHUNK_SIZE
54
+ end
55
+
56
+ def assess_char_length(first_byte)
57
+ # 0_______ (1 byte)
58
+ # 110_____ (2 bytes) 192
59
+ # 1110____ (3 bytes) 224
60
+ # 11110___ (4 bytes) 240
61
+ case first_byte
62
+ when 240.. then 4
63
+ when 224..239 then 3
64
+ when 192..223 then 2
65
+ else 1
66
+ end
67
+ end
68
+ end
@@ -34,6 +34,26 @@ describe FormatParser do
34
34
  end
35
35
  end
36
36
 
37
+ it "fixtures with 'invalid' in the filename should fail to parse" do
38
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
39
+ file_name = File.basename(fixture_path)
40
+ next unless file_name.include? "invalid"
41
+ File.open(fixture_path, 'rb') do |file|
42
+ FormatParser.parse(file)
43
+ end
44
+ end
45
+ end
46
+
47
+ it "fixtures without 'invalid' in the filename should be parsed successfully" do
48
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
49
+ file_name = File.basename(fixture_path)
50
+ next if file_name.include? "invalid"
51
+ File.open(fixture_path, 'rb') do |file|
52
+ FormatParser.parse(file)
53
+ end
54
+ end
55
+ end
56
+
37
57
  it 'triggers parsers in a certain order that corresponds to the parser priorities' do
38
58
  file_contents = StringIO.new('a' * 4096)
39
59
 
@@ -189,12 +209,20 @@ describe FormatParser do
189
209
  'FormatParser::CR3Parser',
190
210
  'FormatParser::DPXParser',
191
211
  'FormatParser::FLACParser',
192
- 'FormatParser::MP3Parser',
193
212
  'FormatParser::OggParser',
194
213
  'FormatParser::TIFFParser',
195
- 'FormatParser::WAVParser'
214
+ 'FormatParser::WAVParser',
215
+ 'FormatParser::MP3Parser'
196
216
  ])
197
217
  end
218
+
219
+ it 'ensures that MP3 parser is the last one among all' do
220
+ natures = FormatParser.registered_natures
221
+ formats = FormatParser.registered_formats
222
+ prioritised_parsers = FormatParser.parsers_for(natures, formats)
223
+ parser_class_names = prioritised_parsers.map { |parser| parser.class.name }
224
+ expect(parser_class_names.last).to eq 'FormatParser::MP3Parser'
225
+ end
198
226
  end
199
227
 
200
228
  describe '.register_parser and .deregister_parser' do
@@ -55,7 +55,7 @@ describe FormatParser::FLACParser do
55
55
  end
56
56
 
57
57
  it 'raises an error when sample rate is 0' do
58
- fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
58
+ fpath = fixtures_dir + 'FLAC/invalid_sample_rate_0.flac'
59
59
 
60
60
  expect {
61
61
  subject.call(File.open(fpath, 'rb'))
@@ -0,0 +1,321 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::JSONParser::Validator do
4
+ def load_file(file_name)
5
+ io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
6
+ FormatParser::JSONParser::Validator.new(io)
7
+ end
8
+
9
+ def load_string(content)
10
+ io = StringIO.new(content.encode(Encoding::UTF_8))
11
+ FormatParser::JSONParser::Validator.new(io)
12
+ end
13
+
14
+ describe 'When reading root nodes' do
15
+ it "identifies objects as root nodes" do
16
+ v = load_string '{"key": "value"}'
17
+
18
+ completed = v.validate
19
+
20
+ expect(completed).to be true
21
+ expect(v.stats(:object)).to be 1
22
+ expect(v.stats(:string)).to be 2
23
+ end
24
+
25
+ it "identifies arrays as root nodes" do
26
+ v = load_string '["e1", "e2"]'
27
+
28
+ completed = v.validate
29
+
30
+ expect(completed).to be true
31
+ expect(v.stats(:array)).to be 1
32
+ expect(v.stats(:string)).to be 2
33
+ end
34
+
35
+ it "rejects strings as root nodes" do
36
+ expect do
37
+ v = load_string '"this is a string"'
38
+ v.validate
39
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
40
+ end
41
+
42
+ it "rejects literals as root nodes" do
43
+ expect do
44
+ v = load_string 'true'
45
+ v.validate
46
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
47
+ end
48
+ end
49
+
50
+ describe 'When reading objects' do
51
+ it "recognizes empty objects" do
52
+ v = load_string '{}'
53
+
54
+ completed = v.validate
55
+ expect(completed).to be true
56
+ expect(v.stats(:object)).to be 1
57
+ expect(v.stats(:string)).to be 0
58
+ end
59
+
60
+ it "recognizes objects with a single attribute" do
61
+ v = load_string '{"key": "value"}'
62
+
63
+ completed = v.validate
64
+ expect(completed).to be true
65
+ expect(v.stats(:object)).to be 1
66
+ expect(v.stats(:string)).to be 2
67
+ end
68
+
69
+ it "recognizes objects with attributes of different types" do
70
+ v = load_string '{"k1": "value", "k2": -123.456, "k3": null}'
71
+
72
+ completed = v.validate
73
+ expect(completed).to be true
74
+ expect(v.stats(:object)).to be 1
75
+ expect(v.stats(:string)).to be 4
76
+ expect(v.stats(:literal)).to be 2
77
+ end
78
+
79
+ it "recognizes condensed objects (no whitespaces)" do
80
+ v = load_string '{"a":"b","c":"d"}'
81
+
82
+ completed = v.validate
83
+ expect(completed).to be true
84
+ expect(v.stats(:object)).to be 1
85
+ expect(v.stats(:string)).to be 4
86
+ end
87
+
88
+ it "recognizes formatted objects" do
89
+ v = load_string '{
90
+ "a":"b",
91
+ "c":"d"
92
+ }'
93
+
94
+ completed = v.validate
95
+ expect(completed).to be true
96
+ expect(v.stats(:object)).to be 1
97
+ expect(v.stats(:string)).to be 4
98
+ end
99
+
100
+ it "recognizes objects with nested objects and arrays" do
101
+ v = load_string '{
102
+ "a": {
103
+ "a1": "-",
104
+ "a2": "-",
105
+ "a3": {
106
+ "a3.1": "-"
107
+ },
108
+ },
109
+ "c": [1, null]
110
+ }'
111
+
112
+ completed = v.validate
113
+ expect(completed).to be true
114
+ expect(v.stats(:object)).to be 3
115
+ expect(v.stats(:array)).to be 1
116
+ expect(v.stats(:string)).to be 9
117
+ expect(v.stats(:literal)).to be 2
118
+ end
119
+
120
+ it "rejects objects without double-quoted attribute names" do
121
+ expect do
122
+ v = load_string '{a:"b",c:"d"}'
123
+ v.validate
124
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
125
+ end
126
+
127
+ it "rejects objects without comma separators" do
128
+ expect do
129
+ v = load_string '{
130
+ "a":"b"
131
+ "c":"d"
132
+ }'
133
+ v.validate
134
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
135
+ end
136
+ end
137
+
138
+ describe 'When reading arrays' do
139
+ it "recognizes empty arrays" do
140
+ v = load_string '[]'
141
+
142
+ completed = v.validate
143
+ expect(completed).to be true
144
+ expect(v.stats(:array)).to be 1
145
+ expect(v.stats(:string)).to be 0
146
+ end
147
+
148
+ it "recognizes arrays with a single element" do
149
+ v = load_string '[{}]'
150
+
151
+ completed = v.validate
152
+ expect(completed).to be true
153
+ expect(v.stats(:array)).to be 1
154
+ expect(v.stats(:object)).to be 1
155
+ end
156
+
157
+ it "recognizes arrays with elements of different types" do
158
+ v = load_string '[{"k1": "value"}, [], "a string", null, -123.456]'
159
+
160
+ completed = v.validate
161
+ expect(completed).to be true
162
+ expect(v.stats(:array)).to be 2
163
+ expect(v.stats(:object)).to be 1
164
+ expect(v.stats(:string)).to be 3
165
+ expect(v.stats(:literal)).to be 2
166
+ end
167
+
168
+ it "recognizes condensed arrays (no whitespaces)" do
169
+ v = load_string '["a",2,null,false]'
170
+
171
+ completed = v.validate
172
+ expect(completed).to be true
173
+ expect(v.stats(:array)).to be 1
174
+ expect(v.stats(:string)).to be 1
175
+ expect(v.stats(:literal)).to be 3
176
+ end
177
+
178
+ it "recognizes formatted arrays" do
179
+ v = load_string '[
180
+ {
181
+ "a":"b"
182
+ },
183
+ {
184
+ "c":"d"
185
+ }
186
+ ]'
187
+
188
+ completed = v.validate
189
+ expect(completed).to be true
190
+ expect(v.stats(:array)).to be 1
191
+ expect(v.stats(:object)).to be 2
192
+ expect(v.stats(:string)).to be 4
193
+ end
194
+
195
+ it "recognizes arrays with nested objects and arrays" do
196
+ v = load_string '[{
197
+ "a": {
198
+ "a1": "-",
199
+ "a2": "-",
200
+ "a3": {
201
+ "a3.1": "-"
202
+ },
203
+ },
204
+ "c": [1, null]
205
+ },
206
+ [{ "a": "b" }, { "c":"d" }]
207
+ ]'
208
+
209
+ completed = v.validate
210
+ expect(completed).to be true
211
+ expect(v.stats(:array)).to be 3
212
+ expect(v.stats(:object)).to be 5
213
+ expect(v.stats(:string)).to be 13
214
+ expect(v.stats(:literal)).to be 2
215
+ end
216
+
217
+ it "rejects arrays without comma separators" do
218
+ expect do
219
+ v = load_string '[
220
+ "abc"
221
+ "def"
222
+ ]'
223
+ v.validate
224
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
225
+ end
226
+ end
227
+
228
+ describe 'When reading strings' do
229
+ it "recognizes regular strings" do
230
+ v = load_string '["abc", "def", "ghi"]'
231
+
232
+ completed = v.validate
233
+ expect(completed).to be true
234
+ expect(v.stats(:string)).to be 3
235
+ end
236
+
237
+ it "recognizes strings containing excaped characters" do
238
+ v = load_string '["ab\"c", "6\\2=3"]'
239
+
240
+ completed = v.validate
241
+ expect(completed).to be true
242
+ expect(v.stats(:string)).to be 2
243
+ end
244
+
245
+ it "recognizes strings containing UTF8 characters" do
246
+ v = load_string '["abc😃🐶👀", "😃2🐶3👀"]'
247
+
248
+ completed = v.validate
249
+ expect(completed).to be true
250
+ expect(v.stats(:string)).to be 2
251
+ end
252
+
253
+ it "recognizes long strings containing UTF8 characters" do
254
+ v = load_string '["aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀"]'
255
+
256
+ completed = v.validate
257
+ expect(completed).to be true
258
+ expect(v.stats(:string)).to be 1
259
+ end
260
+ end
261
+
262
+ describe 'When reading literals' do
263
+ it "recognizes numbers" do
264
+ v = load_string '[1, -2.4, 1.0E+2]'
265
+
266
+ completed = v.validate
267
+ expect(completed).to be true
268
+ expect(v.stats(:literal)).to be 3
269
+ end
270
+
271
+ it "recognizes boolean values" do
272
+ v = load_string '[true, false]'
273
+
274
+ completed = v.validate
275
+ expect(completed).to be true
276
+ expect(v.stats(:literal)).to be 2
277
+ end
278
+
279
+ it "recognizes 'true', 'false' and 'null'" do
280
+ v = load_string '[true, false, null]'
281
+
282
+ completed = v.validate
283
+ expect(completed).to be true
284
+ expect(v.stats(:literal)).to be 3
285
+ end
286
+ end
287
+
288
+ describe 'When reading invalid JSON content' do
289
+ it "rejects truncated JSON content" do
290
+ expect do
291
+ v = load_string '[{
292
+ "a": ["abc","def"],
293
+ "b": 4'
294
+ v.validate
295
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
296
+ end
297
+ end
298
+
299
+ describe 'When reading large JSON files' do
300
+ it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON" do
301
+ v = load_file 'long_file_valid.json'
302
+
303
+ completed = v.validate
304
+ expect(completed).to be false
305
+ end
306
+
307
+ it "Returns 'false' without throwing errors when for long non-formatted JSON files" do
308
+ v = load_file 'long_file_valid_non_formatted.json'
309
+
310
+ completed = v.validate
311
+ expect(completed).to be false
312
+ end
313
+
314
+ it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON even if there's an issue later" do
315
+ v = load_file 'long_file_malformed.json'
316
+
317
+ completed = v.validate
318
+ expect(completed).to be false
319
+ end
320
+ end
321
+ end
@@ -0,0 +1,118 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::JSONParser do
4
+ MAX_READS = 100
5
+
6
+ def load_file(file_name)
7
+ io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
8
+ FormatParser::ReadLimiter.new(io, max_reads: MAX_READS)
9
+ end
10
+
11
+ def file_size(file_name)
12
+ File.size(Pathname.new(fixtures_dir).join('JSON').join(file_name))
13
+ end
14
+
15
+ describe 'When reading objects valid JSON files' do
16
+ it "identifies JSON files with objects as root nodes" do
17
+ io = load_file 'object.json'
18
+
19
+ parsed = subject.call(io)
20
+
21
+ expect(parsed).not_to be_nil
22
+ expect(parsed.nature).to eq(:text)
23
+ expect(parsed.format).to eq(:json)
24
+ expect(parsed.content_type).to eq('application/json')
25
+ end
26
+
27
+ it "identifies JSON files carrying arrays as root nodes" do
28
+ io = load_file 'array.json'
29
+
30
+ parsed = subject.call(io)
31
+
32
+ expect(parsed).not_to be_nil
33
+ expect(parsed.nature).to eq(:text)
34
+ expect(parsed.format).to eq(:json)
35
+ expect(parsed.content_type).to eq('application/json')
36
+ end
37
+
38
+ it "identifies formatted JSON files" do
39
+ io = load_file 'formatted_object_utf8.json'
40
+
41
+ parsed = subject.call(io)
42
+
43
+ expect(parsed).not_to be_nil
44
+ expect(parsed.nature).to eq(:text)
45
+ expect(parsed.format).to eq(:json)
46
+ expect(parsed.content_type).to eq('application/json')
47
+ end
48
+
49
+ it "identifies files wrapped in whitespace characters" do
50
+ io = load_file 'whitespaces.json'
51
+
52
+ parsed = subject.call(io)
53
+
54
+ expect(parsed).not_to be_nil
55
+ expect(parsed.nature).to eq(:text)
56
+ expect(parsed.format).to eq(:json)
57
+ expect(parsed.content_type).to eq('application/json')
58
+ end
59
+
60
+ it "identifies files with nested objects and arrays" do
61
+ io = load_file 'nested_objects.json'
62
+
63
+ parsed = subject.call(io)
64
+
65
+ expect(parsed).not_to be_nil
66
+ expect(parsed.nature).to eq(:text)
67
+ expect(parsed.format).to eq(:json)
68
+ expect(parsed.content_type).to eq('application/json')
69
+ end
70
+
71
+ it "is reads the whole content of small files before accepting them" do
72
+ file_name = 'nested_objects.json'
73
+ io = load_file file_name
74
+ file_size = file_size file_name
75
+
76
+ parsed = subject.call(io)
77
+
78
+ expect(parsed).not_to be_nil
79
+ expect(parsed.nature).to eq(:text)
80
+ expect(parsed.format).to eq(:json)
81
+ expect(parsed.content_type).to eq('application/json')
82
+ expect(io.bytes).to be >= file_size
83
+ end
84
+
85
+ it "is accepts long files before reading the whole content" do
86
+ file_name = 'long_array_numbers.json'
87
+ io = load_file file_name
88
+ file_size = file_size file_name
89
+
90
+ parsed = subject.call(io)
91
+
92
+ expect(parsed).not_to be_nil
93
+ expect(parsed.nature).to eq(:text)
94
+ expect(parsed.format).to eq(:json)
95
+ expect(parsed.content_type).to eq('application/json')
96
+ expect(io.bytes).to be < file_size
97
+ end
98
+ end
99
+
100
+ describe 'When reading objects invalid JSON files' do
101
+ it "rejects files with corrupted JSON data" do
102
+ io = load_file 'invalid_malformed.json'
103
+
104
+ parsed = subject.call(io)
105
+
106
+ expect(parsed).to be_nil
107
+ end
108
+
109
+ it "rejects invalid files early without reading the whole content" do
110
+ io = load_file 'invalid_lorem_ipsum.json'
111
+
112
+ parsed = subject.call(io)
113
+
114
+ expect(parsed).to be_nil
115
+ expect(io.reads).to eq(1)
116
+ end
117
+ end
118
+ end
@@ -11,7 +11,7 @@ describe FormatParser::M3UParser do
11
11
  end
12
12
 
13
13
  describe 'an m3u file with missing header' do
14
- let(:m3u_file) { 'plain_text.m3u' }
14
+ let(:m3u_file) { 'invalid_plain_text.m3u' }
15
15
 
16
16
  it 'does not parse the file successfully' do
17
17
  expect(parsed_m3u).to be_nil
@@ -36,6 +36,12 @@ describe FormatParser::MP3Parser do
36
36
  expect(parsed).to be_nil
37
37
  end
38
38
 
39
+ it 'does not misdetect a WAV' do
40
+ fpath = fixtures_dir + '/WAV/c_SCAM_MIC_SOL001_RUN001.wav'
41
+ parsed = subject.call(File.open(fpath, 'rb'))
42
+ expect(parsed).to be_nil
43
+ end
44
+
39
45
  describe 'title/artist/album attributes' do
40
46
  let(:parsed) { subject.call(File.open(fpath, 'rb')) }
41
47
 
@@ -13,7 +13,7 @@ describe FormatParser::OggParser do
13
13
  end
14
14
 
15
15
  it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do
16
- parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/with_garbage_at_the_end.ogg', 'rb'))
16
+ parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/invalid_with_garbage_at_the_end.ogg', 'rb'))
17
17
  expect(parse_result).to be_nil
18
18
  end
19
19
 
@@ -46,17 +46,17 @@ describe FormatParser::PDFParser do
46
46
 
47
47
  describe 'broken PDF files should not parse' do
48
48
  it 'PDF with missing version header' do
49
- parsed_pdf = parse_pdf 'not_a.pdf'
49
+ parsed_pdf = parse_pdf 'invalid_not_a.pdf'
50
50
  expect(parsed_pdf).to be_nil
51
51
  end
52
52
 
53
53
  it 'PDF 2.0 with offset start' do
54
- parsed_pdf = parse_pdf 'PDF 2.0 with offset start.pdf'
54
+ parsed_pdf = parse_pdf 'invalid PDF 2.0 with offset start.pdf'
55
55
  expect(parsed_pdf).to be_nil
56
56
  end
57
57
 
58
58
  it 'exceeds the PDF read limit' do
59
- parsed_pdf = parse_pdf 'exceed_PDF_read_limit.pdf'
59
+ parsed_pdf = parse_pdf 'invalid_exceed_PDF_read_limit.pdf'
60
60
  expect(parsed_pdf).to be_nil
61
61
  end
62
62
  end
@@ -48,7 +48,7 @@ describe FormatParser::WAVParser do
48
48
 
49
49
  it "cannot parse file with audio format different from 1 and no 'fact' chunk" do
50
50
  expect {
51
- subject.call(File.open(__dir__ + '/../fixtures/WAV/d_6_Channel_ID.wav', 'rb'))
51
+ subject.call(File.open(__dir__ + '/../fixtures/WAV/invalid_d_6_Channel_ID.wav', 'rb'))
52
52
  }.to raise_error(FormatParser::IOUtils::InvalidRead)
53
53
  end
54
54
  end
@@ -7,7 +7,7 @@ describe FormatParser::WebpParser do
7
7
  end
8
8
 
9
9
  it 'does not parse files with an unrecognised variant' do
10
- result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
10
+ result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-unrecognised-variant.webp', 'rb'))
11
11
  expect(result).to be_nil
12
12
  end
13
13
 
@@ -104,6 +104,43 @@ describe 'Fetching data from HTTP remotes' do
104
104
  expect(file_information.format).to eq(:png)
105
105
  end
106
106
 
107
+ describe 'correctly parses WAV files without falling back to another filetype' do
108
+ ['c_8kmp316.wav', 'c_SCAM_MIC_SOL001_RUN001.wav'].each do |filename|
109
+ it "parses WAV file #{filename}" do
110
+ remote_url = 'http://localhost:9399/WAV/' + filename
111
+ file_information = FormatParser.parse_http(remote_url)
112
+ expect(file_information).not_to be_nil
113
+ expect(file_information.format).to eq(:wav)
114
+ end
115
+ end
116
+ end
117
+
118
+ describe "correctly parses files over HTTP without filename hint" do
119
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
120
+ file_name = File.basename(fixture_path)
121
+ next if file_name.include? "invalid"
122
+
123
+ file_type_dir = fixture_path.delete_prefix(fixtures_dir).delete_suffix(file_name)
124
+ file_type_dir.delete_prefix!('/').delete_suffix!('/')
125
+ next if file_type_dir.empty?
126
+
127
+ # skipping this one because it's a special case
128
+ next if file_name == "arch_many_entries.zip"
129
+
130
+ it "parses #{file_type_dir} file: #{file_name}" do
131
+ url = "http://localhost:9399/#{file_type_dir}/#{file_name}?some_param=test".gsub(' ', '%20')
132
+ result_with_hint = FormatParser.parse_http(url, filename_hint: file_name)
133
+ result_no_hint = FormatParser.parse_http(url)
134
+
135
+ expect(result_with_hint).not_to be_nil
136
+ expect(result_no_hint).not_to be_nil
137
+
138
+ expect(result_no_hint.nature).to eq(result_with_hint.nature)
139
+ expect(result_no_hint.format).to eq(result_with_hint.format)
140
+ end
141
+ end
142
+ end
143
+
107
144
  describe 'when parsing remote fixtures' do
108
145
  Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
109
146
  filename = File.basename(fixture_path)
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.0
4
+ version: 2.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-31 00:00:00.000000000 Z
12
+ date: 2023-08-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -236,6 +236,8 @@ files:
236
236
  - lib/parsers/iso_base_media_file_format/decoder.rb
237
237
  - lib/parsers/iso_base_media_file_format/utils.rb
238
238
  - lib/parsers/jpeg_parser.rb
239
+ - lib/parsers/json_parser.rb
240
+ - lib/parsers/json_parser/validator.rb
239
241
  - lib/parsers/m3u_parser.rb
240
242
  - lib/parsers/mov_parser.rb
241
243
  - lib/parsers/mov_parser/decoder.rb
@@ -260,6 +262,7 @@ files:
260
262
  - lib/remote_io.rb
261
263
  - lib/string.rb
262
264
  - lib/text.rb
265
+ - lib/utf8_reader.rb
263
266
  - lib/video.rb
264
267
  - spec/active_storage/blob_io_spec.rb
265
268
  - spec/active_storage/rails_app_spec.rb
@@ -289,6 +292,8 @@ files:
289
292
  - spec/parsers/iso_base_media_file_format/decoder_spec.rb
290
293
  - spec/parsers/iso_base_media_file_format/utils_spec.rb
291
294
  - spec/parsers/jpeg_parser_spec.rb
295
+ - spec/parsers/json_parser/validator_spec.rb
296
+ - spec/parsers/json_parser_spec.rb
292
297
  - spec/parsers/m3u_parser_spec.rb
293
298
  - spec/parsers/mov_parser_spec.rb
294
299
  - spec/parsers/mp3_parser_spec.rb
@@ -314,7 +319,7 @@ licenses:
314
319
  - MIT (Hippocratic)
315
320
  metadata:
316
321
  allowed_push_host: https://rubygems.org
317
- post_install_message:
322
+ post_install_message:
318
323
  rdoc_options: []
319
324
  require_paths:
320
325
  - lib
@@ -329,8 +334,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
329
334
  - !ruby/object:Gem::Version
330
335
  version: '0'
331
336
  requirements: []
332
- rubygems_version: 3.3.7
333
- signing_key:
337
+ rubygems_version: 3.1.6
338
+ signing_key:
334
339
  specification_version: 4
335
340
  summary: A library for efficient parsing of file metadata
336
341
  test_files: []