format_parser 2.6.0 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ce8a7c3fd258ccf2abcee6ea8731c0337e7255dfb189ccfb5fdfc02e9dd9b36
4
- data.tar.gz: a6ff6ea6e771f2636e30cfee70b3c22584cb85e7020a97483f66e2c482048444
3
+ metadata.gz: 5d2679a365f7c735d2b8c962765c4783b6336bf23461ffde1705ab141d250591
4
+ data.tar.gz: e1e4b4caa2956cbf1653d39498a9db84589cd0c9c979c6d84e9f3b3027427274
5
5
  SHA512:
6
- metadata.gz: e27bd51913d3a3b3d061ec27379acc4a15fa19ee2e16cccdbf5a0aec4daf8fe26cdfc0062115119616384ded92c645e4a88a6d0244b47b33c5fc79903ccd3906
7
- data.tar.gz: 024756cdb460347f36cd1aa247b04f5468982b4c502cf049e8c8abd5cb1020f960006f552c7659770c6149ff6ed3a61def2beb44160236f35fe35533579f6c31
6
+ metadata.gz: 52e775ae2a4ced22d2879fff48108974bfe4087333b70d74c5c0910229aa7298826664bbd474dd9b4221777637cc51bf969735a830df976f58f0ff7302dd69c5
7
+ data.tar.gz: 31b8f95ff8fbcba01d60fe2377699a9abebc61a28d74afc05aa8456d6660f786ee268c2a621c0dc83490e3f9b04dc5e1a60319ae6f4c54503b3dfab9d39d520e
data/README.md CHANGED
@@ -26,6 +26,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
26
26
  * HEIC
27
27
  * HEIF
28
28
  * JPEG
29
+ * JSON
29
30
  * M3U
30
31
  * M4A
31
32
  * M4B
@@ -216,7 +217,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
216
217
  - NEF examples are downloaded from http://www.rawsamples.ch/ and are Creative Commons licensed.
217
218
 
218
219
  ### OGG
219
- - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `with_garbage_at_the_end.ogg` have been generated by the project contributors
220
+ - `hi.ogg`, `vorbis.ogg`, `with_confusing_magic_string.ogg`, `invalid_with_garbage_at_the_end.ogg` have been generated by the project contributors
220
221
 
221
222
  ### PDF
222
223
  - PDF 2.0 files downloaded from the [PDF Association public Github repository](https://github.com/pdf-association/pdf20examples). These files are licensed under the Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.
@@ -235,7 +236,7 @@ Unless specified otherwise in this section the fixture files are MIT licensed an
235
236
  ### WAV
236
237
  - c_11k16bitpcm.wav and c_8kmp316.wav are from [Wikipedia WAV](https://en.wikipedia.org/wiki/WAV#Comparison_of_coding_schemes), retrieved January 7, 2018
237
238
  - c_39064__alienbomb__atmo-truck.wav is from [freesound](https://freesound.org/people/alienbomb/sounds/39064/) and is CC0 licensed
238
- - c_M1F1-Alaw-AFsp.wav and d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
239
+ - c_M1F1-Alaw-AFsp.wav and invalid_d_6_Channel_ID.wav are from a [McGill Engineering site](http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples.html)
239
240
 
240
241
  ### WEBP
241
242
  - With the exception of extended-animation.webp, which was obtained from Wikimedia Commons and is Creative Commons
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '2.6.0'
2
+ VERSION = '2.7.1'
3
3
  end
data/lib/format_parser.rb CHANGED
@@ -17,6 +17,7 @@ module FormatParser
17
17
  require_relative 'read_limits_config'
18
18
  require_relative 'remote_io'
19
19
  require_relative 'io_constraint'
20
+ require_relative 'utf8_reader'
20
21
  require_relative 'care'
21
22
  require_relative 'active_storage/blob_analyzer'
22
23
  require_relative 'text'
@@ -35,6 +36,9 @@ module FormatParser
35
36
  # The value will ensure the parser having it will be applied to the file last.
36
37
  LEAST_PRIORITY = 99
37
38
 
39
+ @registered_natures = []
40
+ @registered_formats = []
41
+
38
42
  # Register a parser object to be used to perform file format detection. Each parser FormatParser
39
43
  # provides out of the box registers itself using this method.
40
44
  #
@@ -67,9 +71,20 @@ module FormatParser
67
71
  end
68
72
  @parser_priorities ||= {}
69
73
  @parser_priorities[callable_parser] = priority
74
+
75
+ @registered_natures |= parser_provided_natures
76
+ @registered_formats |= parser_provided_formats
70
77
  end
71
78
  end
72
79
 
80
+ def self.registered_natures
81
+ @registered_natures
82
+ end
83
+
84
+ def self.registered_formats
85
+ @registered_formats
86
+ end
87
+
73
88
  # Deregister a parser object (makes FormatParser forget this parser existed). Is mostly used in
74
89
  # tests, but can also be used to forcibly disable some formats completely.
75
90
  #
@@ -0,0 +1,319 @@
1
##
# Checks whether the content of an IO looks like a valid JSON document.
#
# The validation DOES NOT assemble an object with the contents of the document in memory.
# Instead, it runs a simple state machine that consumes the input one character at a time
# while tracking the stack of nodes (objects, arrays, strings and literals) currently open.
#
# Although this is based on the IETF standard (https://www.rfc-editor.org/rfc/rfc8259),
# it cuts a few corners for the sake of simplicity:
# - Numbers, "true", "false" and "null" are not validated individually. They are all read as a
#   generic "literal": a short run of word characters, "+", "-" and ".".
# - Only objects and arrays are accepted as root nodes.
# - A comma directly followed by a closing "}" or "]" is tolerated.
#
# There is also a cap: at most MAX_SAMPLE_SIZE characters are examined. If the beginning of the
# input is JSON-compliant up to that point, the input is assumed to be JSON.
class FormatParser::JSONParser::Validator
  # Raised whenever the input cannot be (the beginning of) a JSON document.
  class JSONParserError < StandardError
  end

  MAX_SAMPLE_SIZE = 1024
  MAX_LITERAL_SIZE = 30 # much larger than necessary.
  ESCAPE_CHAR = "\\".freeze
  WHITESPACE_CHARS = [" ", "\t", "\n", "\r"].freeze
  ENDING_VALUE_CHARS = [",", "]", "}"].freeze
  LITERALS_CHAR_TEMPLATE = /\w|[+\-.]/ # any word character, "+", "-" and "."

  # @param io an object that FormatParser::UTF8Reader can read from
  def initialize(io)
    @io = io
    @current_node = nil # :object, :array, :string, :literal
    @parent_nodes = []
    @current_state = :awaiting_root_node
    @escape_next = false
    @current_literal_size = 0
    @pos = 0

    @all_parsers = {}

    @execution_stats = {
      array: 0,
      object: 0,
      literal: 0,
      string: 0
    }

    setup_transitions
  end

  # Consumes the input and validates it character by character.
  #
  # @return [Boolean] true when the whole input was read and forms a complete document;
  #   false when the sampling limit was reached without finding a problem.
  # @raise [JSONParserError] on an unexpected character, an oversized literal, an invalid UTF-8
  #   character, an input that ends before the root node is closed, or when the sampling limit
  #   is reached before any root node was opened.
  def validate
    char_reader = FormatParser::UTF8Reader.new(@io)

    while (c = char_reader.read_char)
      @pos += 1
      parse_char c

      # Halt validation if the sampling limit is reached.
      if @pos >= MAX_SAMPLE_SIZE
        raise JSONParserError, "Invalid JSON file" if @current_state == :awaiting_root_node
        return false
      end
    end

    # Raising error in case the EOF is reached earlier than expected
    raise JSONParserError, "Incomplete JSON file" if @current_state != :closed
    true
  rescue FormatParser::UTF8Reader::UTF8CharReaderError
    raise JSONParserError, "Invalid UTF-8 character"
  end

  # @param node_type [Symbol] :array, :object, :literal or :string
  # @return [Integer, nil] how many nodes of the given type have been opened so far
  def stats(node_type)
    @execution_stats[node_type]
  end

  private

  # Registers, for every state, the ordered list of handlers that may accept a character.
  # The first handler returning a truthy value wins; if none does, the character is rejected.
  def setup_transitions
    when_its :awaiting_root_node, ->(c) do
      read_whitespace(c) ||
        start_object(c) ||
        start_array(c)
    end

    when_its :awaiting_object_attribute_key, ->(c) do
      read_whitespace(c) ||
        start_attribute_key(c) ||
        close_object(c)
    end

    when_its :reading_object_attribute_key, ->(c) do
      close_attribute_key(c) ||
        read_valid_string_char(c)
    end

    when_its :awaiting_object_colon_separator, ->(c) do
      read_whitespace(c) ||
        read_colon(c)
    end

    when_its :awaiting_object_attribute_value, ->(c) do
      read_whitespace(c) ||
        start_object(c) ||
        start_array(c) ||
        start_string(c) ||
        start_literal(c)
    end

    when_its :awaiting_array_value, ->(c) do
      read_whitespace(c) ||
        start_object(c) ||
        start_array(c) ||
        start_string(c) ||
        start_literal(c) ||
        close_array(c)
    end

    when_its :reading_string, ->(c) do
      close_string(c) ||
        read_valid_string_char(c)
    end

    when_its :awaiting_next_or_close, ->(c) do
      read_whitespace(c) ||
        read_comma_separator(c) ||
        close_object(c) ||
        close_array(c)
    end

    # A literal has no closing delimiter: the character that ends it must also be
    # interpreted as whitespace, a separator or the closure of the parent node.
    when_its :reading_literal, ->(c) do
      read_valid_literal_char(c) || (
        close_literal(c) && (
          read_whitespace(c) ||
          read_comma_separator(c) ||
          close_array(c) ||
          close_object(c)))
    end

    when_its :closed, ->(c) do
      read_whitespace(c)
    end
  end

  def when_its(state, act)
    @all_parsers[state] = act
  end

  # Feeds one character to the handler of the current state.
  def parse_char(c)
    next_step = @all_parsers[@current_state]
    accepted = next_step.call(c)
    reject_char(c) unless accepted
  end

  def read_whitespace(c)
    whitespace?(c)
  end

  def read_colon(c)
    return false unless c == ":"

    @current_state = :awaiting_object_attribute_value
    true
  end

  # Accepts any character following an escape char, the escape char itself, and any
  # character that is neither a control character nor a double quote.
  def read_valid_string_char(c)
    if @escape_next
      @escape_next = false
      return true
    end

    if c == ESCAPE_CHAR
      @escape_next = true
      return true
    end

    !control_char?(c) && c != "\""
  end

  def read_valid_literal_char(c)
    return false unless valid_literal_char?(c)

    @current_literal_size += 1
    # Enforce the cap while reading: waiting for the literal to close would let an endless
    # run of literal characters reach the sampling limit and be accepted as JSON.
    ensure_literal_size
    true
  end

  def read_comma_separator(c)
    return false unless c == ","

    @current_state = :awaiting_object_attribute_key if @current_node == :object
    @current_state = :awaiting_array_value if @current_node == :array
    true
  end

  # Object: {"k1":"val", "k2":[1,2,3], "k4": undefined, "k5": {"l1": 6}}
  def start_object(c)
    return false unless c == "{"

    begin_node(:object)
    @current_state = :awaiting_object_attribute_key
    true
  end

  def close_object(c)
    return false unless @current_node == :object && c == "}"

    end_node
    @current_state = :awaiting_next_or_close unless @current_node.nil?
    true
  end

  # Array: [1, "two", true, undefined, {}, []]
  def start_array(c)
    return false unless c == "["

    begin_node(:array)
    @current_state = :awaiting_array_value
    true
  end

  def close_array(c)
    return false unless @current_node == :array && c == "]"

    end_node
    @current_state = :awaiting_next_or_close unless @current_node.nil?
    true
  end

  def start_attribute_key(c)
    return false unless c == "\""

    begin_node(:string)
    @current_state = :reading_object_attribute_key
    true
  end

  def close_attribute_key(c)
    # An escaped double quote belongs to the key; it does not close it.
    return false if @escape_next
    return false unless c == "\""

    end_node
    @current_state = :awaiting_object_colon_separator
    true
  end

  # Strings: "Foo"
  def start_string(c)
    return false unless c == "\""

    begin_node(:string)
    @current_state = :reading_string
    true
  end

  def close_string(c)
    # An escaped double quote belongs to the string; it does not close it.
    return false if @escape_next
    return false unless c == "\""

    end_node
    @current_state = :awaiting_next_or_close
    true
  end

  # literals: null, undefined, true, false, NaN, infinity, -123.456e10 -123,456e10
  def start_literal(c)
    return false unless valid_literal_char?(c)

    begin_node(:literal)
    @current_state = :reading_literal
    @current_literal_size = 1
    true
  end

  def close_literal(c)
    ensure_literal_size

    return false unless whitespace?(c) || ENDING_VALUE_CHARS.include?(c)

    end_node
    @current_state = :awaiting_next_or_close
    true
  end

  # Raises when the literal being read exceeds MAX_LITERAL_SIZE characters.
  def ensure_literal_size
    raise JSONParserError, "Literal too large at #{@pos}" if @current_literal_size > MAX_LITERAL_SIZE
  end

  # Marks the creation of a node (object, array, string or literal)
  def begin_node(node_type)
    # Accounting for the new node
    @execution_stats[node_type] ||= 0
    @execution_stats[node_type] += 1

    # Managing the node execution stack
    @parent_nodes.push(@current_node)
    @current_node = node_type
  end

  # Marks the closure of a node (object, array, string or literal)
  def end_node
    @current_node = @parent_nodes.pop
    # Popping back to "no node" means the root node has just been closed.
    @current_state = :closed if @current_node.nil?
  end

  def reject_char(char)
    raise JSONParserError, "Unexpected char #{char} in position #{@pos}"
  end

  def whitespace?(c)
    WHITESPACE_CHARS.include?(c)
  end

  def control_char?(c)
    # control characters: (U+0000 through U+001F)
    c.ord <= 0x1F
  end

  def valid_literal_char?(c)
    LITERALS_CHAR_TEMPLATE.match?(c)
  end
end
@@ -0,0 +1,25 @@
1
# Parser that detects JSON documents.
#
# The detection is delegated to Validator, which samples the beginning of the input.
# No attribute of the document other than its format and content type is reported.
class FormatParser::JSONParser
  include FormatParser::IOUtils
  require_relative 'json_parser/validator'

  JSON_MIME_TYPE = 'application/json'.freeze

  # Tells whether the given file name suggests JSON content.
  #
  # The extension is anchored with \z rather than $, since $ also matches at the end of
  # any line and would accept names such as "x.json\nsomething".
  #
  # @param filename [String, nil]
  # @return [Integer, nil] a truthy value when the name ends in ".json" (case-insensitive)
  def likely_match?(filename)
    filename =~ /\.json\z/i
  end

  # Validates the input and reports it as JSON when the validator raises no error.
  # Both outcomes of Validator#validate (fully validated, or sampling limit reached) are accepted.
  #
  # @param io the input to be inspected
  # @return [FormatParser::Text, nil] the detection result, or nil when the input is not JSON
  def call(io)
    io = FormatParser::IOConstraint.new(io)
    validator = Validator.new(io)

    validator.validate

    FormatParser::Text.new(
      format: :json,
      content_type: JSON_MIME_TYPE,
    )
  rescue Validator::JSONParserError
    nil
  end
  FormatParser.register_parser new, natures: :text, formats: :json
end
@@ -76,6 +76,11 @@ class FormatParser::MP3Parser
76
76
  io.seek(0)
77
77
  return if TIFF_HEADER_BYTES.include?(safe_read(io, 4))
78
78
 
79
+ # Prevention against parsing WAV files.
80
+ io.seek(0)
81
+ wav_chunk_id, _wav_size, wav_riff_type = safe_read(io, 12).unpack('a4la4')
82
+ return if wav_chunk_id == 'RIFF' || wav_riff_type == 'WAVE'
83
+
79
84
  # Read all the ID3 tags (or at least attempt to)
80
85
  io.seek(0)
81
86
  id3v1 = ID3Extraction.attempt_id3_v1_extraction(io)
@@ -315,5 +320,5 @@ class FormatParser::MP3Parser
315
320
  end
316
321
  end
317
322
 
318
- FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 99
323
+ FormatParser.register_parser new, natures: :audio, formats: :mp3, priority: 101
319
324
  end
@@ -0,0 +1,68 @@
1
##
# Reads individual UTF-8 characters from an IO.
# This deals with two main concerns:
# - Variable byte length of characters
# - Reducing the number of read operations by loading bytes in chunks
class FormatParser::UTF8Reader
  READ_CHUNK_SIZE = 128

  # Raised when the bytes read do not form a valid UTF-8 character.
  class UTF8CharReaderError < StandardError
  end

  # @param io an object responding to read(length), returning a String or nil at EOF
  def initialize(io)
    @io = io
    @chunk = ''
    @index = 0
  end

  # Reads the next character from the IO.
  #
  # @return [String, nil] a UTF-8 encoded one-character string, or nil at the end of the input
  # @raise [UTF8CharReaderError] when the bytes are not valid UTF-8, including a multi-byte
  #   character that is cut short by the end of the input
  def read_char
    first_byte = read_byte
    return if first_byte.nil?

    bytes = [first_byte]
    (assess_char_length(first_byte) - 1).times do
      byte = read_byte
      raise UTF8CharReaderError, "Invalid UTF-8 character" if byte.nil?
      bytes << byte
    end

    # 'C' packs unsigned bytes, which is what String#getbyte returns.
    char = bytes.pack('C*').force_encoding(Encoding::UTF_8)
    raise UTF8CharReaderError, "Invalid UTF-8 character" unless char.valid_encoding?

    char
  end

  private

  # Returns the next byte as an Integer, or nil once the input is exhausted.
  def read_byte
    manage_data_chunk
    # getbyte avoids materializing the whole chunk as an Array for every single byte.
    byte = @chunk.getbyte(@index)
    @index += 1 unless byte.nil?
    byte
  end

  # Loads the next chunk once every byte of the current one has been consumed.
  # The comparison uses bytesize, not length: @index counts bytes, and the two differ
  # when the IO returns a chunk tagged with a multi-byte encoding.
  def manage_data_chunk
    return if @index < @chunk.bytesize
    @chunk = @io.read(READ_CHUNK_SIZE) || ''
    @index = 0
  end

  # Derives the length of a character from its first byte. Bytes that cannot start a
  # character are not rejected here; they fail the encoding check in read_char.
  def assess_char_length(first_byte)
    # 0_______ (1 byte)
    # 110_____ (2 bytes) 192
    # 1110____ (3 bytes) 224
    # 11110___ (4 bytes) 240
    case first_byte
    when 240.. then 4
    when 224..239 then 3
    when 192..223 then 2
    else 1
    end
  end
end
@@ -34,6 +34,26 @@ describe FormatParser do
34
34
  end
35
35
  end
36
36
 
37
+ it "fixtures with 'invalid' in the filename should fail to parse" do
38
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
39
+ file_name = File.basename(fixture_path)
40
+ next unless file_name.include? "invalid"
41
+ File.open(fixture_path, 'rb') do |file|
42
+ FormatParser.parse(file)
43
+ end
44
+ end
45
+ end
46
+
47
+ it "fixtures without 'invalid' in the filename should be parsed successfully" do
48
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
49
+ file_name = File.basename(fixture_path)
50
+ next if file_name.include? "invalid"
51
+ File.open(fixture_path, 'rb') do |file|
52
+ FormatParser.parse(file)
53
+ end
54
+ end
55
+ end
56
+
37
57
  it 'triggers parsers in a certain order that corresponds to the parser priorities' do
38
58
  file_contents = StringIO.new('a' * 4096)
39
59
 
@@ -189,12 +209,20 @@ describe FormatParser do
189
209
  'FormatParser::CR3Parser',
190
210
  'FormatParser::DPXParser',
191
211
  'FormatParser::FLACParser',
192
- 'FormatParser::MP3Parser',
193
212
  'FormatParser::OggParser',
194
213
  'FormatParser::TIFFParser',
195
- 'FormatParser::WAVParser'
214
+ 'FormatParser::WAVParser',
215
+ 'FormatParser::MP3Parser'
196
216
  ])
197
217
  end
218
+
219
+ it 'ensures that MP3 parser is the last one among all' do
220
+ natures = FormatParser.registered_natures
221
+ formats = FormatParser.registered_formats
222
+ prioritised_parsers = FormatParser.parsers_for(natures, formats)
223
+ parser_class_names = prioritised_parsers.map { |parser| parser.class.name }
224
+ expect(parser_class_names.last).to eq 'FormatParser::MP3Parser'
225
+ end
198
226
  end
199
227
 
200
228
  describe '.register_parser and .deregister_parser' do
@@ -55,7 +55,7 @@ describe FormatParser::FLACParser do
55
55
  end
56
56
 
57
57
  it 'raises an error when sample rate is 0' do
58
- fpath = fixtures_dir + 'FLAC/sample_rate_0.flac'
58
+ fpath = fixtures_dir + 'FLAC/invalid_sample_rate_0.flac'
59
59
 
60
60
  expect {
61
61
  subject.call(File.open(fpath, 'rb'))
@@ -0,0 +1,321 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::JSONParser::Validator do
4
+ def load_file(file_name)
5
+ io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
6
+ FormatParser::JSONParser::Validator.new(io)
7
+ end
8
+
9
+ def load_string(content)
10
+ io = StringIO.new(content.encode(Encoding::UTF_8))
11
+ FormatParser::JSONParser::Validator.new(io)
12
+ end
13
+
14
+ describe 'When reading root nodes' do
15
+ it "identifies objects as root nodes" do
16
+ v = load_string '{"key": "value"}'
17
+
18
+ completed = v.validate
19
+
20
+ expect(completed).to be true
21
+ expect(v.stats(:object)).to be 1
22
+ expect(v.stats(:string)).to be 2
23
+ end
24
+
25
+ it "identifies arrays as root nodes" do
26
+ v = load_string '["e1", "e2"]'
27
+
28
+ completed = v.validate
29
+
30
+ expect(completed).to be true
31
+ expect(v.stats(:array)).to be 1
32
+ expect(v.stats(:string)).to be 2
33
+ end
34
+
35
+ it "rejects strings as root nodes" do
36
+ expect do
37
+ v = load_string '"this is a string"'
38
+ v.validate
39
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
40
+ end
41
+
42
+ it "rejects literals as root nodes" do
43
+ expect do
44
+ v = load_string 'true'
45
+ v.validate
46
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
47
+ end
48
+ end
49
+
50
+ describe 'When reading objects' do
51
+ it "recognizes empty objects" do
52
+ v = load_string '{}'
53
+
54
+ completed = v.validate
55
+ expect(completed).to be true
56
+ expect(v.stats(:object)).to be 1
57
+ expect(v.stats(:string)).to be 0
58
+ end
59
+
60
+ it "recognizes objects with a single attribute" do
61
+ v = load_string '{"key": "value"}'
62
+
63
+ completed = v.validate
64
+ expect(completed).to be true
65
+ expect(v.stats(:object)).to be 1
66
+ expect(v.stats(:string)).to be 2
67
+ end
68
+
69
+ it "recognizes objects with attributes of different types" do
70
+ v = load_string '{"k1": "value", "k2": -123.456, "k3": null}'
71
+
72
+ completed = v.validate
73
+ expect(completed).to be true
74
+ expect(v.stats(:object)).to be 1
75
+ expect(v.stats(:string)).to be 4
76
+ expect(v.stats(:literal)).to be 2
77
+ end
78
+
79
+ it "recognizes condensed objects (no whitespaces)" do
80
+ v = load_string '{"a":"b","c":"d"}'
81
+
82
+ completed = v.validate
83
+ expect(completed).to be true
84
+ expect(v.stats(:object)).to be 1
85
+ expect(v.stats(:string)).to be 4
86
+ end
87
+
88
+ it "recognizes formatted objects" do
89
+ v = load_string '{
90
+ "a":"b",
91
+ "c":"d"
92
+ }'
93
+
94
+ completed = v.validate
95
+ expect(completed).to be true
96
+ expect(v.stats(:object)).to be 1
97
+ expect(v.stats(:string)).to be 4
98
+ end
99
+
100
+ it "recognizes objects with nested objects and arrays" do
101
+ v = load_string '{
102
+ "a": {
103
+ "a1": "-",
104
+ "a2": "-",
105
+ "a3": {
106
+ "a3.1": "-"
107
+ },
108
+ },
109
+ "c": [1, null]
110
+ }'
111
+
112
+ completed = v.validate
113
+ expect(completed).to be true
114
+ expect(v.stats(:object)).to be 3
115
+ expect(v.stats(:array)).to be 1
116
+ expect(v.stats(:string)).to be 9
117
+ expect(v.stats(:literal)).to be 2
118
+ end
119
+
120
+ it "rejects objects without double-quoted attribute names" do
121
+ expect do
122
+ v = load_string '{a:"b",c:"d"}'
123
+ v.validate
124
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
125
+ end
126
+
127
+ it "rejects objects without comma separators" do
128
+ expect do
129
+ v = load_string '{
130
+ "a":"b"
131
+ "c":"d"
132
+ }'
133
+ v.validate
134
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
135
+ end
136
+ end
137
+
138
+ describe 'When reading arrays' do
139
+ it "recognizes empty arrays" do
140
+ v = load_string '[]'
141
+
142
+ completed = v.validate
143
+ expect(completed).to be true
144
+ expect(v.stats(:array)).to be 1
145
+ expect(v.stats(:string)).to be 0
146
+ end
147
+
148
+ it "recognizes arrays with a single element" do
149
+ v = load_string '[{}]'
150
+
151
+ completed = v.validate
152
+ expect(completed).to be true
153
+ expect(v.stats(:array)).to be 1
154
+ expect(v.stats(:object)).to be 1
155
+ end
156
+
157
+ it "recognizes arrays with elements of different types" do
158
+ v = load_string '[{"k1": "value"}, [], "a string", null, -123.456]'
159
+
160
+ completed = v.validate
161
+ expect(completed).to be true
162
+ expect(v.stats(:array)).to be 2
163
+ expect(v.stats(:object)).to be 1
164
+ expect(v.stats(:string)).to be 3
165
+ expect(v.stats(:literal)).to be 2
166
+ end
167
+
168
+ it "recognizes condensed arrays (no whitespaces)" do
169
+ v = load_string '["a",2,null,false]'
170
+
171
+ completed = v.validate
172
+ expect(completed).to be true
173
+ expect(v.stats(:array)).to be 1
174
+ expect(v.stats(:string)).to be 1
175
+ expect(v.stats(:literal)).to be 3
176
+ end
177
+
178
+ it "recognizes formatted arrays" do
179
+ v = load_string '[
180
+ {
181
+ "a":"b"
182
+ },
183
+ {
184
+ "c":"d"
185
+ }
186
+ ]'
187
+
188
+ completed = v.validate
189
+ expect(completed).to be true
190
+ expect(v.stats(:array)).to be 1
191
+ expect(v.stats(:object)).to be 2
192
+ expect(v.stats(:string)).to be 4
193
+ end
194
+
195
+ it "recognizes arrays with nested objects and arrays" do
196
+ v = load_string '[{
197
+ "a": {
198
+ "a1": "-",
199
+ "a2": "-",
200
+ "a3": {
201
+ "a3.1": "-"
202
+ },
203
+ },
204
+ "c": [1, null]
205
+ },
206
+ [{ "a": "b" }, { "c":"d" }]
207
+ ]'
208
+
209
+ completed = v.validate
210
+ expect(completed).to be true
211
+ expect(v.stats(:array)).to be 3
212
+ expect(v.stats(:object)).to be 5
213
+ expect(v.stats(:string)).to be 13
214
+ expect(v.stats(:literal)).to be 2
215
+ end
216
+
217
+ it "rejects arrays without comma separators" do
218
+ expect do
219
+ v = load_string '[
220
+ "abc"
221
+ "def"
222
+ ]'
223
+ v.validate
224
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
225
+ end
226
+ end
227
+
228
+ describe 'When reading strings' do
229
+ it "recognizes regular strings" do
230
+ v = load_string '["abc", "def", "ghi"]'
231
+
232
+ completed = v.validate
233
+ expect(completed).to be true
234
+ expect(v.stats(:string)).to be 3
235
+ end
236
+
237
+ it "recognizes strings containing excaped characters" do
238
+ v = load_string '["ab\"c", "6\\2=3"]'
239
+
240
+ completed = v.validate
241
+ expect(completed).to be true
242
+ expect(v.stats(:string)).to be 2
243
+ end
244
+
245
+ it "recognizes strings containing UTF8 characters" do
246
+ v = load_string '["abc😃🐶👀", "😃2🐶3👀"]'
247
+
248
+ completed = v.validate
249
+ expect(completed).to be true
250
+ expect(v.stats(:string)).to be 2
251
+ end
252
+
253
+ it "recognizes long strings containing UTF8 characters" do
254
+ v = load_string '["aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀"]'
255
+
256
+ completed = v.validate
257
+ expect(completed).to be true
258
+ expect(v.stats(:string)).to be 1
259
+ end
260
+ end
261
+
262
+ describe 'When reading literals' do
263
+ it "recognizes numbers" do
264
+ v = load_string '[1, -2.4, 1.0E+2]'
265
+
266
+ completed = v.validate
267
+ expect(completed).to be true
268
+ expect(v.stats(:literal)).to be 3
269
+ end
270
+
271
+ it "recognizes boolean values" do
272
+ v = load_string '[true, false]'
273
+
274
+ completed = v.validate
275
+ expect(completed).to be true
276
+ expect(v.stats(:literal)).to be 2
277
+ end
278
+
279
+ it "recognizes 'true', 'false' and 'null'" do
280
+ v = load_string '[true, false, null]'
281
+
282
+ completed = v.validate
283
+ expect(completed).to be true
284
+ expect(v.stats(:literal)).to be 3
285
+ end
286
+ end
287
+
288
+ describe 'When reading invalid JSON content' do
289
+ it "rejects truncated JSON content" do
290
+ expect do
291
+ v = load_string '[{
292
+ "a": ["abc","def"],
293
+ "b": 4'
294
+ v.validate
295
+ end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
296
+ end
297
+ end
298
+
299
+ describe 'When reading large JSON files' do
300
+ it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON" do
301
+ v = load_file 'long_file_valid.json'
302
+
303
+ completed = v.validate
304
+ expect(completed).to be false
305
+ end
306
+
307
+ it "Returns 'false' without throwing errors when for long non-formatted JSON files" do
308
+ v = load_file 'long_file_valid_non_formatted.json'
309
+
310
+ completed = v.validate
311
+ expect(completed).to be false
312
+ end
313
+
314
+ it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON even if there's an issue later" do
315
+ v = load_file 'long_file_malformed.json'
316
+
317
+ completed = v.validate
318
+ expect(completed).to be false
319
+ end
320
+ end
321
+ end
@@ -0,0 +1,118 @@
1
+ require 'spec_helper'
2
+
3
+ describe FormatParser::JSONParser do
4
+ MAX_READS = 100
5
+
6
+ def load_file(file_name)
7
+ io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
8
+ FormatParser::ReadLimiter.new(io, max_reads: MAX_READS)
9
+ end
10
+
11
+ def file_size(file_name)
12
+ File.size(Pathname.new(fixtures_dir).join('JSON').join(file_name))
13
+ end
14
+
15
+ describe 'When reading objects valid JSON files' do
16
+ it "identifies JSON files with objects as root nodes" do
17
+ io = load_file 'object.json'
18
+
19
+ parsed = subject.call(io)
20
+
21
+ expect(parsed).not_to be_nil
22
+ expect(parsed.nature).to eq(:text)
23
+ expect(parsed.format).to eq(:json)
24
+ expect(parsed.content_type).to eq('application/json')
25
+ end
26
+
27
+ it "identifies JSON files carrying arrays as root nodes" do
28
+ io = load_file 'array.json'
29
+
30
+ parsed = subject.call(io)
31
+
32
+ expect(parsed).not_to be_nil
33
+ expect(parsed.nature).to eq(:text)
34
+ expect(parsed.format).to eq(:json)
35
+ expect(parsed.content_type).to eq('application/json')
36
+ end
37
+
38
+ it "identifies formatted JSON files" do
39
+ io = load_file 'formatted_object_utf8.json'
40
+
41
+ parsed = subject.call(io)
42
+
43
+ expect(parsed).not_to be_nil
44
+ expect(parsed.nature).to eq(:text)
45
+ expect(parsed.format).to eq(:json)
46
+ expect(parsed.content_type).to eq('application/json')
47
+ end
48
+
49
+ it "identifies files wrapped in whitespace characters" do
50
+ io = load_file 'whitespaces.json'
51
+
52
+ parsed = subject.call(io)
53
+
54
+ expect(parsed).not_to be_nil
55
+ expect(parsed.nature).to eq(:text)
56
+ expect(parsed.format).to eq(:json)
57
+ expect(parsed.content_type).to eq('application/json')
58
+ end
59
+
60
+ it "identifies files with nested objects and arrays" do
61
+ io = load_file 'nested_objects.json'
62
+
63
+ parsed = subject.call(io)
64
+
65
+ expect(parsed).not_to be_nil
66
+ expect(parsed.nature).to eq(:text)
67
+ expect(parsed.format).to eq(:json)
68
+ expect(parsed.content_type).to eq('application/json')
69
+ end
70
+
71
+ it "is reads the whole content of small files before accepting them" do
72
+ file_name = 'nested_objects.json'
73
+ io = load_file file_name
74
+ file_size = file_size file_name
75
+
76
+ parsed = subject.call(io)
77
+
78
+ expect(parsed).not_to be_nil
79
+ expect(parsed.nature).to eq(:text)
80
+ expect(parsed.format).to eq(:json)
81
+ expect(parsed.content_type).to eq('application/json')
82
+ expect(io.bytes).to be >= file_size
83
+ end
84
+
85
+ it "is accepts long files before reading the whole content" do
86
+ file_name = 'long_array_numbers.json'
87
+ io = load_file file_name
88
+ file_size = file_size file_name
89
+
90
+ parsed = subject.call(io)
91
+
92
+ expect(parsed).not_to be_nil
93
+ expect(parsed.nature).to eq(:text)
94
+ expect(parsed.format).to eq(:json)
95
+ expect(parsed.content_type).to eq('application/json')
96
+ expect(io.bytes).to be < file_size
97
+ end
98
+ end
99
+
100
+ describe 'When reading objects invalid JSON files' do
101
+ it "rejects files with corrupted JSON data" do
102
+ io = load_file 'invalid_malformed.json'
103
+
104
+ parsed = subject.call(io)
105
+
106
+ expect(parsed).to be_nil
107
+ end
108
+
109
+ it "rejects invalid files early without reading the whole content" do
110
+ io = load_file 'invalid_lorem_ipsum.json'
111
+
112
+ parsed = subject.call(io)
113
+
114
+ expect(parsed).to be_nil
115
+ expect(io.reads).to eq(1)
116
+ end
117
+ end
118
+ end
@@ -11,7 +11,7 @@ describe FormatParser::M3UParser do
11
11
  end
12
12
 
13
13
  describe 'an m3u file with missing header' do
14
- let(:m3u_file) { 'plain_text.m3u' }
14
+ let(:m3u_file) { 'invalid_plain_text.m3u' }
15
15
 
16
16
  it 'does not parse the file successfully' do
17
17
  expect(parsed_m3u).to be_nil
@@ -36,6 +36,12 @@ describe FormatParser::MP3Parser do
36
36
  expect(parsed).to be_nil
37
37
  end
38
38
 
39
+ it 'does not misdetect a WAV' do
40
+ fpath = fixtures_dir + '/WAV/c_SCAM_MIC_SOL001_RUN001.wav'
41
+ parsed = subject.call(File.open(fpath, 'rb'))
42
+ expect(parsed).to be_nil
43
+ end
44
+
39
45
  describe 'title/artist/album attributes' do
40
46
  let(:parsed) { subject.call(File.open(fpath, 'rb')) }
41
47
 
@@ -13,7 +13,7 @@ describe FormatParser::OggParser do
13
13
  end
14
14
 
15
15
  it 'skips a file if it contains more than MAX_POSSIBLE_OGG_PAGE_SIZE bytes of garbage at the end' do
16
- parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/with_garbage_at_the_end.ogg', 'rb'))
16
+ parse_result = subject.call(File.open(__dir__ + '/../fixtures/Ogg/invalid_with_garbage_at_the_end.ogg', 'rb'))
17
17
  expect(parse_result).to be_nil
18
18
  end
19
19
 
@@ -46,17 +46,17 @@ describe FormatParser::PDFParser do
46
46
 
47
47
  describe 'broken PDF files should not parse' do
48
48
  it 'PDF with missing version header' do
49
- parsed_pdf = parse_pdf 'not_a.pdf'
49
+ parsed_pdf = parse_pdf 'invalid_not_a.pdf'
50
50
  expect(parsed_pdf).to be_nil
51
51
  end
52
52
 
53
53
  it 'PDF 2.0 with offset start' do
54
- parsed_pdf = parse_pdf 'PDF 2.0 with offset start.pdf'
54
+ parsed_pdf = parse_pdf 'invalid PDF 2.0 with offset start.pdf'
55
55
  expect(parsed_pdf).to be_nil
56
56
  end
57
57
 
58
58
  it 'exceeds the PDF read limit' do
59
- parsed_pdf = parse_pdf 'exceed_PDF_read_limit.pdf'
59
+ parsed_pdf = parse_pdf 'invalid_exceed_PDF_read_limit.pdf'
60
60
  expect(parsed_pdf).to be_nil
61
61
  end
62
62
  end
@@ -48,7 +48,7 @@ describe FormatParser::WAVParser do
48
48
 
49
49
  it "cannot parse file with audio format different from 1 and no 'fact' chunk" do
50
50
  expect {
51
- subject.call(File.open(__dir__ + '/../fixtures/WAV/d_6_Channel_ID.wav', 'rb'))
51
+ subject.call(File.open(__dir__ + '/../fixtures/WAV/invalid_d_6_Channel_ID.wav', 'rb'))
52
52
  }.to raise_error(FormatParser::IOUtils::InvalidRead)
53
53
  end
54
54
  end
@@ -7,7 +7,7 @@ describe FormatParser::WebpParser do
7
7
  end
8
8
 
9
9
  it 'does not parse files with an unrecognised variant' do
10
- result = subject.call(File.open(fixtures_dir + 'WEBP/unrecognised-variant.webp', 'rb'))
10
+ result = subject.call(File.open(fixtures_dir + 'WEBP/invalid-unrecognised-variant.webp', 'rb'))
11
11
  expect(result).to be_nil
12
12
  end
13
13
 
@@ -104,6 +104,43 @@ describe 'Fetching data from HTTP remotes' do
104
104
  expect(file_information.format).to eq(:png)
105
105
  end
106
106
 
107
+ describe 'correctly parses WAV files without falling back to another filetype' do
108
+ ['c_8kmp316.wav', 'c_SCAM_MIC_SOL001_RUN001.wav'].each do |filename|
109
+ it "parses WAV file #{filename}" do
110
+ remote_url = 'http://localhost:9399/WAV/' + filename
111
+ file_information = FormatParser.parse_http(remote_url)
112
+ expect(file_information).not_to be_nil
113
+ expect(file_information.format).to eq(:wav)
114
+ end
115
+ end
116
+ end
117
+
118
+ describe "correctly parses files over HTTP without filename hint" do
119
+ Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
120
+ file_name = File.basename(fixture_path)
121
+ next if file_name.include? "invalid"
122
+
123
+ file_type_dir = fixture_path.delete_prefix(fixtures_dir).delete_suffix(file_name)
124
+ file_type_dir.delete_prefix!('/').delete_suffix!('/')
125
+ next if file_type_dir.empty?
126
+
127
+ # skipping this one because it's a special case
128
+ next if file_name == "arch_many_entries.zip"
129
+
130
+ it "parses #{file_type_dir} file: #{file_name}" do
131
+ url = "http://localhost:9399/#{file_type_dir}/#{file_name}?some_param=test".gsub(' ', '%20')
132
+ result_with_hint = FormatParser.parse_http(url, filename_hint: file_name)
133
+ result_no_hint = FormatParser.parse_http(url)
134
+
135
+ expect(result_with_hint).not_to be_nil
136
+ expect(result_no_hint).not_to be_nil
137
+
138
+ expect(result_no_hint.nature).to eq(result_with_hint.nature)
139
+ expect(result_no_hint.format).to eq(result_with_hint.format)
140
+ end
141
+ end
142
+ end
143
+
107
144
  describe 'when parsing remote fixtures' do
108
145
  Dir.glob(fixtures_dir + '/**/*.*').sort.each do |fixture_path|
109
146
  filename = File.basename(fixture_path)
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.0
4
+ version: 2.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
8
8
  - Julik Tarkhanov
9
- autorequire:
9
+ autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-31 00:00:00.000000000 Z
12
+ date: 2023-08-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -236,6 +236,8 @@ files:
236
236
  - lib/parsers/iso_base_media_file_format/decoder.rb
237
237
  - lib/parsers/iso_base_media_file_format/utils.rb
238
238
  - lib/parsers/jpeg_parser.rb
239
+ - lib/parsers/json_parser.rb
240
+ - lib/parsers/json_parser/validator.rb
239
241
  - lib/parsers/m3u_parser.rb
240
242
  - lib/parsers/mov_parser.rb
241
243
  - lib/parsers/mov_parser/decoder.rb
@@ -260,6 +262,7 @@ files:
260
262
  - lib/remote_io.rb
261
263
  - lib/string.rb
262
264
  - lib/text.rb
265
+ - lib/utf8_reader.rb
263
266
  - lib/video.rb
264
267
  - spec/active_storage/blob_io_spec.rb
265
268
  - spec/active_storage/rails_app_spec.rb
@@ -289,6 +292,8 @@ files:
289
292
  - spec/parsers/iso_base_media_file_format/decoder_spec.rb
290
293
  - spec/parsers/iso_base_media_file_format/utils_spec.rb
291
294
  - spec/parsers/jpeg_parser_spec.rb
295
+ - spec/parsers/json_parser/validator_spec.rb
296
+ - spec/parsers/json_parser_spec.rb
292
297
  - spec/parsers/m3u_parser_spec.rb
293
298
  - spec/parsers/mov_parser_spec.rb
294
299
  - spec/parsers/mp3_parser_spec.rb
@@ -314,7 +319,7 @@ licenses:
314
319
  - MIT (Hippocratic)
315
320
  metadata:
316
321
  allowed_push_host: https://rubygems.org
317
- post_install_message:
322
+ post_install_message:
318
323
  rdoc_options: []
319
324
  require_paths:
320
325
  - lib
@@ -329,8 +334,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
329
334
  - !ruby/object:Gem::Version
330
335
  version: '0'
331
336
  requirements: []
332
- rubygems_version: 3.3.7
333
- signing_key:
337
+ rubygems_version: 3.1.6
338
+ signing_key:
334
339
  specification_version: 4
335
340
  summary: A library for efficient parsing of file metadata
336
341
  test_files: []