format_parser 2.5.0 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be6d2146ac8ada9870883998a9d4e93a92b8bb572668c424e28be798b27cc21c
4
- data.tar.gz: 5ea4e991625c8b7e1d1414e970efbc9d6fbc4827c4bb9e14ad2140fd0cbd396f
3
+ metadata.gz: f7b8b37e26143e2ea941db88183475e70c90ab658adcdad362b32b457a7d19ac
4
+ data.tar.gz: 50758e107065e1a2ab4fbca7775359d928cd1a62942277a94fcbabfefa1bfe10
5
5
  SHA512:
6
- metadata.gz: '079c5c591cfd548ff4df342882048a0ad315c30256fd4dff9333c2944986cec7949ff85472658ef3a5156e4f1553964368257b838e1130664346f486ffde1623'
7
- data.tar.gz: a877334ed908a65e03f91dfd83a4b0dc5ff5c569942335038c6b37a6727add95796dc023a7cdac1965ff8a3adfe0b131aefac3d3b30a2c33c4f565e3a60dcbc9
6
+ metadata.gz: d34bcd7b0162fe6f911bdd8c3b626dd9ce35b98139bca6ec1b54e653bc7e50453af734c74134745a46e91fbc3528d88f1663fabdbd35e06ae908a41f8e81dcd7
7
+ data.tar.gz: 2ebbc65f373a3e34a2e8300d40bb239df6d2003dab54b9c461a8cc75f651800d93273aa73bfb4f1a14f195e65f348c218b90b4c1bae860c8269e024ebb2e890c
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ ## 2.6.0
2
+ * Prevent the default loading of thumbnails on TIFF-based formats to improve I/O.
3
+
1
4
  ## 2.5.0
2
5
  * Add `avc1` and `xavc` as brand codes in the mp4 format parser to allow more file types to be parsed correctly.
3
6
 
data/README.md CHANGED
@@ -26,6 +26,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
26
26
  * HEIC
27
27
  * HEIF
28
28
  * JPEG
29
+ * JSON
29
30
  * M3U
30
31
  * M4A
31
32
  * M4B
@@ -32,7 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
33
33
  spec.require_paths = ['lib']
34
34
 
35
- spec.add_dependency 'exifr', '>= 1.3.8'
35
+ spec.add_dependency 'exifr', '>= 1.4.0'
36
36
  spec.add_dependency 'id3tag', '>= 0.14.2'
37
37
  spec.add_dependency 'matrix'
38
38
  spec.add_dependency 'measurometer'
@@ -1,3 +1,3 @@
1
1
module FormatParser
  # Version of the format_parser gem (post-change value of this hunk).
  # Frozen so the shared constant string cannot be mutated in place.
  VERSION = '2.7.0'.freeze
end
data/lib/format_parser.rb CHANGED
@@ -17,6 +17,7 @@ module FormatParser
17
17
  require_relative 'read_limits_config'
18
18
  require_relative 'remote_io'
19
19
  require_relative 'io_constraint'
20
+ require_relative 'utf8_reader'
20
21
  require_relative 'care'
21
22
  require_relative 'active_storage/blob_analyzer'
22
23
  require_relative 'text'
@@ -175,7 +175,7 @@ module FormatParser::EXIFParser
175
175
  def exif_from_tiff_io(constrained_io, should_include_sub_ifds = false)
176
176
  Measurometer.instrument('format_parser.exif_parser.exif_from_tiff_io') do
177
177
  extended_io = IOExt.new(constrained_io)
178
- exif_raw_data = EXIFR::TIFF.new(extended_io)
178
+ exif_raw_data = EXIFR::TIFF.new(extended_io, load_thumbnails: false)
179
179
 
180
180
  return unless exif_raw_data
181
181
 
@@ -0,0 +1,319 @@
1
##
# This class checks whether the content of an IO looks like a valid JSON document.
# The validation process DOES NOT assemble an object with the contents of the JSON file in memory.
# Instead, it implements a simple state machine that digests the content one character at a time
# while tracking the hierarchy of nodes (objects, arrays, strings, literals) in the document.
#
# Although this is based on the IETF standard (https://www.rfc-editor.org/rfc/rfc8259),
# it does cut a few corners for the sake of simplicity:
# - Numbers, "true", "false" and "null" are not validated individually. They are all handled
#   as a generic "literal": a short run of word characters, "+", "-" and ".".
# - A comma may be directly followed by the closing "]" or "}" (trailing commas are tolerated).
# - Only objects and arrays are accepted as root nodes.
#
# There is also a cap. Large files are not read all the way through. Instead, if the first
# MAX_SAMPLE_SIZE characters are JSON-compliant, it is assumed that the file is a JSON file.
module FormatParser
  class JSONParser
    class Validator
      # Raised whenever the content cannot be a JSON document.
      class JSONParserError < StandardError
      end

      MAX_SAMPLE_SIZE = 1024 # number of characters inspected before giving up on reading further
      MAX_LITERAL_SIZE = 30 # much larger than necessary.
      ESCAPE_CHAR = "\\".freeze
      WHITESPACE_CHARS = [" ", "\t", "\n", "\r"].freeze
      ENDING_VALUE_CHARS = [",", "]", "}"].freeze
      LITERALS_CHAR_TEMPLATE = /\w|[+\-.]/ # any ASCII alphanumeric, "_", "+", "-" and "."

      # For every state, the handlers that are offered the current character, in order.
      # The first handler returning a truthy value accepts the character (and may change
      # the state); when none accepts it the character is rejected.
      TRANSITIONS = {
        awaiting_root_node: %i[read_whitespace start_object start_array],
        awaiting_object_attribute_key: %i[read_whitespace start_attribute_key close_object],
        reading_object_attribute_key: %i[close_attribute_key read_valid_string_char],
        awaiting_object_colon_separator: %i[read_whitespace read_colon],
        awaiting_object_attribute_value: %i[read_whitespace start_object start_array start_string start_literal],
        awaiting_array_value: %i[read_whitespace start_object start_array start_string start_literal close_array],
        reading_string: %i[close_string read_valid_string_char],
        awaiting_next_or_close: %i[read_whitespace read_comma_separator close_object close_array],
        reading_literal: %i[read_valid_literal_char terminate_literal],
        closed: %i[read_whitespace]
      }.freeze

      # @param io [#read] source of the bytes to validate
      def initialize(io)
        @io = io
        @current_node = nil # :object, :array, :string, :literal
        @parent_nodes = []
        @current_state = :awaiting_root_node
        @escape_next = false
        @current_literal_size = 0
        @pos = 0
        @execution_stats = { array: 0, object: 0, literal: 0, string: 0 }
      end

      # Runs the validation.
      #
      # @return [Boolean] true when the whole document was read and is well-formed,
      #   false when the sampling limit was reached while the content was still compliant.
      # @raise [JSONParserError] on unexpected characters, invalid UTF-8, oversized literals,
      #   or when the content ends before the root node is closed.
      def validate
        char_reader = FormatParser::UTF8Reader.new(@io)

        while (c = char_reader.read_char)
          @pos += 1
          parse_char(c)
          next if @pos < MAX_SAMPLE_SIZE

          # Sampling limit reached: a sample made only of whitespace is not JSON.
          raise JSONParserError, "Invalid JSON file" if @current_state == :awaiting_root_node
          return false
        end

        # Raising error in case the EOF is reached earlier than expected
        raise JSONParserError, "Incomplete JSON file" if @current_state != :closed
        true
      rescue FormatParser::UTF8Reader::UTF8CharReaderError
        raise JSONParserError, "Invalid UTF-8 character"
      end

      # @param node_type [Symbol] :array, :object, :literal or :string
      # @return [Integer, nil] how many nodes of that type were opened so far
      def stats(node_type)
        @execution_stats[node_type]
      end

      private

      # Offers the character to the handlers of the current state.
      def parse_char(c)
        accepted = TRANSITIONS.fetch(@current_state).any? { |handler| send(handler, c) }
        reject_char(c) unless accepted
      end

      def read_whitespace(c)
        whitespace?(c)
      end

      def read_colon(c)
        return false unless c == ":"

        @current_state = :awaiting_object_attribute_value
        true
      end

      # Accepts any character allowed inside a string. The character following a
      # backslash is accepted unconditionally; the escape sequence itself is not validated.
      def read_valid_string_char(c)
        if @escape_next
          @escape_next = false
          return true
        end

        if c == ESCAPE_CHAR
          @escape_next = true
          return true
        end

        !control_char?(c) && c != "\""
      end

      # Accepts one more character of the literal being read. The size limit is enforced
      # as the literal grows, so an endless run of literal characters is rejected
      # instead of silently reaching the sampling limit.
      def read_valid_literal_char(c)
        return false unless valid_literal_char?(c)

        @current_literal_size += 1
        ensure_literal_within_limit
        true
      end

      def read_comma_separator(c)
        return false unless c == ","

        @current_state = :awaiting_object_attribute_key if @current_node == :object
        @current_state = :awaiting_array_value if @current_node == :array
        true
      end

      # Object: {"k1":"val", "k2":[1,2,3], "k4": undefined, "k5": {"l1": 6}}
      def start_object(c)
        return false unless c == "{"

        begin_node(:object)
        @current_state = :awaiting_object_attribute_key
        true
      end

      def close_object(c)
        return false unless @current_node == :object && c == "}"

        end_node
        @current_state = :awaiting_next_or_close unless @current_node.nil?
        true
      end

      # Array: [1, "two", true, undefined, {}, []]
      def start_array(c)
        return false unless c == "["

        begin_node(:array)
        @current_state = :awaiting_array_value
        true
      end

      def close_array(c)
        return false unless @current_node == :array && c == "]"

        end_node
        @current_state = :awaiting_next_or_close unless @current_node.nil?
        true
      end

      def start_attribute_key(c)
        return false unless c == "\""

        begin_node(:string)
        @current_state = :reading_object_attribute_key
        true
      end

      def close_attribute_key(c)
        # An escaped quote belongs to the key; it does not terminate it.
        return false if @escape_next || c != "\""

        end_node
        @current_state = :awaiting_object_colon_separator
        true
      end

      # Strings: "Foo"
      def start_string(c)
        return false unless c == "\""

        begin_node(:string)
        @current_state = :reading_string
        true
      end

      def close_string(c)
        # An escaped quote belongs to the string; it does not terminate it.
        return false if @escape_next || c != "\""

        end_node
        @current_state = :awaiting_next_or_close
        true
      end

      # literals: null, undefined, true, false, NaN, infinity, -123.456e10 -123,456e10
      def start_literal(c)
        return false unless valid_literal_char?(c)

        begin_node(:literal)
        @current_state = :reading_literal
        @current_literal_size = 1
        true
      end

      # Closes the literal when the character is whitespace or ends a value (",", "]", "}").
      def close_literal(c)
        ensure_literal_within_limit
        return false unless whitespace?(c) || ENDING_VALUE_CHARS.include?(c)

        end_node
        @current_state = :awaiting_next_or_close
        true
      end

      # A literal has no closing delimiter: the character that ends it must also be
      # consumed as whitespace, a comma or the closing of the parent node.
      def terminate_literal(c)
        close_literal(c) &&
          (read_whitespace(c) || read_comma_separator(c) || close_array(c) || close_object(c))
      end

      def ensure_literal_within_limit
        raise JSONParserError, "Literal too large at #{@pos}" if @current_literal_size > MAX_LITERAL_SIZE
      end

      # Marks the creation of a node (object, array, string or literal)
      def begin_node(node_type)
        # Accounting for the new node
        @execution_stats[node_type] = @execution_stats.fetch(node_type, 0) + 1

        # Managing the node execution stack
        @parent_nodes.push(@current_node)
        @current_node = node_type
      end

      # Marks the closure of a node (object, array, string or literal).
      # Closing the root node moves the machine to the :closed state.
      def end_node
        @current_node = @parent_nodes.pop
        @current_state = :closed if @current_node.nil?
      end

      def reject_char(char)
        raise JSONParserError, "Unexpected char #{char} in position #{@pos}"
      end

      def whitespace?(c)
        WHITESPACE_CHARS.include?(c)
      end

      def control_char?(c)
        # control characters: (U+0000 through U+001F)
        c.ord <= 31
      end

      def valid_literal_char?(c)
        LITERALS_CHAR_TEMPLATE.match?(c)
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
##
# Parser that identifies JSON documents. It does not extract any data from the
# document: it only runs the Validator over the content and reports a match.
class FormatParser::JSONParser
  include FormatParser::IOUtils
  require_relative 'json_parser/validator'

  JSON_MIME_TYPE = 'application/json'
  # \z anchors at the very end of the name; "$" would also match before a trailing newline.
  JSON_FILENAME_PATTERN = /\.json\z/i

  # @param filename [String, nil] file name hint
  # @return [Boolean] whether the name ends with ".json" (case-insensitive);
  #   false for nil
  def likely_match?(filename)
    JSON_FILENAME_PATTERN.match?(filename.to_s)
  end

  # @param io [#read] content to inspect
  # @return [FormatParser::Text, nil] a result tagged as :json when the validator
  #   does not raise, nil when the validator rejects the content
  def call(io)
    io = FormatParser::IOConstraint.new(io)

    # The boolean returned by #validate is intentionally ignored: both a fully
    # validated document and a compliant sample of a large one count as a match.
    Validator.new(io).validate

    FormatParser::Text.new(
      format: :json,
      content_type: JSON_MIME_TYPE,
    )
  rescue Validator::JSONParserError
    nil
  end

  FormatParser.register_parser new, natures: :text, formats: :json
end
@@ -0,0 +1,68 @@
1
##
# This class reads individual characters from an IO-like object using UTF-8 encoding.
# It deals with two main concerns:
# - Variable byte length of characters
# - Reducing the number of read operations by loading bytes in chunks
module FormatParser
  class UTF8Reader
    READ_CHUNK_SIZE = 128

    # Raised when the bytes read do not form a valid UTF-8 character.
    class UTF8CharReaderError < StandardError
    end

    # @param io [#read] object responding to `read(length)`; nil or an empty
    #   string returned by it is treated as the end of the data
    def initialize(io)
      @io = io
      @chunk = ''
      @index = 0 # byte offset of the next unread byte within @chunk
      @eof = false
    end

    # Reads the next character.
    #
    # @return [String, nil] a single UTF-8 encoded character, or nil once the data is exhausted
    # @raise [UTF8CharReaderError] when the bytes are not valid UTF-8 or the data
    #   ends in the middle of a multi-byte character
    def read_char
      first_byte = read_byte
      return if first_byte.nil?

      bytes = [first_byte]
      (assess_char_length(first_byte) - 1).times do
        next_byte = read_byte
        # The data ended in the middle of a multi-byte character.
        raise UTF8CharReaderError, "Invalid UTF-8 character" if next_byte.nil?
        bytes << next_byte
      end

      char = bytes.pack('C*').force_encoding(Encoding::UTF_8)
      raise UTF8CharReaderError, "Invalid UTF-8 character" unless char.valid_encoding?

      char
    end

    private

    # Returns the next byte as an Integer, or nil when there is no more data.
    def read_byte
      manage_data_chunk
      byte = @chunk.getbyte(@index)
      @index += 1 unless byte.nil?
      byte
    end

    # Loads the next chunk once the current one has been consumed.
    # Sizes are measured in bytes, so this works whatever encoding the IO tags its strings with.
    def manage_data_chunk
      return if @eof || @index < @chunk.bytesize

      @chunk = @io.read(READ_CHUNK_SIZE) || ''
      @index = 0
      # Once the source is exhausted it is not read again.
      @eof = @chunk.empty?
    end

    # Number of bytes announced by the leading byte of a character.
    # Bytes that cannot start a character are reported as length 1 (or 4) and
    # end up rejected by the encoding check in #read_char.
    def assess_char_length(first_byte)
      # 0_______ (1 byte)
      # 110_____ (2 bytes) 192
      # 1110____ (3 bytes) 224
      # 11110___ (4 bytes) 240
      case first_byte
      when 240.. then 4
      when 224..239 then 3
      when 192..223 then 2
      else 1
      end
    end
  end
end
@@ -0,0 +1,321 @@
1
+ require 'spec_helper'
2
+
3
describe FormatParser::JSONParser::Validator do
  # Fixture files opened by an example; closed again once the example finishes.
  let(:opened_files) { [] }

  after { opened_files.each(&:close) }

  # Builds a validator over a fixture file from the JSON fixtures directory.
  def load_file(file_name)
    io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
    opened_files << io
    described_class.new(io)
  end

  # Builds a validator over an in-memory string.
  def load_string(content)
    io = StringIO.new(content.encode(Encoding::UTF_8))
    described_class.new(io)
  end

  # Expects the content to be read to the end and accepted, with the given
  # number of nodes per node type.
  def expect_valid(content, expected_stats = {})
    validator = load_string(content)

    expect(validator.validate).to be true
    expected_stats.each do |node_type, count|
      expect(validator.stats(node_type)).to eq(count)
    end
  end

  def expect_rejected(content)
    expect { load_string(content).validate }.to raise_error(described_class::JSONParserError)
  end

  describe 'When reading root nodes' do
    it "identifies objects as root nodes" do
      expect_valid '{"key": "value"}', object: 1, string: 2
    end

    it "identifies arrays as root nodes" do
      expect_valid '["e1", "e2"]', array: 1, string: 2
    end

    it "rejects strings as root nodes" do
      expect_rejected '"this is a string"'
    end

    it "rejects literals as root nodes" do
      expect_rejected 'true'
    end
  end

  describe 'When reading objects' do
    it "recognizes empty objects" do
      expect_valid '{}', object: 1, string: 0
    end

    it "recognizes objects with a single attribute" do
      expect_valid '{"key": "value"}', object: 1, string: 2
    end

    it "recognizes objects with attributes of different types" do
      expect_valid '{"k1": "value", "k2": -123.456, "k3": null}', object: 1, string: 4, literal: 2
    end

    it "recognizes condensed objects (no whitespaces)" do
      expect_valid '{"a":"b","c":"d"}', object: 1, string: 4
    end

    it "recognizes formatted objects" do
      expect_valid <<~'JSON', object: 1, string: 4
        {
          "a":"b",
          "c":"d"
        }
      JSON
    end

    it "recognizes objects with nested objects and arrays" do
      # The trailing comma after "a3" is kept on purpose: the validator tolerates it.
      expect_valid <<~'JSON', object: 3, array: 1, string: 9, literal: 2
        {
          "a": {
            "a1": "-",
            "a2": "-",
            "a3": {
              "a3.1": "-"
            },
          },
          "c": [1, null]
        }
      JSON
    end

    it "rejects objects without double-quoted attribute names" do
      expect_rejected '{a:"b",c:"d"}'
    end

    it "rejects objects without comma separators" do
      expect_rejected <<~'JSON'
        {
          "a":"b"
          "c":"d"
        }
      JSON
    end
  end

  describe 'When reading arrays' do
    it "recognizes empty arrays" do
      expect_valid '[]', array: 1, string: 0
    end

    it "recognizes arrays with a single element" do
      expect_valid '[{}]', array: 1, object: 1
    end

    it "recognizes arrays with elements of different types" do
      expect_valid '[{"k1": "value"}, [], "a string", null, -123.456]',
                   array: 2, object: 1, string: 3, literal: 2
    end

    it "recognizes condensed arrays (no whitespaces)" do
      expect_valid '["a",2,null,false]', array: 1, string: 1, literal: 3
    end

    it "recognizes formatted arrays" do
      expect_valid <<~'JSON', array: 1, object: 2, string: 4
        [
          {
            "a":"b"
          },
          {
            "c":"d"
          }
        ]
      JSON
    end

    it "recognizes arrays with nested objects and arrays" do
      # The trailing comma after "a3" is kept on purpose: the validator tolerates it.
      expect_valid <<~'JSON', array: 3, object: 5, string: 13, literal: 2
        [{
          "a": {
            "a1": "-",
            "a2": "-",
            "a3": {
              "a3.1": "-"
            },
          },
          "c": [1, null]
        },
        [{ "a": "b" }, { "c":"d" }]
        ]
      JSON
    end

    it "rejects arrays without comma separators" do
      expect_rejected <<~'JSON'
        [
          "abc"
          "def"
        ]
      JSON
    end
  end

  describe 'When reading strings' do
    it "recognizes regular strings" do
      expect_valid '["abc", "def", "ghi"]', string: 3
    end

    it "recognizes strings containing escaped characters" do
      expect_valid '["ab\"c", "6\\2=3"]', string: 2
    end

    it "recognizes strings containing UTF8 characters" do
      expect_valid '["abc😃🐶👀", "😃2🐶3👀"]', string: 2
    end

    it "recognizes long strings containing UTF8 characters" do
      # 38 repetitions of a 20-byte sequence mixing 1- to 4-byte characters: long enough
      # to be read in several chunks, short enough to stay below the sampling limit.
      expect_valid '["' + 'aØڃಚ😁🐶👀' * 38 + '"]', string: 1
    end
  end

  describe 'When reading literals' do
    it "recognizes numbers" do
      expect_valid '[1, -2.4, 1.0E+2]', literal: 3
    end

    it "recognizes boolean values" do
      expect_valid '[true, false]', literal: 2
    end

    it "recognizes 'true', 'false' and 'null'" do
      expect_valid '[true, false, null]', literal: 3
    end
  end

  describe 'When reading invalid JSON content' do
    it "rejects truncated JSON content" do
      expect_rejected <<~'JSON'
        [{
          "a": ["abc","def"],
          "b": 4
      JSON
    end
  end

  describe 'When reading large JSON files' do
    it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON" do
      expect(load_file('long_file_valid.json').validate).to be false
    end

    it "Returns 'false' without throwing errors for long non-formatted JSON files" do
      expect(load_file('long_file_valid_non_formatted.json').validate).to be false
    end

    it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON even if there's an issue later" do
      expect(load_file('long_file_malformed.json').validate).to be false
    end
  end
end
@@ -0,0 +1,118 @@
1
+ require 'spec_helper'
2
+
3
describe FormatParser::JSONParser do
  # Upper bound of read operations granted to the parser in every example.
  # Kept as a `let` so no constant leaks out of this describe block.
  let(:max_reads) { 100 }

  # Fixture files opened by an example; closed again once the example finishes.
  let(:opened_files) { [] }

  after { opened_files.each(&:close) }

  def fixture_path(file_name)
    Pathname.new(fixtures_dir).join('JSON').join(file_name)
  end

  # Opens a JSON fixture wrapped in a ReadLimiter, which the examples use to
  # check how much of the file was read.
  def load_file(file_name)
    io = File.open(fixture_path(file_name), 'rb')
    opened_files << io
    FormatParser::ReadLimiter.new(io, max_reads: max_reads)
  end

  def file_size(file_name)
    File.size(fixture_path(file_name))
  end

  # Expectations shared by every example that accepts a file as JSON.
  def expect_json_result(parsed)
    expect(parsed).not_to be_nil
    expect(parsed.nature).to eq(:text)
    expect(parsed.format).to eq(:json)
    expect(parsed.content_type).to eq('application/json')
  end

  describe 'When reading valid JSON files' do
    {
      'object.json' => "identifies JSON files with objects as root nodes",
      'array.json' => "identifies JSON files carrying arrays as root nodes",
      'formatted_object_utf8.json' => "identifies formatted JSON files",
      'whitespaces.json' => "identifies files wrapped in whitespace characters",
      'nested_objects.json' => "identifies files with nested objects and arrays"
    }.each do |file_name, description|
      it description do
        expect_json_result subject.call(load_file(file_name))
      end
    end

    it "reads the whole content of small files before accepting them" do
      file_name = 'nested_objects.json'
      io = load_file file_name

      expect_json_result subject.call(io)
      expect(io.bytes).to be >= file_size(file_name)
    end

    it "accepts long files before reading the whole content" do
      file_name = 'long_array_numbers.json'
      io = load_file file_name

      expect_json_result subject.call(io)
      expect(io.bytes).to be < file_size(file_name)
    end
  end

  describe 'When reading invalid JSON files' do
    it "rejects files with corrupted JSON data" do
      expect(subject.call(load_file('malformed.json'))).to be_nil
    end

    it "rejects invalid files early without reading the whole content" do
      io = load_file 'lorem_ipsum.json'

      expect(subject.call(io)).to be_nil
      expect(io.reads).to eq(1)
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.0
4
+ version: 2.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-12 00:00:00.000000000 Z
12
+ date: 2023-06-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -17,14 +17,14 @@ dependencies:
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: 1.3.8
20
+ version: 1.4.0
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: 1.3.8
27
+ version: 1.4.0
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: id3tag
30
30
  requirement: !ruby/object:Gem::Requirement
@@ -236,6 +236,8 @@ files:
236
236
  - lib/parsers/iso_base_media_file_format/decoder.rb
237
237
  - lib/parsers/iso_base_media_file_format/utils.rb
238
238
  - lib/parsers/jpeg_parser.rb
239
+ - lib/parsers/json_parser.rb
240
+ - lib/parsers/json_parser/validator.rb
239
241
  - lib/parsers/m3u_parser.rb
240
242
  - lib/parsers/mov_parser.rb
241
243
  - lib/parsers/mov_parser/decoder.rb
@@ -260,6 +262,7 @@ files:
260
262
  - lib/remote_io.rb
261
263
  - lib/string.rb
262
264
  - lib/text.rb
265
+ - lib/utf8_reader.rb
263
266
  - lib/video.rb
264
267
  - spec/active_storage/blob_io_spec.rb
265
268
  - spec/active_storage/rails_app_spec.rb
@@ -289,6 +292,8 @@ files:
289
292
  - spec/parsers/iso_base_media_file_format/decoder_spec.rb
290
293
  - spec/parsers/iso_base_media_file_format/utils_spec.rb
291
294
  - spec/parsers/jpeg_parser_spec.rb
295
+ - spec/parsers/json_parser/validator_spec.rb
296
+ - spec/parsers/json_parser_spec.rb
292
297
  - spec/parsers/m3u_parser_spec.rb
293
298
  - spec/parsers/mov_parser_spec.rb
294
299
  - spec/parsers/mp3_parser_spec.rb
@@ -329,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
329
334
  - !ruby/object:Gem::Version
330
335
  version: '0'
331
336
  requirements: []
332
- rubygems_version: 3.4.1
337
+ rubygems_version: 3.3.7
333
338
  signing_key:
334
339
  specification_version: 4
335
340
  summary: A library for efficient parsing of file metadata