format_parser 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ce8a7c3fd258ccf2abcee6ea8731c0337e7255dfb189ccfb5fdfc02e9dd9b36
4
- data.tar.gz: a6ff6ea6e771f2636e30cfee70b3c22584cb85e7020a97483f66e2c482048444
3
+ metadata.gz: f7b8b37e26143e2ea941db88183475e70c90ab658adcdad362b32b457a7d19ac
4
+ data.tar.gz: 50758e107065e1a2ab4fbca7775359d928cd1a62942277a94fcbabfefa1bfe10
5
5
  SHA512:
6
- metadata.gz: e27bd51913d3a3b3d061ec27379acc4a15fa19ee2e16cccdbf5a0aec4daf8fe26cdfc0062115119616384ded92c645e4a88a6d0244b47b33c5fc79903ccd3906
7
- data.tar.gz: 024756cdb460347f36cd1aa247b04f5468982b4c502cf049e8c8abd5cb1020f960006f552c7659770c6149ff6ed3a61def2beb44160236f35fe35533579f6c31
6
+ metadata.gz: d34bcd7b0162fe6f911bdd8c3b626dd9ce35b98139bca6ec1b54e653bc7e50453af734c74134745a46e91fbc3528d88f1663fabdbd35e06ae908a41f8e81dcd7
7
+ data.tar.gz: 2ebbc65f373a3e34a2e8300d40bb239df6d2003dab54b9c461a8cc75f651800d93273aa73bfb4f1a14f195e65f348c218b90b4c1bae860c8269e024ebb2e890c
data/README.md CHANGED
@@ -26,6 +26,7 @@ and [dimensions,](https://github.com/sstephenson/dimensions) borrowing from them
26
26
  * HEIC
27
27
  * HEIF
28
28
  * JPEG
29
+ * JSON
29
30
  * M3U
30
31
  * M4A
31
32
  * M4B
@@ -1,3 +1,3 @@
1
1
  module FormatParser
2
- VERSION = '2.6.0'
2
+ VERSION = '2.7.0'
3
3
  end
data/lib/format_parser.rb CHANGED
@@ -17,6 +17,7 @@ module FormatParser
17
17
  require_relative 'read_limits_config'
18
18
  require_relative 'remote_io'
19
19
  require_relative 'io_constraint'
20
+ require_relative 'utf8_reader'
20
21
  require_relative 'care'
21
22
  require_relative 'active_storage/blob_analyzer'
22
23
  require_relative 'text'
@@ -0,0 +1,319 @@
1
module FormatParser
  class JSONParser
    ##
    # Checks whether the content of an IO looks like a JSON document.
    #
    # The validation DOES NOT assemble an object with the contents of the JSON file in memory.
    # Instead, it implements a simple state machine that digests the content one character at
    # a time while keeping track of the hierarchy of nodes (objects, arrays, strings and
    # literals) in the document.
    #
    # Although this is based on the IETF standard (https://www.rfc-editor.org/rfc/rfc8259),
    # it cuts a few corners for the sake of simplicity:
    # - Numbers, "true", "false" and "null" are not validated individually. They are all
    #   handled as a Literal: a short sequence of word characters, "+", "-" and ".".
    # - Only objects and arrays are accepted as root nodes.
    # - A comma right before a closing "}" or "]" is tolerated.
    # This makes the implementation simpler while being good enough to identify JSON files.
    #
    # There is also a cap: only the first MAX_SAMPLE_SIZE characters are inspected. If that
    # sample is JSON-compliant, the file is assumed to be a JSON file.
    class Validator
      # Raised whenever the inspected content cannot be a JSON document.
      class JSONParserError < StandardError
      end

      MAX_SAMPLE_SIZE = 1024 # number of characters inspected before giving up
      MAX_LITERAL_SIZE = 30 # much larger than necessary.
      ESCAPE_CHAR = "\\"
      WHITESPACE_CHARS = [" ", "\t", "\n", "\r"].freeze
      ENDING_VALUE_CHARS = [",", "]", "}"].freeze
      LITERALS_CHAR_TEMPLATE = /\w|[+\-.]/ # any word character, "+", "-" and "."

      # @param io [#read] source of the bytes to validate
      def initialize(io)
        @io = io
        @current_node = nil # :object, :array, :string, :literal
        @parent_nodes = []
        @current_state = :awaiting_root_node
        @escape_next = false
        @current_literal_size = 0
        @pos = 0
        @all_parsers = {}
        @execution_stats = { array: 0, object: 0, literal: 0, string: 0 }

        setup_transitions
      end

      # Runs the state machine over the content of the IO.
      #
      # @return [Boolean] true when the whole document was read and is well formed;
      #   false when the sampling limit was reached while the content still looked valid
      # @raise [JSONParserError] when an unexpected character is found, the content is not
      #   valid UTF-8, the content ends before the root node is closed, or no root node
      #   shows up within the sample
      def validate
        char_reader = FormatParser::UTF8Reader.new(@io)

        while (c = char_reader.read_char)
          @pos += 1
          parse_char(c)

          # Halt validation if the sampling limit is reached.
          next if @pos < MAX_SAMPLE_SIZE
          raise JSONParserError, "Invalid JSON file" if @current_state == :awaiting_root_node
          return false
        end

        # Raising error in case the EOF is reached earlier than expected
        raise JSONParserError, "Incomplete JSON file" if @current_state != :closed
        true
      rescue FormatParser::UTF8Reader::UTF8CharReaderError
        raise JSONParserError, "Invalid UTF-8 character"
      end

      # @param node_type [Symbol] one of :array, :object, :literal or :string
      # @return [Integer, nil] how many nodes of that type were opened so far
      #   (nil for an unknown node type)
      def stats(node_type)
        @execution_stats[node_type]
      end

      private

      # Registers, for every state, the handlers that are tried in order for each incoming
      # character. A handler returns a truthy value when it accepts the character.
      def setup_transitions
        when_its(:awaiting_root_node) do |c|
          read_whitespace(c) || start_object(c) || start_array(c)
        end

        when_its(:awaiting_object_attribute_key) do |c|
          read_whitespace(c) || start_attribute_key(c) || close_object(c)
        end

        when_its(:reading_object_attribute_key) do |c|
          close_attribute_key(c) || read_valid_string_char(c)
        end

        when_its(:awaiting_object_colon_separator) do |c|
          read_whitespace(c) || read_colon(c)
        end

        when_its(:awaiting_object_attribute_value) do |c|
          read_whitespace(c) ||
            start_object(c) ||
            start_array(c) ||
            start_string(c) ||
            start_literal(c)
        end

        when_its(:awaiting_array_value) do |c|
          read_whitespace(c) ||
            start_object(c) ||
            start_array(c) ||
            start_string(c) ||
            start_literal(c) ||
            close_array(c)
        end

        when_its(:reading_string) do |c|
          close_string(c) || read_valid_string_char(c)
        end

        when_its(:awaiting_next_or_close) do |c|
          read_whitespace(c) || read_comma_separator(c) || close_object(c) || close_array(c)
        end

        # Literals have no closing delimiter: the character that ends one must also be
        # consumed as whitespace, a separator or the closing of the parent node.
        when_its(:reading_literal) do |c|
          read_valid_literal_char(c) ||
            (close_literal(c) &&
              (read_whitespace(c) || read_comma_separator(c) || close_array(c) || close_object(c)))
        end

        when_its(:closed) do |c|
          read_whitespace(c)
        end
      end

      def when_its(state, &handler)
        @all_parsers[state] = handler
      end

      def parse_char(c)
        accepted = @all_parsers.fetch(@current_state).call(c)
        reject_char(c) unless accepted
      end

      def read_whitespace(c)
        whitespace?(c)
      end

      def read_colon(c)
        return false unless c == ":"

        @current_state = :awaiting_object_attribute_value
        true
      end

      def read_valid_string_char(c)
        # Whatever follows the escape character is accepted as is.
        if @escape_next
          @escape_next = false
          return true
        end

        if c == ESCAPE_CHAR
          @escape_next = true
          return true
        end

        !control_char?(c) && c != "\""
      end

      def read_valid_literal_char(c)
        return false unless valid_literal_char?(c)

        @current_literal_size += 1
        # Enforced while reading (not only when the literal closes) so an endless run of
        # literal characters cannot reach the sampling limit and pass as JSON.
        raise JSONParserError, "Literal to large at #{@pos}" if @current_literal_size > MAX_LITERAL_SIZE
        true
      end

      def read_comma_separator(c)
        return false unless c == ","

        @current_state = :awaiting_object_attribute_key if @current_node == :object
        @current_state = :awaiting_array_value if @current_node == :array
        true
      end

      # Object: {"k1":"val", "k2":[1,2,3], "k4": undefined, "k5": {"l1": 6}}
      def start_object(c)
        return false unless c == "{"

        begin_node(:object)
        @current_state = :awaiting_object_attribute_key
        true
      end

      def close_object(c)
        return false unless @current_node == :object && c == "}"

        end_node
        # end_node already moved to :closed when the root node was the one being closed.
        @current_state = :awaiting_next_or_close unless @current_node.nil?
        true
      end

      # Array: [1, "two", true, undefined, {}, []]
      def start_array(c)
        return false unless c == "["

        begin_node(:array)
        @current_state = :awaiting_array_value
        true
      end

      def close_array(c)
        return false unless @current_node == :array && c == "]"

        end_node
        # end_node already moved to :closed when the root node was the one being closed.
        @current_state = :awaiting_next_or_close unless @current_node.nil?
        true
      end

      def start_attribute_key(c)
        return false unless c == "\""

        begin_node(:string)
        @current_state = :reading_object_attribute_key
        true
      end

      def close_attribute_key(c)
        return false if @escape_next
        return false unless c == "\""

        end_node
        @current_state = :awaiting_object_colon_separator
        true
      end

      # Strings: "Foo"
      def start_string(c)
        return false unless c == "\""

        begin_node(:string)
        @current_state = :reading_string
        true
      end

      def close_string(c)
        return false if @escape_next
        return false unless c == "\""

        end_node
        @current_state = :awaiting_next_or_close
        true
      end

      # literals: null, undefined, true, false, NaN, infinity, -123.456e10 -123,456e10
      def start_literal(c)
        return false unless valid_literal_char?(c)

        begin_node(:literal)
        @current_state = :reading_literal
        @current_literal_size = 1
        true
      end

      def close_literal(c)
        return false unless whitespace?(c) || ENDING_VALUE_CHARS.include?(c)

        end_node
        @current_state = :awaiting_next_or_close
        true
      end

      # Marks the creation of a node (object, array, string or literal)
      def begin_node(node_type)
        # Accounting for the new node
        @execution_stats[node_type] += 1

        # Managing the node execution stack
        @parent_nodes.push(@current_node)
        @current_node = node_type
      end

      # Marks the closure of a node (object, array, string or literal)
      def end_node
        @current_node = @parent_nodes.pop
        @current_state = :closed if @current_node.nil?
      end

      def reject_char(char)
        raise JSONParserError, "Unexpected char #{char} in position #{@pos}"
      end

      def whitespace?(c)
        WHITESPACE_CHARS.include?(c)
      end

      def control_char?(c)
        # control characters: (U+0000 through U+001F)
        c.ord <= 31
      end

      def valid_literal_char?(c)
        LITERALS_CHAR_TEMPLATE.match?(c)
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
# Identifies JSON documents and reports them as text with the JSON MIME type.
class FormatParser::JSONParser
  include FormatParser::IOUtils
  require_relative 'json_parser/validator'

  JSON_MIME_TYPE = 'application/json'

  # Tells whether the file name suggests a JSON file.
  #
  # @param filename [String, nil] name of the file being parsed
  # @return [Integer, nil] truthy when the name ends in ".json" (case insensitive)
  def likely_match?(filename)
    # \z anchors at the very end of the name; `$` would also match before a newline.
    filename =~ /\.json\z/i
  end

  # Validates the content of the IO with the Validator and describes it when it passes.
  #
  # @param io the IO to inspect; it gets wrapped in a FormatParser::IOConstraint
  # @return [FormatParser::Text, nil] the description of the file, or nil when the
  #   validator raised a JSONParserError
  def call(io)
    io = FormatParser::IOConstraint.new(io)
    validator = Validator.new(io)

    # The return value is deliberately ignored: a sample that was cut short by the
    # validator's limit is accepted just like a fully read document.
    validator.validate

    FormatParser::Text.new(
      format: :json,
      content_type: JSON_MIME_TYPE,
    )
  rescue Validator::JSONParserError
    nil
  end
  FormatParser.register_parser new, natures: :text, formats: :json
end
@@ -0,0 +1,68 @@
1
module FormatParser
  ##
  # Reads individual characters from an IO using the UTF-8 encoding.
  # This deals with two main concerns:
  # - Variable byte length of characters
  # - Reducing the number of read operations by loading bytes in chunks
  class UTF8Reader
    READ_CHUNK_SIZE = 128 # bytes requested from the IO per read operation

    # Raised when the bytes read do not form a valid UTF-8 character.
    class UTF8CharReaderError < StandardError
    end

    # @param io [#read] source of the bytes; only `read(length)` is called on it
    def initialize(io)
      @io = io
      @chunk = ""
      @index = 0 # byte offset of the next unread byte within @chunk
      @eof = false
    end

    # Reads the next character.
    #
    # @return [String, nil] a single UTF-8 character, or nil when there is nothing left
    # @raise [UTF8CharReaderError] when the next bytes are not a valid UTF-8 character,
    #   including a multi-byte character cut short by the end of the input
    def read_char
      first_byte = read_byte
      return if first_byte.nil?

      bytes = [first_byte]
      (assess_char_length(first_byte) - 1).times { bytes << read_byte }
      # A nil byte means the input ended in the middle of a multi-byte character.
      raise UTF8CharReaderError, "Invalid UTF-8 character" if bytes.include?(nil)

      char = bytes.pack('C*').force_encoding('UTF-8')
      raise UTF8CharReaderError, "Invalid UTF-8 character" unless char.valid_encoding?

      char
    end

    private

    # @return [Integer, nil] the next byte, or nil when the input is exhausted
    def read_byte
      refill_chunk if @index >= @chunk.bytesize
      byte = @chunk.getbyte(@index)
      @index += 1 unless byte.nil?
      byte
    end

    # Replaces the exhausted chunk with the next one. Sizes are measured in bytes so the
    # bookkeeping stays right even if the IO hands back strings tagged as UTF-8.
    def refill_chunk
      # A short read already signalled the end of the input; do not hit the IO again.
      return if @eof

      @chunk = @io.read(READ_CHUNK_SIZE) || ""
      @index = 0
      @eof = @chunk.bytesize < READ_CHUNK_SIZE
    end

    def assess_char_length(first_byte)
      # 0_______ (1 byte)
      # 110_____ (2 bytes) 192
      # 1110____ (3 bytes) 224
      # 11110___ (4 bytes) 240
      # Any other leading byte is read on its own and fails the encoding check.
      case first_byte
      when 240.. then 4
      when 224..239 then 3
      when 192..223 then 2
      else 1
      end
    end
  end
end
@@ -0,0 +1,321 @@
1
require 'spec_helper'

describe FormatParser::JSONParser::Validator do
  # Fixture files opened by #load_file during the current example.
  let(:opened_files) { [] }

  # The validator reads lazily, so files can only be closed once the example is done.
  after { opened_files.each(&:close) }

  # Builds a validator over a file from the JSON fixtures directory.
  def load_file(file_name)
    file = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
    opened_files << file
    described_class.new(file)
  end

  # Builds a validator over an in-memory UTF-8 document.
  def load_string(content)
    described_class.new(StringIO.new(content.encode(Encoding::UTF_8)))
  end

  # Asserts that validating the given document raises a JSONParserError.
  def expect_rejected(content)
    expect { load_string(content).validate }.to raise_error(described_class::JSONParserError)
  end

  describe 'When reading root nodes' do
    it "identifies objects as root nodes" do
      v = load_string '{"key": "value"}'

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(2)
    end

    it "identifies arrays as root nodes" do
      v = load_string '["e1", "e2"]'

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(1)
      expect(v.stats(:string)).to eq(2)
    end

    it "rejects strings as root nodes" do
      expect_rejected '"this is a string"'
    end

    it "rejects literals as root nodes" do
      expect_rejected 'true'
    end
  end

  describe 'When reading objects' do
    it "recognizes empty objects" do
      v = load_string '{}'

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(0)
    end

    it "recognizes objects with a single attribute" do
      v = load_string '{"key": "value"}'

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(2)
    end

    it "recognizes objects with attributes of different types" do
      v = load_string '{"k1": "value", "k2": -123.456, "k3": null}'

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(4)
      expect(v.stats(:literal)).to eq(2)
    end

    it "recognizes condensed objects (no whitespaces)" do
      v = load_string '{"a":"b","c":"d"}'

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(4)
    end

    it "recognizes formatted objects" do
      v = load_string(<<~JSON)
        {
          "a":"b",
          "c":"d"
        }
      JSON

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(4)
    end

    # NOTE: the comma after the nested "a3" object is intentional; the validator
    # tolerates a comma right before a closing brace.
    it "recognizes objects with nested objects and arrays" do
      v = load_string(<<~JSON)
        {
          "a": {
            "a1": "-",
            "a2": "-",
            "a3": {
              "a3.1": "-"
            },
          },
          "c": [1, null]
        }
      JSON

      expect(v.validate).to be true
      expect(v.stats(:object)).to eq(3)
      expect(v.stats(:array)).to eq(1)
      expect(v.stats(:string)).to eq(9)
      expect(v.stats(:literal)).to eq(2)
    end

    it "rejects objects without double-quoted attribute names" do
      expect_rejected '{a:"b",c:"d"}'
    end

    it "rejects objects without comma separators" do
      expect_rejected(<<~JSON)
        {
          "a":"b"
          "c":"d"
        }
      JSON
    end
  end

  describe 'When reading arrays' do
    it "recognizes empty arrays" do
      v = load_string '[]'

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(1)
      expect(v.stats(:string)).to eq(0)
    end

    it "recognizes arrays with a single element" do
      v = load_string '[{}]'

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(1)
      expect(v.stats(:object)).to eq(1)
    end

    it "recognizes arrays with elements of different types" do
      v = load_string '[{"k1": "value"}, [], "a string", null, -123.456]'

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(2)
      expect(v.stats(:object)).to eq(1)
      expect(v.stats(:string)).to eq(3)
      expect(v.stats(:literal)).to eq(2)
    end

    it "recognizes condensed arrays (no whitespaces)" do
      v = load_string '["a",2,null,false]'

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(1)
      expect(v.stats(:string)).to eq(1)
      expect(v.stats(:literal)).to eq(3)
    end

    it "recognizes formatted arrays" do
      v = load_string(<<~JSON)
        [
          {
            "a":"b"
          },
          {
            "c":"d"
          }
        ]
      JSON

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(1)
      expect(v.stats(:object)).to eq(2)
      expect(v.stats(:string)).to eq(4)
    end

    it "recognizes arrays with nested objects and arrays" do
      v = load_string(<<~JSON)
        [{
          "a": {
            "a1": "-",
            "a2": "-",
            "a3": {
              "a3.1": "-"
            },
          },
          "c": [1, null]
        },
        [{ "a": "b" }, { "c":"d" }]
        ]
      JSON

      expect(v.validate).to be true
      expect(v.stats(:array)).to eq(3)
      expect(v.stats(:object)).to eq(5)
      expect(v.stats(:string)).to eq(13)
      expect(v.stats(:literal)).to eq(2)
    end

    it "rejects arrays without comma separators" do
      expect_rejected(<<~JSON)
        [
          "abc"
          "def"
        ]
      JSON
    end
  end

  describe 'When reading strings' do
    it "recognizes regular strings" do
      v = load_string '["abc", "def", "ghi"]'

      expect(v.validate).to be true
      expect(v.stats(:string)).to eq(3)
    end

    it "recognizes strings containing escaped characters" do
      v = load_string '["ab\"c", "6\\2=3"]'

      expect(v.validate).to be true
      expect(v.stats(:string)).to eq(2)
    end

    it "recognizes strings containing UTF8 characters" do
      v = load_string '["abc😃🐶👀", "😃2🐶3👀"]'

      expect(v.validate).to be true
      expect(v.stats(:string)).to eq(2)
    end

    it "recognizes long strings containing UTF8 characters" do
      # 1, 2, 3 and 4 byte characters (20 bytes per repetition). 38 repetitions span
      # several read chunks of the UTF-8 reader while staying below the sampling limit.
      v = load_string '["' + ('aØڃಚ😁🐶👀' * 38) + '"]'

      expect(v.validate).to be true
      expect(v.stats(:string)).to eq(1)
    end
  end

  describe 'When reading literals' do
    it "recognizes numbers" do
      v = load_string '[1, -2.4, 1.0E+2]'

      expect(v.validate).to be true
      expect(v.stats(:literal)).to eq(3)
    end

    it "recognizes boolean values" do
      v = load_string '[true, false]'

      expect(v.validate).to be true
      expect(v.stats(:literal)).to eq(2)
    end

    it "recognizes 'true', 'false' and 'null'" do
      v = load_string '[true, false, null]'

      expect(v.validate).to be true
      expect(v.stats(:literal)).to eq(3)
    end
  end

  describe 'When reading invalid JSON content' do
    it "rejects truncated JSON content" do
      expect_rejected(<<~JSON)
        [{
          "a": ["abc","def"],
          "b": 4
      JSON
    end
  end

  describe 'When reading large JSON files' do
    it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON" do
      v = load_file 'long_file_valid.json'

      expect(v.validate).to be false
    end

    it "Returns 'false' without throwing errors for long non-formatted JSON files" do
      v = load_file 'long_file_valid_non_formatted.json'

      expect(v.validate).to be false
    end

    it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON even if there's an issue later" do
      v = load_file 'long_file_malformed.json'

      expect(v.validate).to be false
    end
  end
end
@@ -0,0 +1,118 @@
1
require 'spec_helper'

describe FormatParser::JSONParser do
  # Upper bound of read operations an example may perform on a fixture.
  # Kept in a `let` because a constant assigned inside a `describe` block is
  # defined on Object and leaks into every other spec file.
  let(:max_reads) { 100 }

  # Fixture files opened by #load_file during the current example.
  let(:opened_files) { [] }

  after { opened_files.each(&:close) }

  def fixture_path(file_name)
    Pathname.new(fixtures_dir).join('JSON').join(file_name)
  end

  # Opens a JSON fixture wrapped in a ReadLimiter so reads and bytes can be inspected.
  def load_file(file_name)
    file = File.open(fixture_path(file_name), 'rb')
    opened_files << file
    FormatParser::ReadLimiter.new(file, max_reads: max_reads)
  end

  def file_size(file_name)
    File.size(fixture_path(file_name))
  end

  # Asserts that the parser described the file as JSON text.
  def expect_json(parsed)
    expect(parsed).not_to be_nil
    expect(parsed.nature).to eq(:text)
    expect(parsed.format).to eq(:json)
    expect(parsed.content_type).to eq('application/json')
  end

  describe 'When reading valid JSON files' do
    it "identifies JSON files with objects as root nodes" do
      io = load_file 'object.json'

      expect_json subject.call(io)
    end

    it "identifies JSON files carrying arrays as root nodes" do
      io = load_file 'array.json'

      expect_json subject.call(io)
    end

    it "identifies formatted JSON files" do
      io = load_file 'formatted_object_utf8.json'

      expect_json subject.call(io)
    end

    it "identifies files wrapped in whitespace characters" do
      io = load_file 'whitespaces.json'

      expect_json subject.call(io)
    end

    it "identifies files with nested objects and arrays" do
      io = load_file 'nested_objects.json'

      expect_json subject.call(io)
    end

    it "reads the whole content of small files before accepting them" do
      file_name = 'nested_objects.json'
      io = load_file file_name
      expected_size = file_size(file_name)

      expect_json subject.call(io)
      expect(io.bytes).to be >= expected_size
    end

    it "accepts long files before reading the whole content" do
      file_name = 'long_array_numbers.json'
      io = load_file file_name
      expected_size = file_size(file_name)

      expect_json subject.call(io)
      expect(io.bytes).to be < expected_size
    end
  end

  describe 'When reading invalid JSON files' do
    it "rejects files with corrupted JSON data" do
      io = load_file 'malformed.json'

      expect(subject.call(io)).to be_nil
    end

    it "rejects invalid files early without reading the whole content" do
      io = load_file 'lorem_ipsum.json'

      expect(subject.call(io)).to be_nil
      expect(io.reads).to eq(1)
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: format_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.0
4
+ version: 2.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Noah Berman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-31 00:00:00.000000000 Z
12
+ date: 2023-06-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: exifr
@@ -236,6 +236,8 @@ files:
236
236
  - lib/parsers/iso_base_media_file_format/decoder.rb
237
237
  - lib/parsers/iso_base_media_file_format/utils.rb
238
238
  - lib/parsers/jpeg_parser.rb
239
+ - lib/parsers/json_parser.rb
240
+ - lib/parsers/json_parser/validator.rb
239
241
  - lib/parsers/m3u_parser.rb
240
242
  - lib/parsers/mov_parser.rb
241
243
  - lib/parsers/mov_parser/decoder.rb
@@ -260,6 +262,7 @@ files:
260
262
  - lib/remote_io.rb
261
263
  - lib/string.rb
262
264
  - lib/text.rb
265
+ - lib/utf8_reader.rb
263
266
  - lib/video.rb
264
267
  - spec/active_storage/blob_io_spec.rb
265
268
  - spec/active_storage/rails_app_spec.rb
@@ -289,6 +292,8 @@ files:
289
292
  - spec/parsers/iso_base_media_file_format/decoder_spec.rb
290
293
  - spec/parsers/iso_base_media_file_format/utils_spec.rb
291
294
  - spec/parsers/jpeg_parser_spec.rb
295
+ - spec/parsers/json_parser/validator_spec.rb
296
+ - spec/parsers/json_parser_spec.rb
292
297
  - spec/parsers/m3u_parser_spec.rb
293
298
  - spec/parsers/mov_parser_spec.rb
294
299
  - spec/parsers/mp3_parser_spec.rb