format_parser 2.6.0 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +1 -0
- data/lib/parsers/json_parser/validator.rb +319 -0
- data/lib/parsers/json_parser.rb +25 -0
- data/lib/utf8_reader.rb +68 -0
- data/spec/parsers/json_parser/validator_spec.rb +321 -0
- data/spec/parsers/json_parser_spec.rb +118 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f7b8b37e26143e2ea941db88183475e70c90ab658adcdad362b32b457a7d19ac
|
4
|
+
data.tar.gz: 50758e107065e1a2ab4fbca7775359d928cd1a62942277a94fcbabfefa1bfe10
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d34bcd7b0162fe6f911bdd8c3b626dd9ce35b98139bca6ec1b54e653bc7e50453af734c74134745a46e91fbc3528d88f1663fabdbd35e06ae908a41f8e81dcd7
|
7
|
+
data.tar.gz: 2ebbc65f373a3e34a2e8300d40bb239df6d2003dab54b9c461a8cc75f651800d93273aa73bfb4f1a14f195e65f348c218b90b4c1bae860c8269e024ebb2e890c
|
data/README.md
CHANGED
data/lib/format_parser.rb
CHANGED
@@ -17,6 +17,7 @@ module FormatParser
|
|
17
17
|
require_relative 'read_limits_config'
|
18
18
|
require_relative 'remote_io'
|
19
19
|
require_relative 'io_constraint'
|
20
|
+
require_relative 'utf8_reader'
|
20
21
|
require_relative 'care'
|
21
22
|
require_relative 'active_storage/blob_analyzer'
|
22
23
|
require_relative 'text'
|
@@ -0,0 +1,319 @@
|
|
1
|
+
##
|
2
|
+
# This class checks whether a given file is a valid JSON file.
|
3
|
+
# The validation process DOES NOT assemble an object with the contents of the JSON file in memory.
|
4
|
+
# Instead, it implements a simple state-machine-like parser that digests the contents of the file while traversing
|
5
|
+
# the hierarchy of nodes in the document.
|
6
|
+
#
|
7
|
+
# Although this is based on the IETF standard (https://www.rfc-editor.org/rfc/rfc8259),
|
8
|
+
# it does cut a few corners for the sake of simplicity. For instance, instead of validating
|
9
|
+
# Numbers, "true", "false" and "null" tokens, it supports a type called Literal to hold generic sequences of characters.
|
10
|
+
# This decision makes the implementation simpler while being a good-enough approach to identify JSON files.
|
11
|
+
#
|
12
|
+
# There is also a cap. Large files are not read all the way through. Instead, if the beginning of the file is
|
13
|
+
# JSON-compliant, it is assumed that the file is a JSON file.
|
14
|
+
|
15
|
+
class FormatParser::JSONParser::Validator
  # Raised whenever the sampled content cannot be JSON.
  class JSONParserError < StandardError
  end

  # Number of characters inspected before the validator gives up and reports
  # "looks like JSON so far".
  MAX_SAMPLE_SIZE = 1024
  MAX_LITERAL_SIZE = 30 # much larger than necessary.
  ESCAPE_CHAR = '\\'.freeze
  WHITESPACE_CHARS = [' ', "\t", "\n", "\r"].freeze
  ENDING_VALUE_CHARS = [',', ']', '}'].freeze
  LITERALS_CHAR_TEMPLATE = /\w|[+\-.]/ # any word character, "+", "-" and "."

  # @param io [#read] source handed to FormatParser::UTF8Reader by #validate
  def initialize(io)
    @io = io
    @current_node = nil # :object, :array, :string, :literal
    @parent_nodes = []
    @current_state = :awaiting_root_node
    @escape_next = false
    @current_literal_size = 0
    @pos = 0

    # state => lambda deciding whether a character is acceptable in that state
    @all_parsers = {}

    @execution_stats = {
      array: 0,
      object: 0,
      literal: 0,
      string: 0
    }

    setup_transitions
  end

  # Feeds the content, one character at a time, through the state machine.
  #
  # @return [Boolean] true when the whole document was read and its root node
  #   was closed; false when MAX_SAMPLE_SIZE characters were accepted and
  #   validation stopped early
  # @raise [JSONParserError] on an unexpected character, an over-long literal,
  #   an invalid UTF-8 sequence, or content that ends before the root closes
  def validate
    char_reader = FormatParser::UTF8Reader.new(@io)

    while (c = char_reader.read_char)
      @pos += 1
      parse_char c

      # Halt validation if the sampling limit is reached.
      if @pos >= MAX_SAMPLE_SIZE
        # A sample made only of whitespace never opened a root node.
        raise JSONParserError, 'Invalid JSON file' if @current_state == :awaiting_root_node
        return false
      end
    end

    # Raising error in case the EOF is reached earlier than expected
    raise JSONParserError, 'Incomplete JSON file' if @current_state != :closed
    true
  rescue FormatParser::UTF8Reader::UTF8CharReaderError
    raise JSONParserError, 'Invalid UTF-8 character'
  end

  # @param node_type [Symbol] :array, :object, :literal or :string
  # @return [Integer, nil] how many nodes of that type were opened so far
  def stats(node_type)
    @execution_stats[node_type]
  end

  private

  # Registers, for every state, the ordered list of readers that may consume
  # the next character. The first reader returning a truthy value wins.
  def setup_transitions
    when_its :awaiting_root_node, ->(c) {
      read_whitespace(c) || start_object(c) || start_array(c)
    }

    when_its :awaiting_object_attribute_key, ->(c) {
      read_whitespace(c) || start_attribute_key(c) || close_object(c)
    }

    when_its :reading_object_attribute_key, ->(c) {
      close_attribute_key(c) || read_valid_string_char(c)
    }

    when_its :awaiting_object_colon_separator, ->(c) {
      read_whitespace(c) || read_colon(c)
    }

    when_its :awaiting_object_attribute_value, ->(c) {
      read_whitespace(c) || start_object(c) || start_array(c) || start_string(c) || start_literal(c)
    }

    when_its :awaiting_array_value, ->(c) {
      read_whitespace(c) || start_object(c) || start_array(c) || start_string(c) ||
        start_literal(c) || close_array(c)
    }

    when_its :reading_string, ->(c) {
      close_string(c) || read_valid_string_char(c)
    }

    when_its :awaiting_next_or_close, ->(c) {
      read_whitespace(c) || read_comma_separator(c) || close_object(c) || close_array(c)
    }

    # Literals have no closing delimiter: the character that ends one must
    # also be consumed as whatever it means for the parent node.
    when_its :reading_literal, ->(c) {
      read_valid_literal_char(c) ||
        (close_literal(c) &&
          (read_whitespace(c) || read_comma_separator(c) || close_array(c) || close_object(c)))
    }

    when_its :closed, ->(c) {
      read_whitespace(c)
    }
  end

  def when_its(state, act)
    @all_parsers[state] = act
  end

  # Dispatches the character to the handler of the current state.
  def parse_char(c)
    accepted = @all_parsers.fetch(@current_state).call(c)
    reject_char(c) unless accepted
  end

  def read_whitespace(c)
    whitespace?(c)
  end

  def read_colon(c)
    return false unless c == ':'

    @current_state = :awaiting_object_attribute_value
    true
  end

  # Accepts any character following a backslash, the backslash itself, and
  # every other character that is neither a control character nor a quote.
  def read_valid_string_char(c)
    if @escape_next
      @escape_next = false
      return true
    end

    if c == ESCAPE_CHAR
      @escape_next = true
      return true
    end

    !control_char?(c) && c != '"'
  end

  # Enforces MAX_LITERAL_SIZE while the literal is being read, so that an
  # endless run of literal characters is rejected instead of silently
  # reaching the sampling limit.
  def read_valid_literal_char(c)
    return false unless valid_literal_char?(c)

    @current_literal_size += 1
    raise JSONParserError, "Literal too large at #{@pos}" if @current_literal_size > MAX_LITERAL_SIZE

    true
  end

  def read_comma_separator(c)
    return false unless c == ','

    case @current_node
    when :object then @current_state = :awaiting_object_attribute_key
    when :array then @current_state = :awaiting_array_value
    end
    true
  end

  # Object: {"k1":"val", "k2":[1,2,3], "k4": undefined, "k5": {"l1": 6}}
  def start_object(c)
    return false unless c == '{'

    begin_node(:object)
    @current_state = :awaiting_object_attribute_key
    true
  end

  def close_object(c)
    return false unless @current_node == :object && c == '}'

    end_node
    # end_node already moved to :closed when the root node was just closed.
    @current_state = :awaiting_next_or_close unless @current_node.nil?
    true
  end

  # Array: [1, "two", true, undefined, {}, []]
  def start_array(c)
    return false unless c == '['

    begin_node(:array)
    @current_state = :awaiting_array_value
    true
  end

  def close_array(c)
    return false unless @current_node == :array && c == ']'

    end_node
    # end_node already moved to :closed when the root node was just closed.
    @current_state = :awaiting_next_or_close unless @current_node.nil?
    true
  end

  def start_attribute_key(c)
    return false unless c == '"'

    begin_node(:string)
    @current_state = :reading_object_attribute_key
    true
  end

  def close_attribute_key(c)
    return false if @escape_next # an escaped quote belongs to the key
    return false unless c == '"'

    end_node
    @current_state = :awaiting_object_colon_separator
    true
  end

  # Strings: "Foo"
  def start_string(c)
    return false unless c == '"'

    begin_node(:string)
    @current_state = :reading_string
    true
  end

  def close_string(c)
    return false if @escape_next # an escaped quote belongs to the string
    return false unless c == '"'

    end_node
    @current_state = :awaiting_next_or_close
    true
  end

  # literals: null, undefined, true, false, NaN, infinity, -123.456e10 -123,456e10
  def start_literal(c)
    return false unless valid_literal_char?(c)

    begin_node(:literal)
    @current_state = :reading_literal
    @current_literal_size = 1
    true
  end

  # A literal ends at whitespace or at a character that ends a value.
  # The size cap is enforced earlier, in read_valid_literal_char.
  def close_literal(c)
    return false unless whitespace?(c) || ENDING_VALUE_CHARS.include?(c)

    end_node
    @current_state = :awaiting_next_or_close
    true
  end

  # Marks the creation of a node (object, array, string or literal)
  def begin_node(node_type)
    # Accounting for the new node
    @execution_stats[node_type] = @execution_stats.fetch(node_type, 0) + 1

    # Managing the node execution stack
    @parent_nodes.push(@current_node)
    @current_node = node_type
  end

  # Marks the closure of a node (object, array, string or literal)
  def end_node
    @current_node = @parent_nodes.pop
    @current_state = :closed if @current_node.nil?
  end

  def reject_char(char)
    raise JSONParserError, "Unexpected char #{char} in position #{@pos}"
  end

  def whitespace?(c)
    WHITESPACE_CHARS.include?(c)
  end

  def control_char?(c)
    # control characters: (U+0000 through U+001F)
    c.ord <= 0x1F
  end

  def valid_literal_char?(c)
    LITERALS_CHAR_TEMPLATE.match?(c)
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Parser that reports a file as JSON text when its content passes
# FormatParser::JSONParser::Validator.
class FormatParser::JSONParser
  include FormatParser::IOUtils
  require_relative 'json_parser/validator'

  JSON_MIME_TYPE = 'application/json'

  # @param filename [String] name of the file being examined
  # @return [Integer, nil] truthy when the name ends in ".json" (any case)
  def likely_match?(filename)
    # \z anchors at the real end of the string; `$` would also match right
    # before a newline embedded in the name.
    filename =~ /\.json\z/i
  end

  # @param io the source to inspect; it is wrapped in FormatParser::IOConstraint
  # @return [FormatParser::Text, nil] a :json Text result, or nil when the
  #   validator raises JSONParserError
  def call(io)
    io = FormatParser::IOConstraint.new(io)
    validator = Validator.new(io)

    # The return value is deliberately ignored: both a complete document (true)
    # and a sample cut short at the validator's limit (false) are accepted.
    # Only a raised JSONParserError rejects the file.
    validator.validate

    FormatParser::Text.new(
      format: :json,
      content_type: JSON_MIME_TYPE
    )
  rescue Validator::JSONParserError
    nil
  end

  FormatParser.register_parser new, natures: :text, formats: :json
end
|
data/lib/utf8_reader.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
##
|
2
|
+
# This class reads individual characters from files using UTF-8 encoding
|
3
|
+
# This deals with two main concerns:
|
4
|
+
# - Variable byte length of characters
|
5
|
+
# - Reducing the number of read operations by loading bytes in chunks
|
6
|
+
|
7
|
+
class FormatParser::UTF8Reader
  # Number of bytes requested from the underlying IO per read operation.
  READ_CHUNK_SIZE = 128

  # Raised when the bytes read do not form a valid UTF-8 character.
  class UTF8CharReaderError < StandardError
  end

  # @param io [#read] any object whose read(length) returns a String, or nil
  #   once exhausted
  def initialize(io)
    @io = io
    @chunk = ''
    @index = 0
  end

  # Reads the next character from the stream.
  #
  # @return [String, nil] a single UTF-8 encoded character, or nil when no
  #   bytes are left
  # @raise [UTF8CharReaderError] when the sequence is truncated or is not
  #   valid UTF-8
  def read_char
    first_byte = read_byte
    return if first_byte.nil?

    as_bytes = [first_byte]
    (assess_char_length(first_byte) - 1).times do
      byte = read_byte
      # The stream ended in the middle of a multi-byte sequence.
      raise UTF8CharReaderError, 'Invalid UTF-8 character' if byte.nil?

      as_bytes << byte
    end

    # 'C*' packs unsigned bytes, which is what String#getbyte returns.
    char = as_bytes.pack('C*').force_encoding('UTF-8')
    raise UTF8CharReaderError, 'Invalid UTF-8 character' unless char.valid_encoding?

    char
  end

  private

  # @return [Integer, nil] the next byte (0..255), or nil when the IO is exhausted
  def read_byte
    manage_data_chunk
    # getbyte reads in place; String#bytes would build a whole array per call.
    byte = @chunk.getbyte(@index)
    @index += 1 unless byte.nil?
    byte
  end

  # Loads a fresh chunk once every byte of the current one has been consumed.
  def manage_data_chunk
    # Compare against bytesize, not length: @index counts bytes, and a chunk
    # tagged with a multibyte encoding has fewer characters than bytes.
    return if @index < @chunk.bytesize

    @chunk = @io.read(READ_CHUNK_SIZE) || ''
    @index = 0
  end

  # Derives the length of a character from its leading byte.
  # Bytes that cannot start a character fall into one of these buckets too
  # and are rejected later by the valid_encoding? check in read_char.
  def assess_char_length(first_byte)
    # 0_______ (1 byte)
    # 110_____ (2 bytes) 192
    # 1110____ (3 bytes) 224
    # 11110___ (4 bytes) 240
    case first_byte
    when 240.. then 4
    when 224..239 then 3
    when 192..223 then 2
    else 1
    end
  end
end
|
@@ -0,0 +1,321 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::JSONParser::Validator do
|
4
|
+
def load_file(file_name)
|
5
|
+
io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
|
6
|
+
FormatParser::JSONParser::Validator.new(io)
|
7
|
+
end
|
8
|
+
|
9
|
+
def load_string(content)
|
10
|
+
io = StringIO.new(content.encode(Encoding::UTF_8))
|
11
|
+
FormatParser::JSONParser::Validator.new(io)
|
12
|
+
end
|
13
|
+
|
14
|
+
describe 'When reading root nodes' do
|
15
|
+
it "identifies objects as root nodes" do
|
16
|
+
v = load_string '{"key": "value"}'
|
17
|
+
|
18
|
+
completed = v.validate
|
19
|
+
|
20
|
+
expect(completed).to be true
|
21
|
+
expect(v.stats(:object)).to be 1
|
22
|
+
expect(v.stats(:string)).to be 2
|
23
|
+
end
|
24
|
+
|
25
|
+
it "identifies arrays as root nodes" do
|
26
|
+
v = load_string '["e1", "e2"]'
|
27
|
+
|
28
|
+
completed = v.validate
|
29
|
+
|
30
|
+
expect(completed).to be true
|
31
|
+
expect(v.stats(:array)).to be 1
|
32
|
+
expect(v.stats(:string)).to be 2
|
33
|
+
end
|
34
|
+
|
35
|
+
it "rejects strings as root nodes" do
|
36
|
+
expect do
|
37
|
+
v = load_string '"this is a string"'
|
38
|
+
v.validate
|
39
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
40
|
+
end
|
41
|
+
|
42
|
+
it "rejects literals as root nodes" do
|
43
|
+
expect do
|
44
|
+
v = load_string 'true'
|
45
|
+
v.validate
|
46
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'When reading objects' do
|
51
|
+
it "recognizes empty objects" do
|
52
|
+
v = load_string '{}'
|
53
|
+
|
54
|
+
completed = v.validate
|
55
|
+
expect(completed).to be true
|
56
|
+
expect(v.stats(:object)).to be 1
|
57
|
+
expect(v.stats(:string)).to be 0
|
58
|
+
end
|
59
|
+
|
60
|
+
it "recognizes objects with a single attribute" do
|
61
|
+
v = load_string '{"key": "value"}'
|
62
|
+
|
63
|
+
completed = v.validate
|
64
|
+
expect(completed).to be true
|
65
|
+
expect(v.stats(:object)).to be 1
|
66
|
+
expect(v.stats(:string)).to be 2
|
67
|
+
end
|
68
|
+
|
69
|
+
it "recognizes objects with attributes of different types" do
|
70
|
+
v = load_string '{"k1": "value", "k2": -123.456, "k3": null}'
|
71
|
+
|
72
|
+
completed = v.validate
|
73
|
+
expect(completed).to be true
|
74
|
+
expect(v.stats(:object)).to be 1
|
75
|
+
expect(v.stats(:string)).to be 4
|
76
|
+
expect(v.stats(:literal)).to be 2
|
77
|
+
end
|
78
|
+
|
79
|
+
it "recognizes condensed objects (no whitespaces)" do
|
80
|
+
v = load_string '{"a":"b","c":"d"}'
|
81
|
+
|
82
|
+
completed = v.validate
|
83
|
+
expect(completed).to be true
|
84
|
+
expect(v.stats(:object)).to be 1
|
85
|
+
expect(v.stats(:string)).to be 4
|
86
|
+
end
|
87
|
+
|
88
|
+
it "recognizes formatted objects" do
|
89
|
+
v = load_string '{
|
90
|
+
"a":"b",
|
91
|
+
"c":"d"
|
92
|
+
}'
|
93
|
+
|
94
|
+
completed = v.validate
|
95
|
+
expect(completed).to be true
|
96
|
+
expect(v.stats(:object)).to be 1
|
97
|
+
expect(v.stats(:string)).to be 4
|
98
|
+
end
|
99
|
+
|
100
|
+
it "recognizes objects with nested objects and arrays" do
|
101
|
+
v = load_string '{
|
102
|
+
"a": {
|
103
|
+
"a1": "-",
|
104
|
+
"a2": "-",
|
105
|
+
"a3": {
|
106
|
+
"a3.1": "-"
|
107
|
+
},
|
108
|
+
},
|
109
|
+
"c": [1, null]
|
110
|
+
}'
|
111
|
+
|
112
|
+
completed = v.validate
|
113
|
+
expect(completed).to be true
|
114
|
+
expect(v.stats(:object)).to be 3
|
115
|
+
expect(v.stats(:array)).to be 1
|
116
|
+
expect(v.stats(:string)).to be 9
|
117
|
+
expect(v.stats(:literal)).to be 2
|
118
|
+
end
|
119
|
+
|
120
|
+
it "rejects objects without double-quoted attribute names" do
|
121
|
+
expect do
|
122
|
+
v = load_string '{a:"b",c:"d"}'
|
123
|
+
v.validate
|
124
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
125
|
+
end
|
126
|
+
|
127
|
+
it "rejects objects without comma separators" do
|
128
|
+
expect do
|
129
|
+
v = load_string '{
|
130
|
+
"a":"b"
|
131
|
+
"c":"d"
|
132
|
+
}'
|
133
|
+
v.validate
|
134
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
describe 'When reading arrays' do
|
139
|
+
it "recognizes empty arrays" do
|
140
|
+
v = load_string '[]'
|
141
|
+
|
142
|
+
completed = v.validate
|
143
|
+
expect(completed).to be true
|
144
|
+
expect(v.stats(:array)).to be 1
|
145
|
+
expect(v.stats(:string)).to be 0
|
146
|
+
end
|
147
|
+
|
148
|
+
it "recognizes arrays with a single element" do
|
149
|
+
v = load_string '[{}]'
|
150
|
+
|
151
|
+
completed = v.validate
|
152
|
+
expect(completed).to be true
|
153
|
+
expect(v.stats(:array)).to be 1
|
154
|
+
expect(v.stats(:object)).to be 1
|
155
|
+
end
|
156
|
+
|
157
|
+
it "recognizes arrays with elements of different types" do
|
158
|
+
v = load_string '[{"k1": "value"}, [], "a string", null, -123.456]'
|
159
|
+
|
160
|
+
completed = v.validate
|
161
|
+
expect(completed).to be true
|
162
|
+
expect(v.stats(:array)).to be 2
|
163
|
+
expect(v.stats(:object)).to be 1
|
164
|
+
expect(v.stats(:string)).to be 3
|
165
|
+
expect(v.stats(:literal)).to be 2
|
166
|
+
end
|
167
|
+
|
168
|
+
it "recognizes condensed arrays (no whitespaces)" do
|
169
|
+
v = load_string '["a",2,null,false]'
|
170
|
+
|
171
|
+
completed = v.validate
|
172
|
+
expect(completed).to be true
|
173
|
+
expect(v.stats(:array)).to be 1
|
174
|
+
expect(v.stats(:string)).to be 1
|
175
|
+
expect(v.stats(:literal)).to be 3
|
176
|
+
end
|
177
|
+
|
178
|
+
it "recognizes formatted arrays" do
|
179
|
+
v = load_string '[
|
180
|
+
{
|
181
|
+
"a":"b"
|
182
|
+
},
|
183
|
+
{
|
184
|
+
"c":"d"
|
185
|
+
}
|
186
|
+
]'
|
187
|
+
|
188
|
+
completed = v.validate
|
189
|
+
expect(completed).to be true
|
190
|
+
expect(v.stats(:array)).to be 1
|
191
|
+
expect(v.stats(:object)).to be 2
|
192
|
+
expect(v.stats(:string)).to be 4
|
193
|
+
end
|
194
|
+
|
195
|
+
it "recognizes arrays with nested objects and arrays" do
|
196
|
+
v = load_string '[{
|
197
|
+
"a": {
|
198
|
+
"a1": "-",
|
199
|
+
"a2": "-",
|
200
|
+
"a3": {
|
201
|
+
"a3.1": "-"
|
202
|
+
},
|
203
|
+
},
|
204
|
+
"c": [1, null]
|
205
|
+
},
|
206
|
+
[{ "a": "b" }, { "c":"d" }]
|
207
|
+
]'
|
208
|
+
|
209
|
+
completed = v.validate
|
210
|
+
expect(completed).to be true
|
211
|
+
expect(v.stats(:array)).to be 3
|
212
|
+
expect(v.stats(:object)).to be 5
|
213
|
+
expect(v.stats(:string)).to be 13
|
214
|
+
expect(v.stats(:literal)).to be 2
|
215
|
+
end
|
216
|
+
|
217
|
+
it "rejects arrays without comma separators" do
|
218
|
+
expect do
|
219
|
+
v = load_string '[
|
220
|
+
"abc"
|
221
|
+
"def"
|
222
|
+
]'
|
223
|
+
v.validate
|
224
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
describe 'When reading strings' do
|
229
|
+
it "recognizes regular strings" do
|
230
|
+
v = load_string '["abc", "def", "ghi"]'
|
231
|
+
|
232
|
+
completed = v.validate
|
233
|
+
expect(completed).to be true
|
234
|
+
expect(v.stats(:string)).to be 3
|
235
|
+
end
|
236
|
+
|
237
|
+
it "recognizes strings containing excaped characters" do
|
238
|
+
v = load_string '["ab\"c", "6\\2=3"]'
|
239
|
+
|
240
|
+
completed = v.validate
|
241
|
+
expect(completed).to be true
|
242
|
+
expect(v.stats(:string)).to be 2
|
243
|
+
end
|
244
|
+
|
245
|
+
it "recognizes strings containing UTF8 characters" do
|
246
|
+
v = load_string '["abc😃🐶👀", "😃2🐶3👀"]'
|
247
|
+
|
248
|
+
completed = v.validate
|
249
|
+
expect(completed).to be true
|
250
|
+
expect(v.stats(:string)).to be 2
|
251
|
+
end
|
252
|
+
|
253
|
+
it "recognizes long strings containing UTF8 characters" do
|
254
|
+
v = load_string '["aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀"]'
|
255
|
+
|
256
|
+
completed = v.validate
|
257
|
+
expect(completed).to be true
|
258
|
+
expect(v.stats(:string)).to be 1
|
259
|
+
end
|
260
|
+
end
|
261
|
+
|
262
|
+
describe 'When reading literals' do
|
263
|
+
it "recognizes numbers" do
|
264
|
+
v = load_string '[1, -2.4, 1.0E+2]'
|
265
|
+
|
266
|
+
completed = v.validate
|
267
|
+
expect(completed).to be true
|
268
|
+
expect(v.stats(:literal)).to be 3
|
269
|
+
end
|
270
|
+
|
271
|
+
it "recognizes boolean values" do
|
272
|
+
v = load_string '[true, false]'
|
273
|
+
|
274
|
+
completed = v.validate
|
275
|
+
expect(completed).to be true
|
276
|
+
expect(v.stats(:literal)).to be 2
|
277
|
+
end
|
278
|
+
|
279
|
+
it "recognizes 'true', 'false' and 'null'" do
|
280
|
+
v = load_string '[true, false, null]'
|
281
|
+
|
282
|
+
completed = v.validate
|
283
|
+
expect(completed).to be true
|
284
|
+
expect(v.stats(:literal)).to be 3
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
288
|
+
describe 'When reading invalid JSON content' do
|
289
|
+
it "rejects truncated JSON content" do
|
290
|
+
expect do
|
291
|
+
v = load_string '[{
|
292
|
+
"a": ["abc","def"],
|
293
|
+
"b": 4'
|
294
|
+
v.validate
|
295
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
296
|
+
end
|
297
|
+
end
|
298
|
+
|
299
|
+
describe 'When reading large JSON files' do
|
300
|
+
it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON" do
|
301
|
+
v = load_file 'long_file_valid.json'
|
302
|
+
|
303
|
+
completed = v.validate
|
304
|
+
expect(completed).to be false
|
305
|
+
end
|
306
|
+
|
307
|
+
it "Returns 'false' without throwing errors when for long non-formatted JSON files" do
|
308
|
+
v = load_file 'long_file_valid_non_formatted.json'
|
309
|
+
|
310
|
+
completed = v.validate
|
311
|
+
expect(completed).to be false
|
312
|
+
end
|
313
|
+
|
314
|
+
it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON even if there's an issue later" do
|
315
|
+
v = load_file 'long_file_malformed.json'
|
316
|
+
|
317
|
+
completed = v.validate
|
318
|
+
expect(completed).to be false
|
319
|
+
end
|
320
|
+
end
|
321
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::JSONParser do
|
4
|
+
MAX_READS = 100
|
5
|
+
|
6
|
+
def load_file(file_name)
|
7
|
+
io = File.open(Pathname.new(fixtures_dir).join('JSON').join(file_name), 'rb')
|
8
|
+
FormatParser::ReadLimiter.new(io, max_reads: MAX_READS)
|
9
|
+
end
|
10
|
+
|
11
|
+
def file_size(file_name)
|
12
|
+
File.size(Pathname.new(fixtures_dir).join('JSON').join(file_name))
|
13
|
+
end
|
14
|
+
|
15
|
+
describe 'When reading objects valid JSON files' do
|
16
|
+
it "identifies JSON files with objects as root nodes" do
|
17
|
+
io = load_file 'object.json'
|
18
|
+
|
19
|
+
parsed = subject.call(io)
|
20
|
+
|
21
|
+
expect(parsed).not_to be_nil
|
22
|
+
expect(parsed.nature).to eq(:text)
|
23
|
+
expect(parsed.format).to eq(:json)
|
24
|
+
expect(parsed.content_type).to eq('application/json')
|
25
|
+
end
|
26
|
+
|
27
|
+
it "identifies JSON files carrying arrays as root nodes" do
|
28
|
+
io = load_file 'array.json'
|
29
|
+
|
30
|
+
parsed = subject.call(io)
|
31
|
+
|
32
|
+
expect(parsed).not_to be_nil
|
33
|
+
expect(parsed.nature).to eq(:text)
|
34
|
+
expect(parsed.format).to eq(:json)
|
35
|
+
expect(parsed.content_type).to eq('application/json')
|
36
|
+
end
|
37
|
+
|
38
|
+
it "identifies formatted JSON files" do
|
39
|
+
io = load_file 'formatted_object_utf8.json'
|
40
|
+
|
41
|
+
parsed = subject.call(io)
|
42
|
+
|
43
|
+
expect(parsed).not_to be_nil
|
44
|
+
expect(parsed.nature).to eq(:text)
|
45
|
+
expect(parsed.format).to eq(:json)
|
46
|
+
expect(parsed.content_type).to eq('application/json')
|
47
|
+
end
|
48
|
+
|
49
|
+
it "identifies files wrapped in whitespace characters" do
|
50
|
+
io = load_file 'whitespaces.json'
|
51
|
+
|
52
|
+
parsed = subject.call(io)
|
53
|
+
|
54
|
+
expect(parsed).not_to be_nil
|
55
|
+
expect(parsed.nature).to eq(:text)
|
56
|
+
expect(parsed.format).to eq(:json)
|
57
|
+
expect(parsed.content_type).to eq('application/json')
|
58
|
+
end
|
59
|
+
|
60
|
+
it "identifies files with nested objects and arrays" do
|
61
|
+
io = load_file 'nested_objects.json'
|
62
|
+
|
63
|
+
parsed = subject.call(io)
|
64
|
+
|
65
|
+
expect(parsed).not_to be_nil
|
66
|
+
expect(parsed.nature).to eq(:text)
|
67
|
+
expect(parsed.format).to eq(:json)
|
68
|
+
expect(parsed.content_type).to eq('application/json')
|
69
|
+
end
|
70
|
+
|
71
|
+
it "is reads the whole content of small files before accepting them" do
|
72
|
+
file_name = 'nested_objects.json'
|
73
|
+
io = load_file file_name
|
74
|
+
file_size = file_size file_name
|
75
|
+
|
76
|
+
parsed = subject.call(io)
|
77
|
+
|
78
|
+
expect(parsed).not_to be_nil
|
79
|
+
expect(parsed.nature).to eq(:text)
|
80
|
+
expect(parsed.format).to eq(:json)
|
81
|
+
expect(parsed.content_type).to eq('application/json')
|
82
|
+
expect(io.bytes).to be >= file_size
|
83
|
+
end
|
84
|
+
|
85
|
+
it "is accepts long files before reading the whole content" do
|
86
|
+
file_name = 'long_array_numbers.json'
|
87
|
+
io = load_file file_name
|
88
|
+
file_size = file_size file_name
|
89
|
+
|
90
|
+
parsed = subject.call(io)
|
91
|
+
|
92
|
+
expect(parsed).not_to be_nil
|
93
|
+
expect(parsed.nature).to eq(:text)
|
94
|
+
expect(parsed.format).to eq(:json)
|
95
|
+
expect(parsed.content_type).to eq('application/json')
|
96
|
+
expect(io.bytes).to be < file_size
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
describe 'When reading objects invalid JSON files' do
|
101
|
+
it "rejects files with corrupted JSON data" do
|
102
|
+
io = load_file 'malformed.json'
|
103
|
+
|
104
|
+
parsed = subject.call(io)
|
105
|
+
|
106
|
+
expect(parsed).to be_nil
|
107
|
+
end
|
108
|
+
|
109
|
+
it "rejects invalid files early without reading the whole content" do
|
110
|
+
io = load_file 'lorem_ipsum.json'
|
111
|
+
|
112
|
+
parsed = subject.call(io)
|
113
|
+
|
114
|
+
expect(parsed).to be_nil
|
115
|
+
expect(io.reads).to eq(1)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-06-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: exifr
|
@@ -236,6 +236,8 @@ files:
|
|
236
236
|
- lib/parsers/iso_base_media_file_format/decoder.rb
|
237
237
|
- lib/parsers/iso_base_media_file_format/utils.rb
|
238
238
|
- lib/parsers/jpeg_parser.rb
|
239
|
+
- lib/parsers/json_parser.rb
|
240
|
+
- lib/parsers/json_parser/validator.rb
|
239
241
|
- lib/parsers/m3u_parser.rb
|
240
242
|
- lib/parsers/mov_parser.rb
|
241
243
|
- lib/parsers/mov_parser/decoder.rb
|
@@ -260,6 +262,7 @@ files:
|
|
260
262
|
- lib/remote_io.rb
|
261
263
|
- lib/string.rb
|
262
264
|
- lib/text.rb
|
265
|
+
- lib/utf8_reader.rb
|
263
266
|
- lib/video.rb
|
264
267
|
- spec/active_storage/blob_io_spec.rb
|
265
268
|
- spec/active_storage/rails_app_spec.rb
|
@@ -289,6 +292,8 @@ files:
|
|
289
292
|
- spec/parsers/iso_base_media_file_format/decoder_spec.rb
|
290
293
|
- spec/parsers/iso_base_media_file_format/utils_spec.rb
|
291
294
|
- spec/parsers/jpeg_parser_spec.rb
|
295
|
+
- spec/parsers/json_parser/validator_spec.rb
|
296
|
+
- spec/parsers/json_parser_spec.rb
|
292
297
|
- spec/parsers/m3u_parser_spec.rb
|
293
298
|
- spec/parsers/mov_parser_spec.rb
|
294
299
|
- spec/parsers/mp3_parser_spec.rb
|