format_parser 2.6.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/lib/format_parser/version.rb +1 -1
- data/lib/format_parser.rb +1 -0
- data/lib/parsers/json_parser/validator.rb +319 -0
- data/lib/parsers/json_parser.rb +25 -0
- data/lib/utf8_reader.rb +68 -0
- data/spec/parsers/json_parser/validator_spec.rb +321 -0
- data/spec/parsers/json_parser_spec.rb +118 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f7b8b37e26143e2ea941db88183475e70c90ab658adcdad362b32b457a7d19ac
|
4
|
+
data.tar.gz: 50758e107065e1a2ab4fbca7775359d928cd1a62942277a94fcbabfefa1bfe10
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d34bcd7b0162fe6f911bdd8c3b626dd9ce35b98139bca6ec1b54e653bc7e50453af734c74134745a46e91fbc3528d88f1663fabdbd35e06ae908a41f8e81dcd7
|
7
|
+
data.tar.gz: 2ebbc65f373a3e34a2e8300d40bb239df6d2003dab54b9c461a8cc75f651800d93273aa73bfb4f1a14f195e65f348c218b90b4c1bae860c8269e024ebb2e890c
|
data/README.md
CHANGED
data/lib/format_parser.rb
CHANGED
@@ -17,6 +17,7 @@ module FormatParser
|
|
17
17
|
require_relative 'read_limits_config'
|
18
18
|
require_relative 'remote_io'
|
19
19
|
require_relative 'io_constraint'
|
20
|
+
require_relative 'utf8_reader'
|
20
21
|
require_relative 'care'
|
21
22
|
require_relative 'active_storage/blob_analyzer'
|
22
23
|
require_relative 'text'
|
@@ -0,0 +1,319 @@
|
|
1
|
+
##
|
2
|
+
# This class checks whether a given file is a valid JSON file.
|
3
|
+
# The validation process DOES NOT assemble an object with the contents of the JSON file in memory,
|
4
|
+
# Instead, it implements a simple state-machine-like parser that digests the contents of the file while traversing
|
5
|
+
# the hierarchy of nodes in the document.
|
6
|
+
#
|
7
|
+
# Although this is based on the IETF standard (https://www.rfc-editor.org/rfc/rfc8259),
|
8
|
+
# it does cut a few corners for the sake of simplicity. For instance, instead of validating
|
9
|
+
# Numbers, "true", "false" and "null" tokens, it supports a type called Literal to hold generic sequences of characters.
|
10
|
+
# This decision makes the implementation simpler while being a good-enough approach to identify JSON files.
|
11
|
+
#
|
12
|
+
# There is also a cap. Large files are not read all the way through. Instead, if the beginning of the file is
|
13
|
+
# JSON-compliant, it is assumed that the file is a JSON file.
|
14
|
+
|
15
|
+
class FormatParser::JSONParser::Validator
|
16
|
+
# Raised whenever the sampled content cannot be accepted as JSON.
JSONParserError = Class.new(StandardError)
|
18
|
+
|
19
|
+
# Maximum number of characters inspected before the sample is considered good enough.
MAX_SAMPLE_SIZE = 1024
# Maximum number of characters tolerated in a single literal.
MAX_LITERAL_SIZE = 30 # much larger than necessary.
# Character that escapes the next character inside a string.
ESCAPE_CHAR = "\\".freeze
# Insignificant whitespace allowed between tokens.
WHITESPACE_CHARS = [" ", "\t", "\n", "\r"].freeze
# Characters that may directly follow (and therefore terminate) a literal.
ENDING_VALUE_CHARS = [",", "]", "}"].freeze
# Characters accepted inside a literal.
LITERALS_CHAR_TEMPLATE = /\w|[+\-.]/.freeze # any alphanumeric, "+", "-" and "."
|
25
|
+
|
26
|
+
# Prepares a validator that reads from +io+.
#
# All state-machine bookkeeping starts empty: no node is open, the root node
# is still awaited and every node counter is zero. The transition table is
# registered last, once the state it captures exists.
def initialize(io)
  @io = io

  # Position and per-token bookkeeping.
  @pos = 0
  @current_literal_size = 0
  @escape_next = false

  # Node stack: the node being read (:object, :array, :string, :literal) and its ancestors.
  @current_state = :awaiting_root_node
  @current_node = nil
  @parent_nodes = []

  # State => handler table, filled by setup_transitions.
  @all_parsers = {}

  # Number of nodes opened so far, per node type.
  @execution_stats = { array: 0, object: 0, literal: 0, string: 0 }

  setup_transitions
end
|
46
|
+
|
47
|
+
# Feeds the IO to the state machine, one character at a time.
#
# Returns true when the whole content was consumed and the root node was
# closed, and false when the sampling limit (MAX_SAMPLE_SIZE characters) was
# reached without finding any problem.
#
# Raises JSONParserError when a character is rejected, when the content ends
# before the root node is closed, when the sampling limit is reached without
# a root node having been found, or when the reader reports invalid UTF-8.
def validate
  reader = FormatParser::UTF8Reader.new(@io)

  until (char = reader.read_char).nil?
    @pos += 1
    parse_char char

    # Keep going until the sampling limit is reached.
    next if @pos < MAX_SAMPLE_SIZE

    raise JSONParserError, "Invalid JSON file" if @current_state == :awaiting_root_node
    return false
  end

  # EOF reached: the document must have been closed by now.
  raise JSONParserError, "Incomplete JSON file" unless @current_state == :closed
  true
rescue FormatParser::UTF8Reader::UTF8CharReaderError
  raise JSONParserError, "Invalid UTF-8 character"
end
|
67
|
+
|
68
|
+
# Returns how many nodes of +node_type+ (:array, :object, :literal or
# :string) have been opened so far, or nil for a type that was never counted.
def stats(node_type)
  counters = @execution_stats
  counters[node_type]
end
|
71
|
+
|
72
|
+
private
|
73
|
+
|
74
|
+
# Registers, for every state of the machine, the handler that decides whether
# the next character is acceptable in that state. Each handler tries its
# candidate actions in order and stops at the first one that accepts the
# character; a falsy result means the character is rejected.
def setup_transitions
  when_its(:awaiting_root_node, ->(c) {
    read_whitespace(c) || start_object(c) || start_array(c)
  })

  when_its(:awaiting_object_attribute_key, ->(c) {
    read_whitespace(c) || start_attribute_key(c) || close_object(c)
  })

  when_its(:reading_object_attribute_key, ->(c) {
    close_attribute_key(c) || read_valid_string_char(c)
  })

  when_its(:awaiting_object_colon_separator, ->(c) {
    read_whitespace(c) || read_colon(c)
  })

  when_its(:awaiting_object_attribute_value, ->(c) {
    read_whitespace(c) || start_object(c) || start_array(c) || start_string(c) || start_literal(c)
  })

  when_its(:awaiting_array_value, ->(c) {
    read_whitespace(c) || start_object(c) || start_array(c) || start_string(c) || start_literal(c) || close_array(c)
  })

  when_its(:reading_string, ->(c) {
    close_string(c) || read_valid_string_char(c)
  })

  when_its(:awaiting_next_or_close, ->(c) {
    read_whitespace(c) || read_comma_separator(c) || close_object(c) || close_array(c)
  })

  # A literal has no closing delimiter: the character that ends it must also
  # be processed as whatever comes after a value.
  when_its(:reading_literal, ->(c) {
    read_valid_literal_char(c) ||
      (close_literal(c) &&
        (read_whitespace(c) || read_comma_separator(c) || close_array(c) || close_object(c)))
  })

  when_its(:closed, ->(c) { read_whitespace(c) })
end
|
139
|
+
|
140
|
+
# Associates the handler +act+ (a callable taking one character) with +state+.
def when_its(state, act)
  @all_parsers.store(state, act)
end
|
143
|
+
|
144
|
+
# Hands the character +c+ to the handler of the current state and rejects it
# (see reject_char) when the handler does not accept it.
def parse_char(c)
  handler = @all_parsers[@current_state]
  return if handler.call(c)

  reject_char(c)
end
|
149
|
+
|
150
|
+
# Accepts +c+ when it is a whitespace character. The state is left untouched,
# so whitespace is simply skipped wherever this action is allowed.
def read_whitespace(c)
  whitespace?(c)
end
|
153
|
+
|
154
|
+
# Accepts the ":" that separates an attribute key from its value and moves on
# to awaiting that value. Returns false for any other character.
def read_colon(c)
  return false unless c == ":"

  @current_state = :awaiting_object_attribute_value
  true
end
|
161
|
+
|
162
|
+
# Accepts +c+ as part of the string being read.
#
# A character that follows the escape character is accepted whatever it is.
# The escape character itself is accepted and arms the escape for the next
# character. Anything else is accepted unless it is a control character or an
# unescaped double quote.
def read_valid_string_char(c)
  if @escape_next
    # Consume the pending escape.
    @escape_next = false
    true
  elsif c == ESCAPE_CHAR
    @escape_next = true
    true
  else
    !control_char?(c) && c != "\""
  end
end
|
174
|
+
|
175
|
+
# Accepts +c+ as part of the literal being read and counts it towards the
# literal's size. Returns false, leaving the size untouched, otherwise.
def read_valid_literal_char(c)
  return false unless valid_literal_char?(c)

  @current_literal_size += 1
  true
end
|
183
|
+
|
184
|
+
# Accepts the "," that separates two members of the enclosing node and waits
# for the next member: a key inside an object, a value inside an array.
# Returns false for any other character.
def read_comma_separator(c)
  return false unless c == ","

  case @current_node
  when :object then @current_state = :awaiting_object_attribute_key
  when :array then @current_state = :awaiting_array_value
  end
  true
end
|
192
|
+
|
193
|
+
# Object: {"k1":"val", "k2":[1,2,3], "k4": undefined, "k5": {"l1": 6}}
|
194
|
+
# Opens an object node when +c+ is "{" and waits for its first attribute key
# (or for its immediate closure). Returns false for any other character.
#
# No separate whitespace guard is needed: a whitespace character is never
# "{", which also keeps this in line with start_array.
def start_object(c)
  return false unless c == "{"

  begin_node(:object)
  @current_state = :awaiting_object_attribute_key
  true
end
|
202
|
+
|
203
|
+
# Closes the current object when +c+ is "}" and an object is the node being
# read. Returns false otherwise.
#
# No separate whitespace guard is needed: a whitespace character is never "}".
def close_object(c)
  return false unless c == "}" && @current_node == :object

  end_node
  # end_node moves to :closed once the root is gone; otherwise the parent
  # node now waits for a separator or for its own closure.
  @current_state = :awaiting_next_or_close unless @current_node.nil?
  true
end
|
211
|
+
|
212
|
+
# Array: [1, "two", true, undefined, {}, []]
|
213
|
+
# Opens an array node when +c+ is "[" and waits for its first value (or for
# its immediate closure). Returns false for any other character.
def start_array(c)
  if c == "["
    begin_node(:array)
    @current_state = :awaiting_array_value
    true
  else
    false
  end
end
|
220
|
+
|
221
|
+
# Closes the current array when +c+ is "]" and an array is the node being
# read. Returns false otherwise.
#
# No separate whitespace guard is needed: a whitespace character is never "]".
def close_array(c)
  return false unless c == "]" && @current_node == :array

  end_node
  # end_node moves to :closed once the root is gone; otherwise the parent
  # node now waits for a separator or for its own closure.
  @current_state = :awaiting_next_or_close unless @current_node.nil?
  true
end
|
229
|
+
|
230
|
+
# Opens the string node of an attribute key when +c+ is a double quote.
# Returns false for any other character.
def start_attribute_key(c)
  if c == "\""
    begin_node(:string)
    @current_state = :reading_object_attribute_key
    true
  else
    false
  end
end
|
237
|
+
|
238
|
+
# Closes the attribute key being read when +c+ is an unescaped double quote,
# then waits for the colon separator. Returns false otherwise.
def close_attribute_key(c)
  return false if @escape_next || c != "\""

  end_node
  @current_state = :awaiting_object_colon_separator
  true
end
|
245
|
+
|
246
|
+
# Strings: "Foo"
|
247
|
+
# Opens a string value node when +c+ is a double quote. Returns false for any
# other character.
def start_string(c)
  if c == "\""
    begin_node(:string)
    @current_state = :reading_string
    true
  else
    false
  end
end
|
254
|
+
|
255
|
+
# Closes the string value being read when +c+ is an unescaped double quote,
# then waits for a separator or for the closure of the enclosing node.
# Returns false otherwise.
def close_string(c)
  return false if @escape_next || c != "\""

  end_node
  @current_state = :awaiting_next_or_close
  true
end
|
262
|
+
|
263
|
+
# literals: null, undefined, true, false, NaN, infinity, -123.456e10 -123,456e10
|
264
|
+
# Opens a literal node when +c+ is a valid literal character; that character
# counts as the first one of the literal. Returns false otherwise.
def start_literal(c)
  if valid_literal_char?(c)
    begin_node(:literal)
    @current_state = :reading_literal
    @current_literal_size = 1
    true
  else
    false
  end
end
|
272
|
+
|
273
|
+
# Closes the literal being read when +c+ is a character that may follow a
# value: whitespace, ",", "]" or "}". Returns false for any other character.
#
# Raises JSONParserError when the literal accumulated so far is longer than
# MAX_LITERAL_SIZE; the check runs before +c+ is looked at.
def close_literal(c)
  raise JSONParserError, "Literal too large at #{@pos}" if @current_literal_size > MAX_LITERAL_SIZE

  return false unless whitespace?(c) || ENDING_VALUE_CHARS.include?(c)

  end_node
  @current_state = :awaiting_next_or_close
  true
end
|
284
|
+
|
285
|
+
# Marks the creation of a node (object, array, string or literal)
|
286
|
+
# Registers the opening of a node (object, array, string or literal): bumps
# the counter of its type, then makes it the current node after pushing the
# previous current node onto the stack of parents.
def begin_node(node_type)
  @execution_stats[node_type] = (@execution_stats[node_type] || 0) + 1

  @parent_nodes << @current_node
  @current_node = node_type
end
|
295
|
+
|
296
|
+
# Marks the closure of a node (object, array, string or literal)
|
297
|
+
# Registers the closure of the current node (object, array, string or
# literal): its parent becomes the current node again. When there is no
# parent left the root node was just closed, so the machine is closed too.
def end_node
  parent = @parent_nodes.pop
  @current_node = parent
  @current_state = :closed if parent.nil?
end
|
301
|
+
|
302
|
+
# Aborts the validation: always raises JSONParserError, reporting the
# offending character and the position it was read at.
def reject_char(char)
  message = "Unexpected char #{char} in position #{@pos}"
  raise JSONParserError, message
end
|
305
|
+
|
306
|
+
# Tells whether +c+ is one of the characters listed in WHITESPACE_CHARS.
def whitespace?(c)
  WHITESPACE_CHARS.member?(c)
end
|
309
|
+
|
310
|
+
# Tells whether +c+ is a control character (U+0000 through U+001F).
#
# +c+ is expected to be a single, validly encoded character. String#ord gives
# its code point directly, without building an intermediate array.
def control_char?(c)
  c.ord <= 31
end
|
315
|
+
|
316
|
+
# Tells whether +c+ may be part of a literal, i.e. whether it matches
# LITERALS_CHAR_TEMPLATE.
def valid_literal_char?(c)
  LITERALS_CHAR_TEMPLATE.match?(c)
end
|
319
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
class FormatParser::JSONParser
|
2
|
+
include FormatParser::IOUtils
|
3
|
+
require_relative 'json_parser/validator'
|
4
|
+
|
5
|
+
JSON_MIME_TYPE = 'application/json'
|
6
|
+
|
7
|
+
# Tells whether +filename+ looks like a JSON file, judging by its extension
# (case-insensitive). Returns a truthy value (the match position) or nil.
#
# The pattern is anchored with \z rather than $: $ also matches right before
# a line break, which would accept names such as "data.json\nsomething.exe".
def likely_match?(filename)
  filename =~ /\.json\z/i
end
|
10
|
+
|
11
|
+
# Parser entry point: validates the content of +io+ as JSON.
#
# Returns a FormatParser::Text result flagged as :json when the validator
# raises nothing, and nil when it raises Validator::JSONParserError.
def call(io)
  constrained_io = FormatParser::IOConstraint.new(io)
  Validator.new(constrained_io).validate

  FormatParser::Text.new(format: :json, content_type: JSON_MIME_TYPE)
rescue Validator::JSONParserError
  nil
end
|
24
|
+
FormatParser.register_parser new, natures: :text, formats: :json
|
25
|
+
end
|
data/lib/utf8_reader.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
##
|
2
|
+
# This class reads individual characters from files using UTF-8 encoding.
|
3
|
+
# This deals with two main concerns:
|
4
|
+
# - Variable byte length of characters
|
5
|
+
# - Reducing the number of read operations by loading bytes in chunks
|
6
|
+
|
7
|
+
class FormatParser::UTF8Reader
|
8
|
+
READ_CHUNK_SIZE = 128
|
9
|
+
|
10
|
+
# Raised when the bytes read do not form a valid UTF-8 character.
UTF8CharReaderError = Class.new(StandardError)
|
12
|
+
|
13
|
+
def initialize(io)
|
14
|
+
@io = io
|
15
|
+
@chunk = ""
|
16
|
+
@index = 0
|
17
|
+
@eof = false
|
18
|
+
end
|
19
|
+
|
20
|
+
# Reads the next character from the IO.
#
# Returns the character as a UTF-8 encoded String, or nil once there is
# nothing left to read.
#
# Raises UTF8CharReaderError when the input ends in the middle of a
# multi-byte character or when the bytes read are not valid UTF-8.
def read_char
  first_byte = read_byte
  return if first_byte.nil?

  bytes = [first_byte]
  # The first byte announces how many bytes the character takes.
  (assess_char_length(first_byte) - 1).times do
    next_byte = read_byte
    # Truncated input: the announced bytes are not all there.
    raise UTF8CharReaderError, "Invalid UTF-8 character" if next_byte.nil?

    bytes << next_byte
  end

  # 'C' is the unsigned 8-bit directive, matching byte values in 0..255.
  char = bytes.pack('C*').force_encoding('UTF-8')
  raise UTF8CharReaderError, "Invalid UTF-8 character" unless char.valid_encoding?

  char
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
# Returns the next byte (an Integer in 0..255) of the buffered chunk, loading
# a new chunk first when needed, or nil when no byte is available.
def read_byte
  manage_data_chunk
  return if @chunk.nil?

  # getbyte reads a single byte in place; it returns nil past the end.
  byte = @chunk.getbyte(@index)
  @index += 1 unless byte.nil?
  byte
end
|
47
|
+
|
48
|
+
# Loads the next chunk from the IO once every byte of the current one has
# been consumed; does nothing otherwise.
#
# @index counts bytes, so it is compared against the chunk's byte size: its
# character length can be smaller when the IO hands back a string in a
# multi-byte encoding, which would discard the bytes not consumed yet.
def manage_data_chunk
  return if @index < @chunk.bytesize

  # IO#read returns nil at EOF; an empty chunk stands for "no more data".
  @chunk = @io.read(READ_CHUNK_SIZE) || ""
  @index = 0
  @eof = true if @chunk.bytesize < READ_CHUNK_SIZE
end
|
55
|
+
|
56
|
+
# Tells how many bytes a character takes, judging by its first byte:
#
#   0_______ (1 byte)
#   110_____ (2 bytes)  from 192
#   1110____ (3 bytes)  from 224
#   11110___ (4 bytes)  from 240
#
# Any other first byte is reported as a 1-byte character.
def assess_char_length(first_byte)
  return 4 if first_byte >= 240
  return 3 if first_byte >= 224
  return 2 if first_byte >= 192

  1
end
|
68
|
+
end
|
@@ -0,0 +1,321 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::JSONParser::Validator do
|
4
|
+
# Builds a validator reading the JSON fixture +file_name+, opened in binary mode.
def load_file(file_name)
  fixture_path = Pathname.new(fixtures_dir).join('JSON', file_name)
  FormatParser::JSONParser::Validator.new(File.open(fixture_path, 'rb'))
end
|
8
|
+
|
9
|
+
# Builds a validator reading +content+, encoded as UTF-8, from memory.
def load_string(content)
  utf8_content = content.encode(Encoding::UTF_8)
  FormatParser::JSONParser::Validator.new(StringIO.new(utf8_content))
end
|
13
|
+
|
14
|
+
describe 'When reading root nodes' do
|
15
|
+
it "identifies objects as root nodes" do
|
16
|
+
v = load_string '{"key": "value"}'
|
17
|
+
|
18
|
+
completed = v.validate
|
19
|
+
|
20
|
+
expect(completed).to be true
|
21
|
+
expect(v.stats(:object)).to be 1
|
22
|
+
expect(v.stats(:string)).to be 2
|
23
|
+
end
|
24
|
+
|
25
|
+
it "identifies arrays as root nodes" do
|
26
|
+
v = load_string '["e1", "e2"]'
|
27
|
+
|
28
|
+
completed = v.validate
|
29
|
+
|
30
|
+
expect(completed).to be true
|
31
|
+
expect(v.stats(:array)).to be 1
|
32
|
+
expect(v.stats(:string)).to be 2
|
33
|
+
end
|
34
|
+
|
35
|
+
it "rejects strings as root nodes" do
|
36
|
+
expect do
|
37
|
+
v = load_string '"this is a string"'
|
38
|
+
v.validate
|
39
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
40
|
+
end
|
41
|
+
|
42
|
+
it "rejects literals as root nodes" do
|
43
|
+
expect do
|
44
|
+
v = load_string 'true'
|
45
|
+
v.validate
|
46
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'When reading objects' do
|
51
|
+
it "recognizes empty objects" do
|
52
|
+
v = load_string '{}'
|
53
|
+
|
54
|
+
completed = v.validate
|
55
|
+
expect(completed).to be true
|
56
|
+
expect(v.stats(:object)).to be 1
|
57
|
+
expect(v.stats(:string)).to be 0
|
58
|
+
end
|
59
|
+
|
60
|
+
it "recognizes objects with a single attribute" do
|
61
|
+
v = load_string '{"key": "value"}'
|
62
|
+
|
63
|
+
completed = v.validate
|
64
|
+
expect(completed).to be true
|
65
|
+
expect(v.stats(:object)).to be 1
|
66
|
+
expect(v.stats(:string)).to be 2
|
67
|
+
end
|
68
|
+
|
69
|
+
it "recognizes objects with attributes of different types" do
|
70
|
+
v = load_string '{"k1": "value", "k2": -123.456, "k3": null}'
|
71
|
+
|
72
|
+
completed = v.validate
|
73
|
+
expect(completed).to be true
|
74
|
+
expect(v.stats(:object)).to be 1
|
75
|
+
expect(v.stats(:string)).to be 4
|
76
|
+
expect(v.stats(:literal)).to be 2
|
77
|
+
end
|
78
|
+
|
79
|
+
it "recognizes condensed objects (no whitespaces)" do
|
80
|
+
v = load_string '{"a":"b","c":"d"}'
|
81
|
+
|
82
|
+
completed = v.validate
|
83
|
+
expect(completed).to be true
|
84
|
+
expect(v.stats(:object)).to be 1
|
85
|
+
expect(v.stats(:string)).to be 4
|
86
|
+
end
|
87
|
+
|
88
|
+
it "recognizes formatted objects" do
|
89
|
+
v = load_string '{
|
90
|
+
"a":"b",
|
91
|
+
"c":"d"
|
92
|
+
}'
|
93
|
+
|
94
|
+
completed = v.validate
|
95
|
+
expect(completed).to be true
|
96
|
+
expect(v.stats(:object)).to be 1
|
97
|
+
expect(v.stats(:string)).to be 4
|
98
|
+
end
|
99
|
+
|
100
|
+
it "recognizes objects with nested objects and arrays" do
|
101
|
+
v = load_string '{
|
102
|
+
"a": {
|
103
|
+
"a1": "-",
|
104
|
+
"a2": "-",
|
105
|
+
"a3": {
|
106
|
+
"a3.1": "-"
|
107
|
+
},
|
108
|
+
},
|
109
|
+
"c": [1, null]
|
110
|
+
}'
|
111
|
+
|
112
|
+
completed = v.validate
|
113
|
+
expect(completed).to be true
|
114
|
+
expect(v.stats(:object)).to be 3
|
115
|
+
expect(v.stats(:array)).to be 1
|
116
|
+
expect(v.stats(:string)).to be 9
|
117
|
+
expect(v.stats(:literal)).to be 2
|
118
|
+
end
|
119
|
+
|
120
|
+
it "rejects objects without double-quoted attribute names" do
|
121
|
+
expect do
|
122
|
+
v = load_string '{a:"b",c:"d"}'
|
123
|
+
v.validate
|
124
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
125
|
+
end
|
126
|
+
|
127
|
+
it "rejects objects without comma separators" do
|
128
|
+
expect do
|
129
|
+
v = load_string '{
|
130
|
+
"a":"b"
|
131
|
+
"c":"d"
|
132
|
+
}'
|
133
|
+
v.validate
|
134
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
describe 'When reading arrays' do
|
139
|
+
it "recognizes empty arrays" do
|
140
|
+
v = load_string '[]'
|
141
|
+
|
142
|
+
completed = v.validate
|
143
|
+
expect(completed).to be true
|
144
|
+
expect(v.stats(:array)).to be 1
|
145
|
+
expect(v.stats(:string)).to be 0
|
146
|
+
end
|
147
|
+
|
148
|
+
it "recognizes arrays with a single element" do
|
149
|
+
v = load_string '[{}]'
|
150
|
+
|
151
|
+
completed = v.validate
|
152
|
+
expect(completed).to be true
|
153
|
+
expect(v.stats(:array)).to be 1
|
154
|
+
expect(v.stats(:object)).to be 1
|
155
|
+
end
|
156
|
+
|
157
|
+
it "recognizes arrays with elements of different types" do
|
158
|
+
v = load_string '[{"k1": "value"}, [], "a string", null, -123.456]'
|
159
|
+
|
160
|
+
completed = v.validate
|
161
|
+
expect(completed).to be true
|
162
|
+
expect(v.stats(:array)).to be 2
|
163
|
+
expect(v.stats(:object)).to be 1
|
164
|
+
expect(v.stats(:string)).to be 3
|
165
|
+
expect(v.stats(:literal)).to be 2
|
166
|
+
end
|
167
|
+
|
168
|
+
it "recognizes condensed arrays (no whitespaces)" do
|
169
|
+
v = load_string '["a",2,null,false]'
|
170
|
+
|
171
|
+
completed = v.validate
|
172
|
+
expect(completed).to be true
|
173
|
+
expect(v.stats(:array)).to be 1
|
174
|
+
expect(v.stats(:string)).to be 1
|
175
|
+
expect(v.stats(:literal)).to be 3
|
176
|
+
end
|
177
|
+
|
178
|
+
it "recognizes formatted arrays" do
|
179
|
+
v = load_string '[
|
180
|
+
{
|
181
|
+
"a":"b"
|
182
|
+
},
|
183
|
+
{
|
184
|
+
"c":"d"
|
185
|
+
}
|
186
|
+
]'
|
187
|
+
|
188
|
+
completed = v.validate
|
189
|
+
expect(completed).to be true
|
190
|
+
expect(v.stats(:array)).to be 1
|
191
|
+
expect(v.stats(:object)).to be 2
|
192
|
+
expect(v.stats(:string)).to be 4
|
193
|
+
end
|
194
|
+
|
195
|
+
it "recognizes arrays with nested objects and arrays" do
|
196
|
+
v = load_string '[{
|
197
|
+
"a": {
|
198
|
+
"a1": "-",
|
199
|
+
"a2": "-",
|
200
|
+
"a3": {
|
201
|
+
"a3.1": "-"
|
202
|
+
},
|
203
|
+
},
|
204
|
+
"c": [1, null]
|
205
|
+
},
|
206
|
+
[{ "a": "b" }, { "c":"d" }]
|
207
|
+
]'
|
208
|
+
|
209
|
+
completed = v.validate
|
210
|
+
expect(completed).to be true
|
211
|
+
expect(v.stats(:array)).to be 3
|
212
|
+
expect(v.stats(:object)).to be 5
|
213
|
+
expect(v.stats(:string)).to be 13
|
214
|
+
expect(v.stats(:literal)).to be 2
|
215
|
+
end
|
216
|
+
|
217
|
+
it "rejects arrays without comma separators" do
|
218
|
+
expect do
|
219
|
+
v = load_string '[
|
220
|
+
"abc"
|
221
|
+
"def"
|
222
|
+
]'
|
223
|
+
v.validate
|
224
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
describe 'When reading strings' do
|
229
|
+
it "recognizes regular strings" do
|
230
|
+
v = load_string '["abc", "def", "ghi"]'
|
231
|
+
|
232
|
+
completed = v.validate
|
233
|
+
expect(completed).to be true
|
234
|
+
expect(v.stats(:string)).to be 3
|
235
|
+
end
|
236
|
+
|
237
|
+
it "recognizes strings containing excaped characters" do
|
238
|
+
v = load_string '["ab\"c", "6\\2=3"]'
|
239
|
+
|
240
|
+
completed = v.validate
|
241
|
+
expect(completed).to be true
|
242
|
+
expect(v.stats(:string)).to be 2
|
243
|
+
end
|
244
|
+
|
245
|
+
it "recognizes strings containing UTF8 characters" do
|
246
|
+
v = load_string '["abc😃🐶👀", "😃2🐶3👀"]'
|
247
|
+
|
248
|
+
completed = v.validate
|
249
|
+
expect(completed).to be true
|
250
|
+
expect(v.stats(:string)).to be 2
|
251
|
+
end
|
252
|
+
|
253
|
+
it "recognizes long strings containing UTF8 characters" do
|
254
|
+
v = load_string '["aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀aØڃಚ😁🐶👀"]'
|
255
|
+
|
256
|
+
completed = v.validate
|
257
|
+
expect(completed).to be true
|
258
|
+
expect(v.stats(:string)).to be 1
|
259
|
+
end
|
260
|
+
end
|
261
|
+
|
262
|
+
describe 'When reading literals' do
|
263
|
+
it "recognizes numbers" do
|
264
|
+
v = load_string '[1, -2.4, 1.0E+2]'
|
265
|
+
|
266
|
+
completed = v.validate
|
267
|
+
expect(completed).to be true
|
268
|
+
expect(v.stats(:literal)).to be 3
|
269
|
+
end
|
270
|
+
|
271
|
+
it "recognizes boolean values" do
|
272
|
+
v = load_string '[true, false]'
|
273
|
+
|
274
|
+
completed = v.validate
|
275
|
+
expect(completed).to be true
|
276
|
+
expect(v.stats(:literal)).to be 2
|
277
|
+
end
|
278
|
+
|
279
|
+
it "recognizes 'true', 'false' and 'null'" do
|
280
|
+
v = load_string '[true, false, null]'
|
281
|
+
|
282
|
+
completed = v.validate
|
283
|
+
expect(completed).to be true
|
284
|
+
expect(v.stats(:literal)).to be 3
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
288
|
+
describe 'When reading invalid JSON content' do
|
289
|
+
it "rejects truncated JSON content" do
|
290
|
+
expect do
|
291
|
+
v = load_string '[{
|
292
|
+
"a": ["abc","def"],
|
293
|
+
"b": 4'
|
294
|
+
v.validate
|
295
|
+
end.to raise_error(FormatParser::JSONParser::Validator::JSONParserError)
|
296
|
+
end
|
297
|
+
end
|
298
|
+
|
299
|
+
describe 'When reading large JSON files' do
|
300
|
+
it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON" do
|
301
|
+
v = load_file 'long_file_valid.json'
|
302
|
+
|
303
|
+
completed = v.validate
|
304
|
+
expect(completed).to be false
|
305
|
+
end
|
306
|
+
|
307
|
+
it "Returns 'false' without throwing errors when for long non-formatted JSON files" do
|
308
|
+
v = load_file 'long_file_valid_non_formatted.json'
|
309
|
+
|
310
|
+
completed = v.validate
|
311
|
+
expect(completed).to be false
|
312
|
+
end
|
313
|
+
|
314
|
+
it "Returns 'false' without throwing errors when the initial chunk of a file is a valid JSON even if there's an issue later" do
|
315
|
+
v = load_file 'long_file_malformed.json'
|
316
|
+
|
317
|
+
completed = v.validate
|
318
|
+
expect(completed).to be false
|
319
|
+
end
|
320
|
+
end
|
321
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe FormatParser::JSONParser do
|
4
|
+
MAX_READS = 100
|
5
|
+
|
6
|
+
# Opens the JSON fixture +file_name+ in binary mode, wrapped in a ReadLimiter
# that allows at most MAX_READS reads.
def load_file(file_name)
  fixture_path = Pathname.new(fixtures_dir).join('JSON', file_name)
  FormatParser::ReadLimiter.new(File.open(fixture_path, 'rb'), max_reads: MAX_READS)
end
|
10
|
+
|
11
|
+
# Returns the size, in bytes, of the JSON fixture +file_name+.
def file_size(file_name)
  fixture_path = Pathname.new(fixtures_dir).join('JSON', file_name)
  File.size(fixture_path)
end
|
14
|
+
|
15
|
+
describe 'When reading objects valid JSON files' do
|
16
|
+
it "identifies JSON files with objects as root nodes" do
|
17
|
+
io = load_file 'object.json'
|
18
|
+
|
19
|
+
parsed = subject.call(io)
|
20
|
+
|
21
|
+
expect(parsed).not_to be_nil
|
22
|
+
expect(parsed.nature).to eq(:text)
|
23
|
+
expect(parsed.format).to eq(:json)
|
24
|
+
expect(parsed.content_type).to eq('application/json')
|
25
|
+
end
|
26
|
+
|
27
|
+
it "identifies JSON files carrying arrays as root nodes" do
|
28
|
+
io = load_file 'array.json'
|
29
|
+
|
30
|
+
parsed = subject.call(io)
|
31
|
+
|
32
|
+
expect(parsed).not_to be_nil
|
33
|
+
expect(parsed.nature).to eq(:text)
|
34
|
+
expect(parsed.format).to eq(:json)
|
35
|
+
expect(parsed.content_type).to eq('application/json')
|
36
|
+
end
|
37
|
+
|
38
|
+
it "identifies formatted JSON files" do
|
39
|
+
io = load_file 'formatted_object_utf8.json'
|
40
|
+
|
41
|
+
parsed = subject.call(io)
|
42
|
+
|
43
|
+
expect(parsed).not_to be_nil
|
44
|
+
expect(parsed.nature).to eq(:text)
|
45
|
+
expect(parsed.format).to eq(:json)
|
46
|
+
expect(parsed.content_type).to eq('application/json')
|
47
|
+
end
|
48
|
+
|
49
|
+
it "identifies files wrapped in whitespace characters" do
|
50
|
+
io = load_file 'whitespaces.json'
|
51
|
+
|
52
|
+
parsed = subject.call(io)
|
53
|
+
|
54
|
+
expect(parsed).not_to be_nil
|
55
|
+
expect(parsed.nature).to eq(:text)
|
56
|
+
expect(parsed.format).to eq(:json)
|
57
|
+
expect(parsed.content_type).to eq('application/json')
|
58
|
+
end
|
59
|
+
|
60
|
+
it "identifies files with nested objects and arrays" do
|
61
|
+
io = load_file 'nested_objects.json'
|
62
|
+
|
63
|
+
parsed = subject.call(io)
|
64
|
+
|
65
|
+
expect(parsed).not_to be_nil
|
66
|
+
expect(parsed.nature).to eq(:text)
|
67
|
+
expect(parsed.format).to eq(:json)
|
68
|
+
expect(parsed.content_type).to eq('application/json')
|
69
|
+
end
|
70
|
+
|
71
|
+
it "is reads the whole content of small files before accepting them" do
|
72
|
+
file_name = 'nested_objects.json'
|
73
|
+
io = load_file file_name
|
74
|
+
file_size = file_size file_name
|
75
|
+
|
76
|
+
parsed = subject.call(io)
|
77
|
+
|
78
|
+
expect(parsed).not_to be_nil
|
79
|
+
expect(parsed.nature).to eq(:text)
|
80
|
+
expect(parsed.format).to eq(:json)
|
81
|
+
expect(parsed.content_type).to eq('application/json')
|
82
|
+
expect(io.bytes).to be >= file_size
|
83
|
+
end
|
84
|
+
|
85
|
+
it "is accepts long files before reading the whole content" do
|
86
|
+
file_name = 'long_array_numbers.json'
|
87
|
+
io = load_file file_name
|
88
|
+
file_size = file_size file_name
|
89
|
+
|
90
|
+
parsed = subject.call(io)
|
91
|
+
|
92
|
+
expect(parsed).not_to be_nil
|
93
|
+
expect(parsed.nature).to eq(:text)
|
94
|
+
expect(parsed.format).to eq(:json)
|
95
|
+
expect(parsed.content_type).to eq('application/json')
|
96
|
+
expect(io.bytes).to be < file_size
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
describe 'When reading objects invalid JSON files' do
|
101
|
+
it "rejects files with corrupted JSON data" do
|
102
|
+
io = load_file 'malformed.json'
|
103
|
+
|
104
|
+
parsed = subject.call(io)
|
105
|
+
|
106
|
+
expect(parsed).to be_nil
|
107
|
+
end
|
108
|
+
|
109
|
+
it "rejects invalid files early without reading the whole content" do
|
110
|
+
io = load_file 'lorem_ipsum.json'
|
111
|
+
|
112
|
+
parsed = subject.call(io)
|
113
|
+
|
114
|
+
expect(parsed).to be_nil
|
115
|
+
expect(io.reads).to eq(1)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: format_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Noah Berman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-06-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: exifr
|
@@ -236,6 +236,8 @@ files:
|
|
236
236
|
- lib/parsers/iso_base_media_file_format/decoder.rb
|
237
237
|
- lib/parsers/iso_base_media_file_format/utils.rb
|
238
238
|
- lib/parsers/jpeg_parser.rb
|
239
|
+
- lib/parsers/json_parser.rb
|
240
|
+
- lib/parsers/json_parser/validator.rb
|
239
241
|
- lib/parsers/m3u_parser.rb
|
240
242
|
- lib/parsers/mov_parser.rb
|
241
243
|
- lib/parsers/mov_parser/decoder.rb
|
@@ -260,6 +262,7 @@ files:
|
|
260
262
|
- lib/remote_io.rb
|
261
263
|
- lib/string.rb
|
262
264
|
- lib/text.rb
|
265
|
+
- lib/utf8_reader.rb
|
263
266
|
- lib/video.rb
|
264
267
|
- spec/active_storage/blob_io_spec.rb
|
265
268
|
- spec/active_storage/rails_app_spec.rb
|
@@ -289,6 +292,8 @@ files:
|
|
289
292
|
- spec/parsers/iso_base_media_file_format/decoder_spec.rb
|
290
293
|
- spec/parsers/iso_base_media_file_format/utils_spec.rb
|
291
294
|
- spec/parsers/jpeg_parser_spec.rb
|
295
|
+
- spec/parsers/json_parser/validator_spec.rb
|
296
|
+
- spec/parsers/json_parser_spec.rb
|
292
297
|
- spec/parsers/m3u_parser_spec.rb
|
293
298
|
- spec/parsers/mov_parser_spec.rb
|
294
299
|
- spec/parsers/mp3_parser_spec.rb
|