json_mend 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +60 -0
- data/.tool-versions +1 -0
- data/CODE_OF_CONDUCT.md +128 -0
- data/LICENSE +21 -0
- data/README.md +65 -0
- data/Rakefile +12 -0
- data/lib/json_mend/parser.rb +1175 -0
- data/lib/json_mend/version.rb +5 -0
- data/lib/json_mend.rb +32 -0
- data/sig/json_mend.rbs +5 -0
- metadata +82 -0
|
@@ -0,0 +1,1175 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'strscan'
|
|
4
|
+
require 'set'
|
|
5
|
+
|
|
6
|
+
# Root module
|
|
7
|
+
module JsonMend
|
|
8
|
+
# The core parser that does the heavy lifting of fixing the JSON
|
|
9
|
+
class Parser
|
|
10
|
+
# Characters that may begin a comment.
COMMENT_DELIMETERS = %w[# /].freeze
# Every character accepted while scanning a number-like token (digits plus lenient extras).
NUMBER_CHARS = '0123456789-.eE/,_'.each_char.to_set.freeze
# Straight and curly quote characters recognised as string delimiters.
STRING_DELIMITERS = ['"', "'", '“', '”'].freeze
# Escape letter => the control character it stands for.
ESCAPE_MAPPING = %w[t n r b f].zip(["\t", "\n", "\r", "\b", "\f"]).to_h.freeze
# Sentinel returned by the value parser when nothing could be parsed.
JSON_STOP_TOKEN = :json_mend_stop_token

# Frozen lookup lists, hoisted out of loops (RuboCop Performance/CollectionLiteralInLoop).
TERMINATORS_ARRAY = %w(] }).freeze
TERMINATORS_OBJECT_KEY = %w(: }).freeze
TERMINATORS_OBJECT_VALUE = %w(, }).freeze
TERMINATORS_ARRAY_ITEM = %w(, ]).freeze
TERMINATORS_STRING_GUESSED = %w({ } [ ] : ,).freeze
TERMINATORS_VALUE = %w(, ] }).freeze
STRING_OR_OBJECT_START = [*STRING_DELIMITERS, '{', '['].freeze
# Pseudo-keys signalling "nothing to store for this pair, keep looping".
SKIPPED_KEYS = [:merged_array, :stray_colon].freeze
BOOLEAN_OR_NULL_CHARS = ['t', 'f', 'n'].freeze
ESCAPE_START_CHARS = ['t', 'n', 'r', 'b', '\\'].freeze
HEX_ESCAPE_PREFIXES = ['u', 'x'].freeze
INVALID_NUMBER_TRAILERS = %w[- e E ,].freeze

# Regexes are built once at load time from NUMBER_CHARS.
NUMBER_REGEX = /[#{Regexp.escape(NUMBER_CHARS.to_a.join)}]+/
# Same character class as NUMBER_REGEX, minus the comma.
NUMBER_NO_COMMA_REGEX = /[#{Regexp.escape((NUMBER_CHARS.to_a - [',']).join)}]+/
|
|
39
|
+
|
|
40
|
+
# Prepare a parser for one input document.
#
# @param json_string [String] the (possibly malformed) JSON text to scan
def initialize(json_string)
  # Stack of the contexts currently being parsed (:array, :object_key, :object_value).
  @context = []
  @scanner = StringScanner.new(json_string)
end
|
|
44
|
+
|
|
45
|
+
# Kicks off the parsing process. This is a direct port of the robust Python logic
|
|
46
|
+
# Parse the whole input and return the repaired Ruby value.
#
# When more than one top-level value is found they are collected into an Array;
# a Hash that directly follows another Hash replaces it instead of being appended.
#
# @return [Object, nil] the parsed value, an Array of values when several were
#   found, or nil when the first parse attempt produced nothing
def parse
  first = parse_json
  return nil if first == JSON_STOP_TOKEN
  return first if @scanner.eos?

  documents = [first]
  until @scanner.eos?
    candidate = parse_json
    break if candidate == JSON_STOP_TOKEN # nothing left but EOS or a stray terminator

    if candidate == ''
      @scanner.getch # step over the offending character and try again
      next
    end

    # Drop strings made up solely of closing brackets/braces (e.g. "}", " ] ").
    # \A/\z anchor the WHOLE string; ^/$ would match any single line of a
    # multi-line string and silently discard legitimate text.
    next if candidate.is_a?(String) && candidate.strip.match?(/\A[}\]]+\z/)

    documents.pop if both_hash?(documents.last, candidate)
    documents << candidate
  end

  documents.length == 1 ? documents.first : documents
end
|
|
76
|
+
|
|
77
|
+
private
|
|
78
|
+
|
|
79
|
+
# Scan forward until one JSON value can be parsed and return it.
#
# Characters that cannot start a value are skipped one at a time, except for a
# closing bracket/brace that terminates the current context, which is left in
# place for the caller.
#
# @return [Object, Symbol] the parsed value, or JSON_STOP_TOKEN when the input
#   ends or a terminator of the current context is reached first
def parse_json
  until @scanner.eos?
    char = peek_char

    if char == '{'
      @scanner.getch # consume '{'
      return parse_object
    elsif char == '['
      @scanner.getch # consume '['
      return parse_array
    elsif COMMENT_DELIMETERS.include?(char)
      # Comments are consumed in this loop rather than through recursion.
      parse_comment
    elsif string_start?(char)
      return parse_string unless @context.empty? && !STRING_DELIMITERS.include?(char)

      # Top level, no quote: only literals are accepted, other text is garbage.
      literal = parse_literal
      return literal unless literal == ''

      @scanner.getch
    elsif number_start?(char)
      number = parse_number
      return number unless number == ''

      @scanner.getch
    else
      # Do not swallow the character that closes the structure being parsed.
      closes_context =
        case char
        when ']' then current_context?(:array)
        when '}' then current_context?(:object_value) || current_context?(:object_key)
        else false
        end
      return JSON_STOP_TOKEN if closes_context

      @scanner.getch # unrecognised character: ignore it
    end
  end

  JSON_STOP_TOKEN
end
|
|
123
|
+
|
|
124
|
+
# Parses a JSON object.
|
|
125
|
+
# Parse the members of an object and return them as a Hash.
#
# The loop ends at end of input, at a '}' (which is consumed), at a ']' (which
# is left in place), or when parse_object_pair reports a nil key.
#
# @return [Hash] the key/value pairs collected so far
def parse_object
  result = {}

  loop do
    skip_whitespaces

    # Comments are consumed up front so they cannot hide a separator or be
    # read as part of the next key.
    if COMMENT_DELIMETERS.include?(peek_char)
      parse_comment
      next
    end

    break if @scanner.eos? || @scanner.scan('}') || peek_char == ']'

    # Tolerate stray commas/whitespace in front of a key.
    @scanner.skip(/[,\s]+/)

    key, value, colon_found = parse_object_pair(result)
    next if SKIPPED_KEYS.include?(key)

    # A nil key means "stop this object" (for example a duplicate key was seen).
    if key.nil?
      @scanner.scan('}')
      break
    end

    # A colon showing up only now means what was read as the value is really
    # the key, so re-read the value.
    skip_whitespaces
    if peek_char == ':' && !colon_found
      key = value.to_s
      @scanner.getch # consume ':'
      value = parse_object_value
    end

    result[key] = value
  end

  result
end
|
|
172
|
+
|
|
173
|
+
# Attempts to parse a single key-value pair.
|
|
174
|
+
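# Returns [key, value, colon_found] on success, [nil, nil, false] if parsing should stop, or a SKIPPED_KEYS marker ([:merged_array, nil, false] / [:stray_colon, nil, false]) when there is no pair to store.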
|
|
175
|
+
# Read one key/value pair of the object being built.
#
# @param object [Hash] the pairs parsed so far (used for array merging and
#   duplicate-key detection)
# @return [Array] one of:
#   [key, value, colon_found]    - a pair to store
#   [:merged_array, nil, false]  - a dangling array was merged, nothing to store
#   [:stray_colon, nil, false]   - a key-less ": value" was discarded
#   [nil, nil, false]            - the caller should stop parsing this object
def parse_object_pair(object)
  start_pos = @scanner.pos
  key, merged, bracketed = parse_object_key(object)
  return [:merged_array, nil, false] if merged

  if key.empty?
    upcoming = peek_char

    # Nothing consumed and a colon ahead: { "key": "value", : "garbage" }
    if upcoming == ':' && @scanner.pos == start_pos
      @scanner.getch # skip ':'
      parse_object_value # consume and discard the value
      return [:stray_colon, nil, false]
    end

    # Empty key right before the end of the object (or of the input).
    return [nil, nil, false] if upcoming.nil? || upcoming == '}'
  end

  # A repeated key is taken as the start of a new object: rewind to just before
  # the key and tell the caller to stop.
  if object.key?(key)
    @scanner.pos = start_pos
    return [nil, nil, false]
  end

  skip_whitespaces
  colon_found = @scanner.skip(/:/) # nil when the colon is missing

  value = parse_object_value(colon_found: colon_found || bracketed)
  if value == :inferred_true
    return [nil, nil, false] if %w[true false null].include?(key.downcase)

    value = true
  end

  [key, value, colon_found]
end
|
|
221
|
+
|
|
222
|
+
# Parses the key of an object, including the special logic for merging dangling arrays.
|
|
223
|
+
# Returns [key, was_array_merged, is_bracketed]
|
|
224
|
+
# Read the next object key.
#
# @param object [Hash] the pairs parsed so far, handed to the array-merge check
# @return [Array] [nil, true, false] when a dangling array was merged into the
#   previous value, otherwise [key, false, bracketed] where key is a String and
#   bracketed tells whether the key was written as `[key]`
def parse_object_key(object)
  return [nil, true, false] if try_to_merge_dangling_array(object)

  @context.push(:object_key)
  bracketed = peek_char == '['
  key =
    if bracketed
      @scanner.getch # consume '['
      parse_array.first.to_s
    else
      parse_string.to_s
    end
  @context.pop

  # With an empty key, step over one stray character so the caller's loop
  # always makes progress (':' and '}' are left for the caller).
  @scanner.getch if key.empty? && !(@scanner.eos? || @scanner.check(/[:}]/))

  [key, false, bracketed]
end
|
|
249
|
+
|
|
250
|
+
# Parses the value part of a key-value pair.
|
|
251
|
+
# Read the value belonging to the key that was just parsed.
#
# @param colon_found [Object] truthy when the key was followed by a colon
# @return [Object] the parsed value; '' when the value is missing after a colon
#   (`"key": }`), :inferred_true when it is missing and there was no colon, or
#   nil when the value parser found nothing (JSON_STOP_TOKEN)
def parse_object_value(colon_found: true)
  @context.push(:object_value)
  skip_whitespaces

  value_missing = @scanner.eos? || @scanner.check(/[,}]/)
  parsed = parse_json unless value_missing
  @context.pop

  if value_missing
    colon_found ? '' : :inferred_true
  elsif parsed == JSON_STOP_TOKEN
    nil
  else
    parsed
  end
end
|
|
269
|
+
|
|
270
|
+
# Encapsulates the logic for merging an array that appears without a key.
|
|
271
|
+
# Merge a key-less array that appears where a key is expected into the previous
# value, provided that value is itself an Array.
#
# @param object [Hash] the pairs parsed so far; its last value is extended in place
# @return [Boolean] true when an array was consumed and merged, false otherwise
def try_to_merge_dangling_array(object)
  return false if peek_char != '['

  last_key = object.keys.last
  target = last_key && object[last_key]
  return false unless target.is_a?(Array)

  @scanner.getch # consume '['
  extra = parse_array
  return false unless extra.is_a?(Array)

  # `[[1, 2]]` is unwrapped so its elements are merged, not the inner array.
  extra = extra.first if extra.length == 1 && extra.first.is_a?(Array)
  target.concat(extra)

  skip_whitespaces
  @scanner.skip(',')
  skip_whitespaces

  true
end
|
|
290
|
+
|
|
291
|
+
# Parses a JSON array from the string.
|
|
292
|
+
# Assumes the opening '[' has already been consumed by the caller.
|
|
293
|
+
# This is a lenient parser designed to handle malformed JSON.
|
|
294
|
+
# Parse array elements; the opening '[' must already have been consumed.
#
# Parsing stops at end of input or at a ']' / '}'. A closing ']' is consumed;
# if it is missing, a '}' in its place is consumed instead.
#
# @return [Array] the elements that could be parsed
def parse_array
  items = []
  @context.push(:array)
  char = peek_char

  until @scanner.eos? || TERMINATORS_ARRAY.include?(char)
    skip_whitespaces
    char = peek_char

    # Comments are handled here so they are neither parsed as values nor
    # skipped as garbage.
    if COMMENT_DELIMETERS.include?(char)
      parse_comment
      char = peek_char
      next
    end

    value =
      if STRING_DELIMITERS.include?(char)
        # A quoted string followed by ':' means the '{' of an object was
        # forgotten, so parse an object instead of a string.
        closing = skip_to_character(char, start_idx: 1)
        after_string = skip_whitespaces_at(start_idx: closing + 1)
        peek_char(after_string) == ':' ? parse_object : parse_string
      else
        parse_json
      end

    unless value == JSON_STOP_TOKEN # stop token: only garbage was skipped
      if strictly_empty?(value)
        # Step over one character unless parse_json stopped at a terminator.
        @scanner.getch unless value.nil? && TERMINATORS_ARRAY.include?(peek_char)
      elsif !(value == '...' && @scanner.string.getbyte(@scanner.pos - 1) == 46)
        # An ellipsis whose last consumed byte is '.' (46) is dropped.
        items << value
      end
    end

    # Skip separators (commas and whitespace) between elements.
    char = peek_char
    while char && (char == ',' || char.match?(/\s/))
      @scanner.getch
      char = peek_char
    end
  end

  # Tolerate a missing ']' by accepting '}' as the closer.
  @scanner.scan(']') || @scanner.scan('}')
  @context.pop

  items
end
|
|
350
|
+
|
|
351
|
+
# Parses a JSON string. This is a very lenient parser designed to handle
|
|
352
|
+
# many common errors found in LLM-generated JSON, such as missing quotes,
|
|
353
|
+
# incorrect escape sequences, and ambiguous string terminators
|
|
354
|
+
# Parse a quoted or unquoted string starting at the scanner position.
#
# @return [Object] '' when no string could be found or the guess was rejected,
#   a literal value when determine_delimiters recognised one, otherwise the
#   result of finalize_parsed_string
def parse_string
  char = prepare_string_parsing

  # Skip anything that cannot open a string: neither a quote nor a letter/digit.
  until @scanner.eos? || STRING_DELIMITERS.include?(char) || char.match?(/[\p{L}0-9]/)
    return '' if TERMINATORS_STRING_GUESSED.include?(char)

    @scanner.getch
    char = peek_char
  end
  return '' if @scanner.eos?

  resolved, *details = determine_delimiters(char: char)
  return details.first if resolved

  opening, closing, unquoted = details
  @scanner.getch unless unquoted # consume the opening quote

  resolved, *details = handle_doubled_quotes(lstring_delimiter: opening, rstring_delimiter: closing)
  return details.first if resolved

  # Collect characters up to the closing quote, the end of input, or a
  # context-specific terminator when quotes are missing.
  parts, char = check_unmatched_delimiters(
    string_parts: [],
    lstring_delimiter: opening,
    rstring_delimiter: closing,
    missing_quotes: unquoted,
    doubled_quotes: details.first
  )

  # An unquoted key that stopped on whitespace must be followed by ':' or ','.
  if !@scanner.eos? && unquoted && current_context?(:object_key) && char.match?(/\s/)
    skip_whitespaces
    return '' unless [':', ','].include?(peek_char)
  end

  finalize_parsed_string(string_parts: parts, char: char, rstring_delimiter: closing, missing_quotes: unquoted)
end
|
|
411
|
+
|
|
412
|
+
# string helper methods
|
|
413
|
+
|
|
414
|
+
# Consume any comments in front of a string.
#
# @return [String, nil] the first character after the comments, as reported by
#   peek_char
def prepare_string_parsing
  loop do
    upcoming = peek_char
    return upcoming unless COMMENT_DELIMETERS.include?(upcoming)

    parse_comment
  end
end
|
|
425
|
+
|
|
426
|
+
# Work out which quotes delimit the string that starts with +char+.
#
# @param char [String, nil] the first character of the candidate string
# @return [Array] [true, literal] when an unquoted literal was parsed instead of
#   a string, otherwise [false, left_delimiter, right_delimiter, missing_quotes]
def determine_delimiters(char:)
  if char == "'"
    [false, "'", "'", false]
  elsif char == '“'
    [false, '“', '”', false]
  elsif /[\p{L}0-9]/.match?(char)
    # An unquoted token may be a boolean/null literal, but never as an object key.
    if BOOLEAN_OR_NULL_CHARS.include?(char.downcase) && !current_context?(:object_key)
      literal = parse_literal
      return [true, literal] unless literal == ''
    end

    # A letter or digit where a quote was expected: the string is unquoted.
    [false, '"', '"', true]
  else
    [false, '"', '"', false]
  end
end
|
|
450
|
+
|
|
451
|
+
# Handle a second opening quote directly after the first one (`""text""` or an
# empty string `""`). Called right after the opening quote has been consumed.
#
# @param lstring_delimiter [String] the opening quote in use
# @param rstring_delimiter [String] the matching closing quote
# @return [Array] [true, ''] when the string turns out to be empty and parsing
#   is finished, otherwise [false, doubled_quotes] where doubled_quotes tells
#   whether the string is wrapped in doubled quotes
def handle_doubled_quotes(
  lstring_delimiter:,
  rstring_delimiter:
)
  current = peek_char
  return [false, false] unless STRING_DELIMITERS.include?(current) && current == lstring_delimiter

  following = peek_char(1)

  # `"":` as a key or `"",` / `""}` as a value: an empty string.
  if (current_context?(:object_key) && following == ':') ||
     (current_context?(:object_value) && TERMINATORS_OBJECT_VALUE.include?(following))
    @scanner.getch
    return [true, '']
  end

  # Doubled quotes followed by yet another quote: give up on this string.
  return [true, ''] if following == lstring_delimiter

  closer_at = skip_to_character(rstring_delimiter, start_idx: 1)
  if peek_char(closer_at) && peek_char(closer_at + 1) == rstring_delimiter
    # The closing side is doubled too, so the doubling is real.
    @scanner.getch
    return [false, true]
  end

  # Not doubled quotes: decide whether this is simply an empty string.
  after_blank = peek_char(skip_whitespaces_at(start_idx: 1))
  if STRING_OR_OBJECT_START.include?(after_blank)
    @scanner.getch
    return [true, '']
  end

  @scanner.getch unless TERMINATORS_VALUE.include?(after_blank)
  [false, false]
end
|
|
494
|
+
|
|
495
|
+
# Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
|
|
496
|
+
# In that case we need to use the ":|,|}" characters as terminators of the string
|
|
497
|
+
# So this will stop if:
|
|
498
|
+
# * It finds a closing quote
|
|
499
|
+
# * It iterated over the entire sequence
|
|
500
|
+
# * If we are fixing missing quotes in an object, when it finds the special terminators
|
|
501
|
+
# Main character loop of string parsing: collect characters until the closing
# delimiter, the end of input, or a context-dependent terminator.
#
# @param string_parts [Array<String>] accumulator for the characters read
# @param lstring_delimiter [String] opening quote
# @param rstring_delimiter [String] closing quote
# @param missing_quotes [Boolean] true when the string started without a quote
# @param doubled_quotes [Boolean] true when the string is wrapped in doubled quotes
# @return [Array] [string_parts, char] where char is the character the loop
#   stopped on (nil at end of input)
def check_unmatched_delimiters(
  string_parts:,
  lstring_delimiter:,
  rstring_delimiter:,
  missing_quotes:,
  doubled_quotes:
)
  char = peek_char
  pending_unmatched = false

  until @scanner.eos? || char == rstring_delimiter
    break if context_termination_reached?(char: char, missing_quotes: missing_quotes)

    # A ',' or '}' inside an object value may mean the closing quote is missing.
    if current_context?(:object_value) && TERMINATORS_OBJECT_VALUE.include?(char) &&
       (string_parts.empty? || string_parts.last != rstring_delimiter) &&
       check_rstring_delimiter_missing(
         string_parts: string_parts,
         lstring_delimiter: lstring_delimiter,
         rstring_delimiter: rstring_delimiter,
         missing_quotes: missing_quotes
       )
      break
    end

    # A ']' inside an array ends the string when no closing quote follows at all.
    if char == ']' && context_contain?(:array) && string_parts.last != rstring_delimiter
      break unless peek_char(skip_to_character(rstring_delimiter))
    end

    # A '}' that is the last non-blank character really closes the object.
    if current_context?(:object_value) && char == '}'
      break unless peek_char(skip_whitespaces_at(start_idx: 1))
    end

    string_parts << char
    @scanner.getch # consume the character
    char = peek_char

    if !@scanner.eos? && string_parts.last == '\\'
      restart, string_parts, char = parse_escape_sequence(
        string_parts: string_parts,
        char: char,
        rstring_delimiter: rstring_delimiter
      )
      next if restart
    end

    # A colon inside a quoted key could mean the right quote is missing.
    if char == ':' && !missing_quotes && current_context?(:object_key) &&
       handle_missing_quotes_termination(lstring_delimiter: lstring_delimiter, rstring_delimiter: rstring_delimiter)
      break
    end

    next unless char == rstring_delimiter && string_parts.last != '\\'
    # When this returns true the helper has already consumed a quote.
    next if check_doubled_quotes(doubled_quotes, rstring_delimiter)

    if check_missing_quotes_in_object_value(missing_quotes, lstring_delimiter, rstring_delimiter)
      char = peek_char
      break
    end

    if pending_unmatched
      # This quote pairs with an earlier stray quote: keep it as content.
      pending_unmatched = false
    else
      consume, mark_unmatched = determine_complex_delimiter_action(lstring_delimiter, rstring_delimiter)
      next unless consume

      pending_unmatched = true if mark_unmatched
    end

    string_parts << char.to_s
    @scanner.getch
    char = peek_char
  end

  [string_parts, char]
end
|
|
594
|
+
|
|
595
|
+
# When the string is wrapped in doubled quotes and the character after the
# current one is the closing delimiter, consume one character.
#
# @param doubled_quotes [Boolean] whether doubled quotes were detected
# @param rstring_delimiter [String] the closing quote
# @return [Boolean] true when a character was consumed
def check_doubled_quotes(doubled_quotes, rstring_delimiter)
  return false unless doubled_quotes && peek_char(1) == rstring_delimiter

  @scanner.getch
  true
end
|
|
602
|
+
|
|
603
|
+
# While reading an UNQUOTED object value, decide whether the quote at the
# current position actually opens the next key: true when a later quote is
# followed (after optional whitespace) by a ':'.
#
# On success the scanner is moved back by one character, so the character in
# front of the quote becomes the current one.
#
# @param missing_quotes [Boolean] whether the value started without a quote
# @param lstring_delimiter [String] opening quote
# @param rstring_delimiter [String] closing quote
# @return [Boolean] true when the caller should stop reading the value
def check_missing_quotes_in_object_value(missing_quotes, lstring_delimiter, rstring_delimiter)
  return false unless missing_quotes && current_context?(:object_value)

  # Look ahead for the next quote of either kind.
  quotes = [rstring_delimiter, lstring_delimiter]
  offset = 1
  ahead = peek_char(offset)
  until ahead.nil? || quotes.include?(ahead)
    offset += 1
    ahead = peek_char(offset)
  end
  return false if ahead.nil?

  # That quote must be followed by a colon for it to close a key.
  after_quote = skip_whitespaces_at(start_idx: offset + 1)
  return false unless peek_char(after_quote) == ':'

  # StringScanner#pos counts BYTES. Rewinding by a fixed 1 would land inside a
  # multibyte character (e.g. "é"), so step back by the byte size of the
  # previous character instead. Does nothing when already at position 0.
  previous_char = @scanner.string.byteslice(0, @scanner.pos)[-1]
  @scanner.pos -= previous_char.bytesize if previous_char
  true
end
|
|
628
|
+
|
|
629
|
+
# Decide what to do with a closing-quote character found inside a string when
# none of the simpler rules applied: look ahead to judge whether it really ends
# the string or is a stray quote that belongs to the content.
#
# @param lstring_delimiter [String] opening quote
# @param rstring_delimiter [String] closing quote
# @return [Array<Boolean>] [should_consume, set_unmatched]: whether the caller
#   should keep the quote as content, and whether it should remember that an
#   unmatched quote is now pending
def determine_complex_delimiter_action(lstring_delimiter, rstring_delimiter)
  quotes = [rstring_delimiter, lstring_delimiter]
  offset = 1
  ahead = peek_char(offset)
  # Workaround kept from the original: in object_value context a comma only
  # stops the scan until the first letter has been seen.
  comma_can_stop = true

  # Scan ahead to the next quote, stopping early at a context terminator.
  until ahead.nil? || quotes.include?(ahead)
    comma_can_stop &&= !ahead.match?(/\p{L}/)

    break if (context_contain?(:object_key) && TERMINATORS_OBJECT_KEY.include?(ahead)) ||
             (context_contain?(:object_value) && TERMINATORS_OBJECT_KEY.include?(ahead)) ||
             (context_contain?(:array) && TERMINATORS_ARRAY_ITEM.include?(ahead)) ||
             (comma_can_stop && current_context?(:object_value) && ahead == ',')

    offset += 1
    ahead = peek_char(offset)
  end

  # Stopped on a comma inside an object value: see whether the next closing
  # quote is followed by ',' or '}'.
  if ahead == ',' && current_context?(:object_value)
    offset = skip_to_character(rstring_delimiter, start_idx: offset + 1)
    offset = skip_whitespaces_at(start_idx: offset + 1)
    return TERMINATORS_OBJECT_VALUE.include?(peek_char(offset)) ? [true, false] : [false, false]
  end

  return [false, false] unless ahead == rstring_delimiter && peek_char(offset - 1) != '\\'

  # NOTE(review): the inclusive range also tests the character at `offset`,
  # which is the closing delimiter itself, so this guard looks like it can
  # never fire - confirm whether an exclusive range (1...offset) was intended.
  return [false, false] if (1..offset).all? { |k| peek_char(k).to_s.match?(/\s/) }

  if current_context?(:object_value)
    check_unmatched_in_object_value(
      index: offset,
      lstring_delimiter: lstring_delimiter,
      rstring_delimiter: rstring_delimiter
    )
  elsif current_context?(:array)
    check_unmatched_in_array(rstring_delimiter: rstring_delimiter)
  elsif current_context?(:object_key)
    [true, false]
  else
    [false, false]
  end
end
|
|
680
|
+
|
|
681
|
+
# Object-value branch of the stray-quote heuristic: starting from the lookahead
# offset of the next quote, decide whether the quote at the scanner position is
# a misplaced one.
#
# @param index [Integer] lookahead offset of the next closing quote
# @param lstring_delimiter [String] opening quote
# @param rstring_delimiter [String] closing quote
# @return [Array<Boolean>] [should_consume, set_unmatched]
def check_unmatched_in_object_value(index:, lstring_delimiter:, rstring_delimiter:)
  cursor = skip_whitespaces_at(start_idx: index + 1)

  if peek_char(cursor) == ','
    # A comma follows, which may be a case like "va"lue", - check whether a
    # quoted key and its colon come next.
    cursor = skip_to_character(lstring_delimiter, start_idx: cursor + 1)
    cursor = skip_to_character(rstring_delimiter, start_idx: cursor + 2)
    cursor = skip_whitespaces_at(start_idx: cursor + 1)
    return [true, false] if peek_char(cursor) == ':'
  end

  # Otherwise find the following closing quote and look for a ':' after it,
  # giving up at a value terminator or at another unescaped closing quote.
  cursor = skip_to_character(rstring_delimiter, start_idx: cursor + 1) + 1
  ahead = peek_char(cursor)
  while ahead && ahead != ':'
    break if TERMINATORS_VALUE.include?(ahead)
    break if ahead == rstring_delimiter && peek_char(cursor - 1) != '\\'

    cursor += 1
    ahead = peek_char(cursor)
  end

  # Only when no ':' is found do we know the quote was misplaced.
  ahead == ':' ? [false, false] : [true, true]
end
|
|
716
|
+
|
|
717
|
+
# Array branch of the stray-quote heuristic: decide whether the quote at the
# scanner position closes the current element or belongs to its content.
#
# @param rstring_delimiter [String] closing quote
# @return [Array<Boolean>] [false, false] when the quote is treated as the
#   closing quote, [true, true] when it should be kept as content and marked
#   as unmatched
def check_unmatched_in_array(rstring_delimiter:)
  # 1. Find the offset of the next unescaped delimiter.
  offset = 1
  found = false
  while (ahead = peek_char(offset))
    if ahead == rstring_delimiter
      # Count the backslashes directly in front of it; an even count means
      # the quote itself is not escaped.
      back = 1
      back += 1 while offset - back >= 0 && peek_char(offset - back) == '\\'
      if (back - 1).even?
        found = true
        break
      end
    end
    offset += 1
  end

  # 2a. Only whitespace between the two quotes: ["a" "b"]
  gap_blank = (1...offset).all? { |k| peek_char(k).match?(/\s/) }

  # 2b. The next quote is followed by a separator: ["val1" val2",]
  closes_item = false
  if found
    after = offset + 1
    after += 1 while peek_char(after)&.match?(/\s/)
    closes_item = TERMINATORS_VALUE.include?(peek_char(after))
  end

  gap_blank || closes_item ? [false, false] : [true, true]
end
|
|
755
|
+
|
|
756
|
+
# Heuristic lookahead that decides whether the closing string delimiter is
# really missing at the current position.
#
# Leading whitespace at the scanner position is consumed first (this is the
# only scanner mutation); everything else is a non-destructive lookahead
# built on `peek_char`, `skip_to_character` and `skip_whitespaces_at`.
#
# @param string_parts [Array<String>] characters collected for the string so far
# @param lstring_delimiter [String] opening delimiter of the string
# @param rstring_delimiter [String] expected closing delimiter
# @param missing_quotes [Boolean] whether the string started without a quote
# @return [Boolean] true when the closing delimiter is considered missing
def check_rstring_delimiter_missing(
  string_parts:,
  lstring_delimiter:,
  rstring_delimiter:,
  missing_quotes:
)
  skip_whitespaces
  # A backslash right after the current char means this is a quoted string.
  missing = peek_char(1) != '\\'

  closer_idx = skip_to_character(rstring_delimiter, start_idx: 1)

  if peek_char(closer_idx).nil?
    # No closing delimiter anywhere ahead. The following key/value may be
    # missing its delimiters too, so a ':' further on settles it.
    return true if peek_char(skip_to_character(':', start_idx: 1))

    # Otherwise only tolerate the case where the string sits right before
    # the closing brace of the object.
    first_idx = skip_whitespaces_at(start_idx: 1)
    brace_idx = skip_to_character('}', start_idx: first_idx)
    if brace_idx - first_idx > 1
      missing = false
    elsif peek_char(brace_idx) && string_parts.include?('{')
      # An opening brace collected earlier: the brace is part of the string.
      missing = false
    end
    return missing
  end

  # A closing delimiter exists. With missing quotes it only counts when
  # nothing but whitespace precedes it.
  gap_clean = !missing_quotes || (1...closer_idx).all? { |k| peek_char(k)&.match?(/\s/) }
  return false unless gap_clean

  # The delimiter must be followed by a comma, a brace, or the end of input.
  idx = skip_whitespaces_at(start_idx: closer_idx + 1)
  follower = peek_char(idx)
  return false if follower.nil? || TERMINATORS_OBJECT_VALUE.include?(follower)

  # Could be trailing garbage: look for a new opening delimiter afterwards.
  idx = skip_to_character(lstring_delimiter, start_idx: idx)
  return false if peek_char(idx).nil?

  # Something like "lorem, "ipsum" sic" is still one string unless the
  # character after that delimiter (skipping spaces) is a ':'.
  idx = skip_whitespaces_at(start_idx: idx + 1)
  after_opener = peek_char(idx)
  return false if after_opener && after_opener != ':'

  missing
end
|
|
836
|
+
|
|
837
|
+
# Resolves an escape sequence while a string is being collected.
#
# Only acts when the last collected part is a backslash and the scanner is
# not at the end of input. Handles three families:
#   * `rstring_delimiter` or a character from ESCAPE_START_CHARS: the
#     backslash is replaced by the mapped character (ESCAPE_MAPPING, or the
#     character itself);
#   * a prefix from HEX_ESCAPE_PREFIXES followed by hex digits (4 for 'u',
#     2 otherwise): the backslash is replaced by the decoded character. A
#     UTF-16 surrogate pair (`\uD83D\uDE00`) is combined into one character;
#     a lone surrogate is treated like any other invalid escape, because
#     `Integer#chr('UTF-8')` raises RangeError for surrogate code points;
#   * a string delimiter other than `rstring_delimiter`: the backslash is
#     replaced by that delimiter.
#
# @param string_parts [Array<String>] collected parts (mutated in place)
# @param char [String, nil] character at the current scanner position
# @param rstring_delimiter [String] closing delimiter of the current string
# @return [Array(Boolean, Array<String>, String)] whether an escape was
#   consumed, the parts array, and the character now at the scanner position
def parse_escape_sequence(
  string_parts:,
  char:,
  rstring_delimiter:
)
  return [false, string_parts, char] if @scanner.eos? || string_parts.last != '\\'

  # This is a special case, if people use real strings this might happen
  if char == rstring_delimiter || ESCAPE_START_CHARS.include?(char)
    string_parts.pop
    string_parts << ESCAPE_MAPPING.fetch(char, char)

    @scanner.getch # Consume the character
    char = peek_char
    # Without this the loop would close early or build a train of backslashes.
    while !@scanner.eos? && string_parts.last == '\\' && (char == rstring_delimiter || char == '\\')
      string_parts.pop
      string_parts << char
      @scanner.getch # Consume the character
      char = peek_char
    end
    return [true, string_parts, char]
  elsif HEX_ESCAPE_PREFIXES.include?(char)
    entry_pos = @scanner.pos
    @scanner.getch # consume 'u' or 'x'

    decoded = decode_hex_escape(char)
    if decoded
      string_parts.pop
      string_parts << decoded
      # Scanner is already advanced past the digits
      return [true, string_parts, peek_char]
    end

    # Not a valid escape sequence; backtrack so the main loop can treat 'u'/'x' as literal.
    @scanner.pos = entry_pos
  elsif STRING_DELIMITERS.include?(char) && char != rstring_delimiter
    string_parts.pop
    string_parts << char
    @scanner.getch # Consume the character
    return [true, string_parts, peek_char]
  end

  [false, string_parts, char]
end

# Reads the hex digits following an already consumed 'u'/'x' prefix and
# returns the decoded UTF-8 character, or nil when the digits are missing,
# not hexadecimal, or form an unpaired surrogate. On nil the scanner may
# have advanced; the caller restores the position.
def decode_hex_escape(prefix)
  digits = @scanner.scan(prefix == 'u' ? /\h{4}/ : /\h{2}/)
  return nil unless digits

  codepoint = digits.to_i(16)
  if (0xD800..0xDBFF).cover?(codepoint)
    # High surrogate: only meaningful together with a following low surrogate.
    low = @scanner.scan(/\\u[dD][c-fC-F]\h{2}/)
    return nil unless low

    codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low[2, 4].to_i(16) - 0xDC00)
  elsif (0xDC00..0xDFFF).cover?(codepoint)
    return nil # low surrogate without a preceding high surrogate
  end

  codepoint.chr('UTF-8')
end
|
|
897
|
+
|
|
898
|
+
# Lookahead used for strings that started without a quote: checks what the
# quotes further ahead look like. Nothing is consumed from the scanner.
#
# @param lstring_delimiter [String] opening delimiter to search for first
# @param rstring_delimiter [String] closing delimiter to search for next
# @return [Boolean] true when no `lstring_delimiter` exists ahead, or when a
#   delimiter pair exists and is followed (after optional whitespace) by one
#   of TERMINATORS_OBJECT_VALUE; false otherwise
def handle_missing_quotes_termination(
  lstring_delimiter:,
  rstring_delimiter:
)
  opener_idx = skip_to_character(lstring_delimiter, start_idx: 1)
  return true if peek_char(opener_idx).nil?

  # Found the first delimiter, now look for the second one after it.
  closer_idx = skip_to_character(rstring_delimiter, start_idx: opener_idx + 1)
  return false if peek_char(closer_idx).nil?

  # Found the second delimiter; skip spaces and inspect what follows.
  follower = peek_char(skip_whitespaces_at(start_idx: closer_idx + 1))
  !follower.nil? && TERMINATORS_OBJECT_VALUE.include?(follower)
end
|
|
921
|
+
|
|
922
|
+
# Produces the final string value once collection of its parts has stopped.
#
# When collection stopped on the closing delimiter, that delimiter is
# consumed from the scanner. Otherwise, for an unquoted object key, a
# trailing ',' part is dropped. The joined result is right-stripped when
# quotes were missing or when it ends with a newline.
#
# @param string_parts [Array<String>] collected parts (may be mutated)
# @param char [String, nil] character at which collection stopped
# @param rstring_delimiter [String] closing delimiter of the string
# @param missing_quotes [Boolean] whether the string started without a quote
# @return [String] the finished string
def finalize_parsed_string(
  string_parts:,
  char:,
  rstring_delimiter:,
  missing_quotes:
)
  if char == rstring_delimiter
    # Advance only when there really was a closing quote.
    @scanner.getch
  else
    dangling_comma = missing_quotes && current_context?(:object_key) && string_parts.last == ','
    string_parts.pop if dangling_comma
  end

  joined = string_parts.join
  return joined.rstrip if missing_quotes || joined.end_with?("\n")

  joined
end
|
|
941
|
+
|
|
942
|
+
# Tells whether `char` ends an unquoted string in the current context.
# Always false for quoted strings.
#
# * object key: a ':', any whitespace, or one of TERMINATORS_ARRAY
# * array:      one of TERMINATORS_ARRAY_ITEM
#
# @param char [String] character under inspection
# @param missing_quotes [Boolean] whether the string started without a quote
# @return [Boolean]
def context_termination_reached?(
  char:,
  missing_quotes:
)
  return false unless missing_quotes

  if current_context?(:object_key)
    char == ':' || char.match?(/\s/) || TERMINATORS_ARRAY.include?(char)
  elsif current_context?(:array)
    TERMINATORS_ARRAY_ITEM.include?(char)
  else
    false
  end
end
|
|
953
|
+
|
|
954
|
+
# Parses a number at the current scanner position.
#
# The parser is lenient: it scans with NUMBER_NO_COMMA_REGEX inside an array
# and NUMBER_REGEX elsewhere, drops a trailing character listed in
# INVALID_NUMBER_TRAILERS, and hands over to `parse_string` when the scanned
# text is immediately followed by a letter.
#
# @return [Integer, Float, String, nil] the converted number; the scanned
#   text itself when it cannot be converted; nil when nothing was scanned
#   (or whatever `parse_string` returns on the fallback path)
def parse_number
  pattern = current_context?(:array) ? NUMBER_NO_COMMA_REGEX : NUMBER_REGEX
  token = @scanner.scan(pattern)
  return nil if token.nil?

  if !token.empty? && INVALID_NUMBER_TRAILERS.include?(token[-1])
    # The trailing char is garbage: drop it without rewinding the scanner.
    token = token[0...-1]
  elsif peek_char&.match?(/\p{L}/)
    # What looked like a number is actually text: rewind and parse a string.
    @scanner.pos -= token.bytesize
    return parse_string
  end

  # Sometimes numbers are followed by a quote, which is garbage
  @scanner.getch if peek_char == '"'

  converted =
    if token.end_with?('.')
      # "1." is not accepted by Float() on Ruby < 3.4; strip the dot and
      # still convert with Float so the result is 1.0 rather than 1.
      Float(token[0...-1], exception: false)
    elsif token.include?(',')
      Float(token.tr(',', '.'), exception: false)
    elsif token.match?(/[.eE]/)
      Float(token, exception: false)
    else
      Integer(token, 10, exception: false)
    end

  # Conversion failures fall back to the scanned text.
  converted.nil? ? token : converted
end
|
|
999
|
+
|
|
1000
|
+
# Parses one of the JSON literals `true`, `false` or `null` at the current
# scanner position, ignoring case. The matched literal is consumed.
#
# @return [true, false, nil, String] the literal's Ruby value, or an empty
#   string when none of the literals matched (nothing is consumed then)
def parse_literal
  return true if @scanner.scan(/true/i)
  return false if @scanner.scan(/false/i)
  return nil if @scanner.scan(/null/i)

  # Empty string signals "not a boolean or null value" to the caller.
  ''
end
|
|
1015
|
+
|
|
1016
|
+
# Parses and skips over code-style comments.
# - # line comment
# - // line comment
# - /* block comment */
# Trailing whitespace after the comment is skipped as well, then the method
# returns, allowing the caller to loop.
#
# An unterminated comment of either kind consumes the rest of the input.
#
# @return [Integer, nil] the result of `skip_whitespaces`
def parse_comment
  if @scanner.scan(%r{/\*})
    # Block comment: stop right after the first `*/`, or at end of input
    # when the comment is never closed.
    @scanner.scan_until(%r{\*/}) || @scanner.terminate
  elsif @scanner.scan(%r{//|#})
    # Line comment: besides a line break, a structural character of any
    # enclosing context also ends it.
    termination_chars = ["\n", "\r"]
    termination_chars << ']' if context_contain?(:array)
    termination_chars << '}' if context_contain?(:object_value)
    termination_chars << ':' if context_contain?(:object_key)

    # The terminators are positive lookaheads, so they aren't consumed by the scan.
    terminator_regex = Regexp.new("(?=#{termination_chars.map { |c| Regexp.escape(c) }.join('|')})")

    # `scan_until` returns nil without advancing when no terminator exists
    # (comment runs to end of input); consume the remainder in that case so
    # the comment text is not parsed as data.
    @scanner.scan_until(terminator_regex) || @scanner.terminate
  else
    # The character at the current position (likely '/') is not the start of a
    # valid comment. To prevent an infinite loop in the calling parser, we must
    # consume this single stray character before exiting
    @scanner.getch
  end

  skip_whitespaces
end
|
|
1051
|
+
|
|
1052
|
+
# Non-destructive lookahead: finds the first unescaped occurrence of a
# character ahead of the scanner and returns its offset, in characters,
# from the current scanner position.
#
# An occurrence preceded by an odd number of backslashes counts as escaped
# and is skipped. The scanner position is always restored.
#
# @param characters [String, Array<String>, Regexp] what to look for
# @param start_idx [Integer] number of characters to skip before searching
# @return [Integer] offset of the match, or the offset of the end of input
#   when there is no match (so `peek_char` at that offset returns nil)
def skip_to_character(characters, start_idx: 0)
  # Always search with a Regexp: older strscan releases accept only a Regexp
  # in `scan_until` and raise TypeError for a String. `Regexp.union` escapes
  # plain strings and passes an existing Regexp through unchanged.
  pattern = Regexp.union(characters)

  saved_pos = @scanner.pos
  begin
    start_idx.times { @scanner.getch }
    offset = start_idx # characters consumed so far

    while (chunk = @scanner.scan_until(pattern))
      # `chunk` ends with the matched delimiter; count the backslashes
      # sitting directly in front of it.
      before_delimiter = chunk.length - @scanner.matched.length
      backslashes = chunk[0, before_delimiter][/\\*\z/].length
      return offset + before_delimiter if backslashes.even?

      # Escaped occurrence: keep searching after it.
      offset += chunk.length
    end

    # Not found: report the distance to the end of input.
    offset + @scanner.rest.length
  ensure
    @scanner.pos = saved_pos
  end
end
|
|
1100
|
+
|
|
1101
|
+
# Non-destructive lookahead: starting `start_idx` characters ahead of the
# scanner, finds the first non-whitespace character and returns its offset
# (in characters) from the current scanner position. When only whitespace
# remains, the offset of the end of input is returned. The scanner position
# is restored before returning.
#
# @param start_idx [Integer] number of characters to skip before looking
# @return [Integer]
def skip_whitespaces_at(start_idx: 0)
  origin = @scanner.pos
  start_idx.times { @scanner.getch }

  # `upto_token` is a run of whitespace followed by exactly one
  # non-whitespace character, hence the `- 1`.
  upto_token = @scanner.check_until(/\S/)
  blanks = upto_token ? upto_token.length - 1 : @scanner.rest.length

  @scanner.pos = origin
  blanks + start_idx
end
|
|
1123
|
+
|
|
1124
|
+
# @return [Boolean] true only when both arguments are Hash instances
def both_hash?(obj1, obj2)
  [obj1, obj2].all?(Hash)
end
|
|
1127
|
+
|
|
1128
|
+
# True only for an empty String, Array, Hash or Set. Any other value,
# including nil and numbers, is never considered empty.
#
# @param value [Object]
# @return [Boolean]
def strictly_empty?(value)
  container = value.is_a?(String) || value.is_a?(Array) || value.is_a?(Hash) || value.is_a?(Set)
  container ? value.empty? : false
end
|
|
1137
|
+
|
|
1138
|
+
# Advances the scanner past a run of whitespace at the current position.
#
# @return [Integer, nil] the value of `StringScanner#skip`: the length of
#   the skipped run, or nil when the current character is not whitespace
def skip_whitespaces
  whitespace_run = /\s+/
  @scanner.skip(whitespace_run)
end
|
|
1142
|
+
|
|
1143
|
+
# Returns the character `offset` characters ahead of the scanner without
# advancing it (offset 0 is the character at the current position).
#
# @param offset [Integer]
# @return [String, nil] the character, or nil when the offset lies beyond
#   the end of input (or is negative)
def peek_char(offset = 0)
  return @scanner.check(/./m) if offset.zero?

  origin = @scanner.pos
  remaining = offset + 1
  current = nil
  # Walk forward character by character so multibyte characters count as one.
  while remaining.positive?
    current = @scanner.getch
    break if current.nil?

    remaining -= 1
  end
  @scanner.pos = origin
  current
end
|
|
1156
|
+
|
|
1157
|
+
# @param value [Object] context marker to compare against
# @return [Boolean] whether the last entry of `@context` equals `value`
#   (false when `@context` is nil or empty and `value` is not nil)
def current_context?(value)
  innermost = @context&.last
  innermost == value
end
|
|
1160
|
+
|
|
1161
|
+
# @param value [Object] context marker to look for
# @return [Boolean] whether `value` appears anywhere in `@context`
def context_contain?(value)
  @context.any? { |entry| entry == value }
end
|
|
1164
|
+
|
|
1165
|
+
# Checks whether `char` can start a string or a literal: it is either one of
# STRING_DELIMITERS or a Unicode letter.
#
# @param char [String, nil]
# @return [Boolean, nil] nil when `char` is nil
def string_start?(char)
  return true if STRING_DELIMITERS.include?(char)

  char&.match?(/\p{L}/)
end
|
|
1169
|
+
|
|
1170
|
+
# Checks whether `char` can start a number: a digit, a minus sign or a dot.
#
# @param char [String, nil]
# @return [Boolean]
def number_start?(char)
  return true if char&.match?(/\d/)

  char == '-' || char == '.'
end
|
|
1174
|
+
end
|
|
1175
|
+
end
|