json_mend 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1175 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'strscan'
4
+ require 'set'
5
+
6
+ # Root module
7
+ module JsonMend
8
+ # The core parser that does the heavy lifting of fixing the JSON
9
+ class Parser
10
# Characters that make the parser hand control to parse_comment.
COMMENT_DELIMETERS = ['#', '/'].freeze
# Every character accepted while scanning a (possibly malformed) number.
NUMBER_CHARS = Set.new('0123456789-.eE/,_'.chars).freeze
# Straight and typographic quotes that may open or close a string.
STRING_DELIMITERS = ['"', "'", '“', '”'].freeze
# Single-letter escapes and the control character each one decodes to.
ESCAPE_MAPPING = {
  't' => "\t",
  'n' => "\n",
  'r' => "\r",
  'b' => "\b",
  'f' => "\f"
}.freeze
# Sentinel returned by parse_json when nothing could be parsed.
JSON_STOP_TOKEN = :json_mend_stop_token

# Optimized constants for performance (CollectionLiteralInLoop)
TERMINATORS_ARRAY = [']', '}'].freeze
TERMINATORS_OBJECT_KEY = [':', '}'].freeze
TERMINATORS_OBJECT_VALUE = [',', '}'].freeze
TERMINATORS_ARRAY_ITEM = [',', ']'].freeze
TERMINATORS_STRING_GUESSED = ['{', '}', '[', ']', ':', ','].freeze
TERMINATORS_VALUE = [',', ']', '}'].freeze
STRING_OR_OBJECT_START = (STRING_DELIMITERS + ['{', '[']).freeze
SKIPPED_KEYS = %i[merged_array stray_colon].freeze
BOOLEAN_OR_NULL_CHARS = %w[t f n].freeze
# Characters that may follow a backslash and be decoded as a simple escape.
# Derived from ESCAPE_MAPPING so that every mapped escape (including \f,
# which used to be missing here) is actually recognised, plus the backslash.
ESCAPE_START_CHARS = (ESCAPE_MAPPING.keys + ['\\']).freeze
HEX_ESCAPE_PREFIXES = %w[u x].freeze
INVALID_NUMBER_TRAILERS = ['-', 'e', 'E', ','].freeze

# Pre-compile regexes for performance
NUMBER_REGEX = /[#{Regexp.escape(NUMBER_CHARS.to_a.join)}]+/
# Same character class without the comma, which separates items inside arrays.
NUMBER_NO_COMMA_REGEX = /[#{Regexp.escape((NUMBER_CHARS - [',']).to_a.join)}]+/
39
+
40
# Prepares a parser for one piece of (possibly malformed) JSON text.
#
# @param json_string [String] raw input; it is handed to StringScanner as-is
def initialize(json_string)
  # Stack of parsing contexts (:array, :object_key, :object_value) that the
  # parse_* methods push to and pop from.
  @context = []
  @scanner = StringScanner.new(json_string)
end
44
+
45
# Kicks off the parsing process. This is a direct port of the robust Python logic.
#
# Parses the first value and then keeps parsing until the input is exhausted,
# collecting every additional value that is found.
#
# @return [Object, nil] nil when nothing could be parsed, the single parsed
#   value when only one was found, otherwise an Array of all parsed values
def parse
  json = parse_json

  # JSON_STOP_TOKEN from the very first parse means we found nothing
  # (empty string or garbage).
  return nil if json == JSON_STOP_TOKEN
  return json if @scanner.eos?

  results = [json]
  until @scanner.eos?
    new_json = parse_json
    # Found nothing but EOS or a garbage terminator.
    break if new_json == JSON_STOP_TOKEN

    if new_json == ''
      @scanner.getch # skip one character and keep looking
      next
    end

    # Ignore strings that consist solely of closing braces/brackets
    # (e.g. "}", " ] "). \A/\z anchor the whole string: with ^/$ a multi-line
    # string whose last line happened to be "}" was discarded entirely.
    next if new_json.is_a?(String) && new_json.strip.match?(/\A[}\]]+\z/)

    # A hash directly following another hash replaces it.
    results.pop if both_hash?(results.last, new_json)
    results << new_json
  end

  results.length == 1 ? results.first : results
end
76
+
77
+ private
78
+
79
# Dispatches on the next character to the matching value parser, skipping
# comments and characters that cannot start a value.
#
# @return [Object, Symbol] the parsed value, or JSON_STOP_TOKEN when the input
#   ended or a terminator of the current context was reached first
def parse_json
  until @scanner.eos?
    char = peek_char

    if char == '{'
      @scanner.getch # consume '{'
      return parse_object
    end

    if char == '['
      @scanner.getch # consume '['
      return parse_array
    end

    if COMMENT_DELIMETERS.include?(char)
      # Consume the comment and look at the next character (no recursion).
      parse_comment
      next
    end

    if string_start?(char)
      return parse_string if !@context.empty? || STRING_DELIMITERS.include?(char)

      # Top-level unquoted text: only literals (true/false/null) are accepted,
      # any other text is skipped one character at a time.
      literal = parse_literal
      return literal if literal != ''

      @scanner.getch
    elsif number_start?(char)
      number = parse_number
      return number unless number == ''

      @scanner.getch
    elsif (current_context?(:array) && char == ']') ||
          (current_context?(:object_value) && char == '}') ||
          (current_context?(:object_key) && char == '}')
      # Leave the terminator of the current context for the caller to consume.
      return JSON_STOP_TOKEN
    else
      @scanner.getch # not the start of anything: ignore this character
    end
  end

  JSON_STOP_TOKEN
end
123
+
124
# Parses the body of a JSON object; the caller has already consumed the '{'.
#
# @return [Hash] the key/value pairs collected until '}' is consumed, the
#   input ends, a ']' is seen, or parse_object_pair signals a stop
def parse_object
  result = {}

  loop do
    skip_whitespaces

    # Consume comments here so they neither hide separators (like commas)
    # nor end up inside the next key.
    if COMMENT_DELIMETERS.include?(peek_char)
      parse_comment
      next
    end

    # >> PRIMARY EXIT: end of input, end of object, or a stray ']'.
    break if @scanner.eos?
    break if @scanner.scan('}')
    break if peek_char == ']'

    # Leniently drop commas and whitespace in front of a key.
    @scanner.skip(/[,\s]+/)

    key, value, colon_found = parse_object_pair(result)
    next if SKIPPED_KEYS.include?(key)

    # A nil key means "stop parsing this object" (e.g. a duplicate key was
    # found, indicating the start of a new object).
    if key.nil?
      @scanner.scan('}')
      break
    end

    # If no colon was seen while parsing the pair but one follows now, what
    # was parsed as the value is really the key.
    skip_whitespaces
    if !colon_found && peek_char == ':'
      key = value.to_s
      @scanner.getch # consume ':'
      value = parse_object_value
    end

    result[key] = value
  end

  result
end
172
+
173
# Attempts to parse a single key-value pair of the object being built.
#
# @param object [Hash] the pairs parsed so far (used for duplicate detection
#   and dangling-array merging)
# @return [Array] a three-element array [key, value, colon_found]:
#   * [:merged_array, nil, false] / [:stray_colon, nil, false] when there is
#     no pair to store and the caller should simply continue
#   * [nil, nil, false] when parsing of the current object should stop
#   * [key, value, colon_found] otherwise
def parse_object_pair(object)
  # --- 1. Parse the key (this may instead merge a dangling array) ---
  start_pos = @scanner.pos
  key, merged, bracketed = parse_object_key(object)
  return [:merged_array, nil, false] if merged

  if key.empty?
    # Stray colon: no key and nothing consumed, yet a colon follows,
    # e.g. { "key": "value", : "garbage" }
    if @scanner.pos == start_pos && peek_char == ':'
      @scanner.getch # skip ':'
      parse_object_value # consume and discard the value
      return [:stray_colon, nil, false]
    end

    # Empty key right before the closing brace (or the end of input): done.
    upcoming = peek_char
    return [nil, nil, false] if upcoming.nil? || upcoming == '}'
  end

  # --- 2. Duplicate keys: a repair for lists of objects missing a comma ---
  # Rewind to just before the duplicate key and stop this object, so the
  # caller sees the key as the start of a new JSON object.
  if object.key?(key)
    @scanner.pos = start_pos
    return [nil, nil, false]
  end

  # --- 3. Separator: leniently skip the colon if it exists ---
  skip_whitespaces
  colon_found = @scanner.skip(/:/)

  # --- 4. Value ---
  value = parse_object_value(colon_found: colon_found || bracketed)
  if value == :inferred_true
    # A bare true/false/null is not a key with an implied value.
    return [nil, nil, false] if %w[true false null].include?(key.downcase)

    value = true
  end

  [key, value, colon_found]
end
221
+
222
# Parses the key of an object, after first trying to merge a dangling array
# into the previous value.
#
# @param object [Hash] the pairs parsed so far
# @return [Array] [key, was_array_merged, is_bracketed]; key is nil when an
#   array was merged, and is_bracketed is true when the key was written as
#   [key] (the first array element is used as the key)
def parse_object_key(object)
  # Signal that an array was merged instead of a key being parsed.
  return [nil, true, false] if try_to_merge_dangling_array(object)

  @context.push(:object_key)
  is_bracketed = peek_char == '['
  key =
    if is_bracketed
      @scanner.getch # consume '['
      parse_array.first.to_s
    else
      parse_string.to_s
    end
  @context.pop

  # With an empty key, consume one stray character so the caller cannot loop
  # forever on it (':' and '}' are left for the caller).
  @scanner.getch if key.empty? && !@scanner.check(/[:}]/) && !@scanner.eos?

  [key, false, is_bracketed]
end
249
+
250
# Parses the value part of a key-value pair inside the :object_value context.
#
# @param colon_found [Object] truthy when the key was followed by a colon
# @return [Object] '' for a missing value after a colon, :inferred_true for a
#   missing value without a colon, nil when parse_json found nothing,
#   otherwise the parsed value
def parse_object_value(colon_found: true)
  @context.push(:object_value)
  skip_whitespaces

  value =
    if @scanner.eos? || @scanner.check(/[,}]/)
      # The value is missing (e.g. "key": } or "key": ,).
      colon_found ? '' : :inferred_true
    else
      parsed = parse_json
      # Garbage followed by a terminator yields JSON_STOP_TOKEN; store nil.
      parsed == JSON_STOP_TOKEN ? nil : parsed
    end

  @context.pop
  value
end
269
+
270
+ # Encapsulates the logic for merging an array that appears without a key.
271
+ def try_to_merge_dangling_array(object)
272
+ return false unless peek_char == '['
273
+
274
+ prev_key = object.keys.last
275
+ return false unless prev_key && object[prev_key].is_a?(Array)
276
+
277
+ @scanner.getch # Consume '['
278
+ new_array = parse_array
279
+ return false unless new_array.is_a?(Array)
280
+
281
+ to_merge = new_array.length == 1 && new_array.first.is_a?(Array) ? new_array.first : new_array
282
+ object[prev_key].concat(to_merge)
283
+
284
+ skip_whitespaces
285
+ @scanner.skip(',')
286
+ skip_whitespaces
287
+
288
+ true
289
+ end
290
+
291
# Parses the items of a JSON array; the caller has already consumed the '['.
# This is a lenient parser designed to handle malformed JSON: it tolerates
# comments, missing commas, a missing ']' and a '}' used as the closer.
#
# @return [Array] the parsed items
def parse_array
  items = []
  @context.push(:array)
  current = peek_char

  # Stop at the closing bracket or at an invalid closer like '}'.
  until @scanner.eos? || TERMINATORS_ARRAY.include?(current)
    skip_whitespaces
    current = peek_char

    # Comments are consumed here rather than by parse_json.
    if COMMENT_DELIMETERS.include?(current)
      parse_comment
      current = peek_char
      next
    end

    element =
      if STRING_DELIMITERS.include?(current)
        # LLMs sometimes forget to open an object inside an array. If the
        # quoted text is followed by ':' parse an object, otherwise a string.
        closing_idx = skip_to_character(current, start_idx: 1)
        after_idx = skip_whitespaces_at(start_idx: closing_idx + 1)
        peek_char(after_idx) == ':' ? parse_object : parse_string
      else
        parse_json
      end

    # JSON_STOP_TOKEN means parse_json only skipped garbage: nothing to store.
    unless element == JSON_STOP_TOKEN
      if strictly_empty?(element)
        # Consume one character, unless parse_json stopped in front of a terminator.
        @scanner.getch unless element.nil? && TERMINATORS_ARRAY.include?(peek_char)
      elsif !(element == '...' && @scanner.string.getbyte(@scanner.pos - 1) == 46)
        # A '...' whose last consumed byte is a dot (46) is dropped;
        # everything else is a real item.
        items << element
      end
    end

    # Skip item separators: commas and whitespace.
    current = peek_char
    while current && current != ']' && (current == ',' || current.match?(/\s/))
      @scanner.getch
      current = peek_char
    end
  end

  # Handle a potentially missing closing bracket, a common LLM error:
  # accept ']' and otherwise '}' when that was used as the closer.
  @scanner.scan(']') || @scanner.scan('}')
  @context.pop

  items
end
350
+
351
# Parses a JSON string. This is a very lenient parser designed to handle
# many common errors found in LLM-generated JSON, such as missing quotes,
# incorrect escape sequences, and ambiguous string terminators.
#
# @return [String, Object] the parsed text, '' when no string could be read,
#   or whatever literal determine_delimiters reports for unquoted input
def parse_string
  char = prepare_string_parsing

  # A string starts with a quote or, for unquoted text, a letter or digit.
  # Anything else is skipped unless it is a structural character.
  until @scanner.eos? || STRING_DELIMITERS.include?(char) || char.match?(/[\p{L}0-9]/)
    return '' if TERMINATORS_STRING_GUESSED.include?(char)

    @scanner.getch
    char = peek_char
  end
  return '' if @scanner.eos?

  finished, first, rstring_delimiter, missing_quotes = determine_delimiters(char: char)
  return first if finished

  lstring_delimiter = first
  @scanner.getch unless missing_quotes # consume the opening quote

  # Doubled quotes are sometimes found; the accumulation loop handles them too.
  finished, doubled_quotes = handle_doubled_quotes(
    lstring_delimiter: lstring_delimiter,
    rstring_delimiter: rstring_delimiter
  )
  return doubled_quotes if finished

  # Accumulate characters until a closing quote, the end of input, or (when
  # quotes are missing in an object) one of the special terminators.
  string_parts, char = check_unmatched_delimiters(
    string_parts: [],
    lstring_delimiter: lstring_delimiter,
    rstring_delimiter: rstring_delimiter,
    missing_quotes: missing_quotes,
    doubled_quotes: doubled_quotes
  )

  # An unquoted key that stopped at whitespace must be followed by ':' or ','.
  if !@scanner.eos? && missing_quotes && current_context?(:object_key) && char.match?(/\s/)
    skip_whitespaces
    return '' unless [':', ','].include?(peek_char)
  end

  finalize_parsed_string(
    string_parts: string_parts,
    char: char,
    rstring_delimiter: rstring_delimiter,
    missing_quotes: missing_quotes
  )
end
411
+
412
+ # string helper methods
413
+
414
# Consumes any comments in front of a string.
#
# @return [String, nil] the character the scanner is positioned at afterwards
def prepare_string_parsing
  parse_comment while COMMENT_DELIMETERS.include?(peek_char)
  peek_char
end
425
+
426
# Chooses the string delimiters from the first character and detects
# unquoted literals.
#
# @param char [String, nil] first character of the candidate string
# @return [Array] [true, literal] when a literal was parsed by parse_literal,
#   otherwise [false, lstring_delimiter, rstring_delimiter, missing_quotes]
def determine_delimiters(char:)
  if char == "'"
    [false, "'", "'", false]
  elsif char == '“'
    [false, '“', '”', false]
  elsif /[\p{L}0-9]/.match?(char)
    # Could be a boolean/null, but not if it's an object key.
    if BOOLEAN_OR_NULL_CHARS.include?(char.downcase) && !current_context?(:object_key)
      literal = parse_literal
      return [true, literal] if literal != ''
    end

    # A letter or digit instead of a quote: the string is missing its quotes.
    [false, '"', '"', true]
  else
    [false, '"', '"', false]
  end
end
450
+
451
# Handles a second quote found right after the opening quote, which is
# either an empty string or a doubled opening quote.
#
# @param lstring_delimiter [String] opening delimiter of the string
# @param rstring_delimiter [String] closing delimiter of the string
# @return [Array] [true, ''] when the string is complete and empty,
#   otherwise [false, doubled_quotes]
def handle_doubled_quotes(lstring_delimiter:, rstring_delimiter:)
  upcoming = peek_char
  return [false, false] unless STRING_DELIMITERS.include?(upcoming) && upcoming == lstring_delimiter

  following = peek_char(1)
  empty_key = current_context?(:object_key) && following == ':'
  empty_value = current_context?(:object_value) && TERMINATORS_OBJECT_VALUE.include?(following)
  if empty_key || empty_value
    # An empty key or value: consume its closing quote.
    @scanner.getch
    return [true, '']
  end

  # There's something fishy about this: doubled quotes and then quotes again.
  return [true, ''] if following == lstring_delimiter

  # If the next closing delimiter is itself doubled, treat the opening as
  # doubled as well and consume the extra quote.
  closer_idx = skip_to_character(rstring_delimiter, start_idx: 1)
  if peek_char(closer_idx) && peek_char(closer_idx + 1) == rstring_delimiter
    @scanner.getch
    return [false, true]
  end

  # Not a doubled quote: check whether this is an empty string or not.
  after_ws = peek_char(skip_whitespaces_at(start_idx: 1))
  if STRING_OR_OBJECT_START.include?(after_ws)
    @scanner.getch
    return [true, '']
  end

  @scanner.getch unless TERMINATORS_VALUE.include?(after_ws)
  [false, false]
end
494
+
495
+ # Accumulates the characters of a string into string_parts, one character per iteration. Things get a bit hairy because a string missing the final quote can also be a key or a value in an object
496
+ # In that case we need to use the ":|,|}" characters as terminators of the string
497
+ # So this will stop if:
498
+ # * It finds a closing quote that none of the lookahead helpers decides to keep as part of the string
499
+ # * It iterated over the entire sequence
500
+ # * If we are fixing missing quotes in an object, when it finds the special terminators. Returns [string_parts, char], where char is the character under the scanner when the loop ended
501
+ def check_unmatched_delimiters(
502
+ string_parts:,
503
+ lstring_delimiter:,
504
+ rstring_delimiter:,
505
+ missing_quotes:,
506
+ doubled_quotes:
507
+ )
508
+ char = peek_char
509
+ unmatched_delimiter = false # set once a quote was kept as content and its partner is still expected
510
+ # --- Main Parsing Loop --- (a closing quote found at the end of an iteration is re-checked by this condition)
511
+ while !@scanner.eos? && char != rstring_delimiter
512
+ break if context_termination_reached?(
513
+ char:,
514
+ missing_quotes:
515
+ )
516
+
517
+ if current_context?(:object_value) && TERMINATORS_OBJECT_VALUE.include?(char) &&
518
+ (string_parts.empty? || string_parts.last != rstring_delimiter)
519
+
520
+ is_break = check_rstring_delimiter_missing( # lookahead decides whether ',' or '}' ends a value whose closing quote is missing
521
+ string_parts:,
522
+ lstring_delimiter:,
523
+ rstring_delimiter:,
524
+ missing_quotes:
525
+ )
526
+ break if is_break
527
+ end
528
+
529
+ if char == ']' && context_contain?(:array) && string_parts.last != rstring_delimiter
530
+ i = skip_to_character(rstring_delimiter)
531
+ # No closing delimiter found ahead: the ']' ends the string
532
+ break unless peek_char(i)
533
+ end
534
+
535
+ if current_context?(:object_value) && char == '}'
536
+ # We found the end of an object while parsing a value
537
+ # Check if the object is really over (only whitespace follows), to avoid doubling the closing brace
538
+ i = skip_whitespaces_at(start_idx: 1)
539
+ next_c = peek_char(i)
540
+ break unless next_c
541
+ end
542
+
543
+ string_parts << char
544
+ @scanner.getch # Consume the character
545
+ char = peek_char
546
+
547
+ if !@scanner.eos? && string_parts.last == '\\'
548
+ # This is a special case, if people use real strings this might happen: a backslash was just stored, so try to decode an escape sequence
549
+ is_next, string_parts, char = parse_escape_sequence(
550
+ string_parts:,
551
+ char:,
552
+ rstring_delimiter:
553
+ )
554
+ next if is_next
555
+ end
556
+
557
+ # If we are in object key context and we find a colon, it could be a missing right quote
558
+ if char == ':' && !missing_quotes && current_context?(:object_key)
559
+ is_break = handle_missing_quotes_termination(
560
+ lstring_delimiter:,
561
+ rstring_delimiter:
562
+ )
563
+ break if is_break
564
+ end
565
+
566
+ if char == rstring_delimiter && string_parts.last != '\\'
567
+ if check_doubled_quotes(doubled_quotes, rstring_delimiter)
568
+ # The extra quote of a doubled closing quote was consumed in the helper
569
+ elsif check_missing_quotes_in_object_value(missing_quotes, lstring_delimiter, rstring_delimiter)
570
+ char = peek_char # the helper moved the scanner back, so re-read the current character
571
+ break
572
+ elsif unmatched_delimiter
573
+ unmatched_delimiter = false
574
+ string_parts << char.to_s
575
+ @scanner.getch # Consume the character
576
+ char = peek_char
577
+ else
578
+ should_consume, set_unmatched = determine_complex_delimiter_action(lstring_delimiter, rstring_delimiter)
579
+ if should_consume # the quote is content rather than the closing delimiter
580
+ unmatched_delimiter = true if set_unmatched
581
+ string_parts << char.to_s
582
+ @scanner.getch
583
+ char = peek_char
584
+ end
585
+ end
586
+ end
587
+ end
588
+
589
+ [
590
+ string_parts,
591
+ char
592
+ ]
593
+ end
594
+
595
# Consumes one character when the string was opened with doubled quotes and
# the character after the current one is the closing delimiter.
#
# @param doubled_quotes [Boolean] whether the string started with doubled quotes
# @param rstring_delimiter [String] closing delimiter of the string
# @return [Boolean] true when a character was consumed
def check_doubled_quotes(doubled_quotes, rstring_delimiter)
  return false unless doubled_quotes && peek_char(1) == rstring_delimiter

  @scanner.getch
  true
end
602
+
603
# While reading an unquoted object value, decides whether the quote under the
# scanner actually opens the next key, i.e. whether the next quote after it
# is followed (ignoring whitespace) by a colon.
#
# When that is the case the scanner is moved back by one character.
#
# @param missing_quotes [Boolean] whether the current string has no opening quote
# @param lstring_delimiter [String] opening delimiter of the string
# @param rstring_delimiter [String] closing delimiter of the string
# @return [Boolean] true when the scanner was moved back and the caller should stop
def check_missing_quotes_in_object_value(missing_quotes, lstring_delimiter, rstring_delimiter)
  return false unless missing_quotes && current_context?(:object_value)

  # Look for the next quote after the current one.
  quotes = [rstring_delimiter, lstring_delimiter]
  i = 1
  next_c = peek_char(i)
  while next_c && !quotes.include?(next_c)
    i += 1
    next_c = peek_char(i)
  end
  return false unless next_c

  # We found a quote, now let's make sure there's a ":" following it.
  i = skip_whitespaces_at(start_idx: i + 1)
  return false unless peek_char(i) == ':'

  step_back_one_character
  true
end

# Moves the scanner back by one whole character.
# StringScanner#pos is a byte offset, so `pos -= 1` would leave the scanner in
# the middle of a multibyte UTF-8 character; continuation bytes (10xxxxxx) are
# therefore skipped as well.
def step_back_one_character
  source = @scanner.string
  pos = [@scanner.pos - 1, 0].max
  if source.encoding == Encoding::UTF_8
    pos -= 1 while pos.positive? && (source.getbyte(pos) & 0xC0) == 0x80
  end
  @scanner.pos = pos
end
628
+
629
+ def determine_complex_delimiter_action(lstring_delimiter, rstring_delimiter) # Lookahead via peek_char for a closing quote found inside a string; returns [should_consume, set_unmatched] (names as destructured by the caller)
630
+ i = 1
631
+ next_c = peek_char(i)
632
+ check_comma_in_object_value = true
633
+
634
+ # Scan forward for the next string delimiter; stop early at a terminator of an enclosing context
635
+ while next_c && ![rstring_delimiter, lstring_delimiter].include?(next_c)
636
+ # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas: once a letter has been seen, a comma no longer stops the scan
637
+ # This is because the routine after will make sure to correct any bad guess and this solves a corner case
638
+ check_comma_in_object_value = false if check_comma_in_object_value && next_c.match?(/\p{L}/)
639
+ # If we are in an object context, let's check for the right delimiters
640
+ if (context_contain?(:object_key) && TERMINATORS_OBJECT_KEY.include?(next_c)) ||
641
+ (context_contain?(:object_value) && TERMINATORS_OBJECT_KEY.include?(next_c)) ||
642
+ (context_contain?(:array) && TERMINATORS_ARRAY_ITEM.include?(next_c)) ||
643
+ (
644
+ check_comma_in_object_value &&
645
+ current_context?(:object_value) &&
646
+ next_c == ','
647
+ )
648
+ break
649
+ end
650
+
651
+ i += 1
652
+ next_c = peek_char(i)
653
+ end
654
+
655
+ # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
656
+ if next_c == ',' && current_context?(:object_value)
657
+ i += 1
658
+ i = skip_to_character(rstring_delimiter, start_idx: i)
659
+ peek_char(i) # NOTE(review): the result of this call is discarded
660
+ # Step past the position returned above, skip whitespaces and see if next we find a } or a ,
661
+ i += 1
662
+ i = skip_whitespaces_at(start_idx: i)
663
+ next_c = peek_char(i)
664
+ return [true, false] if TERMINATORS_OBJECT_VALUE.include?(next_c)
665
+ elsif next_c == rstring_delimiter && peek_char(i - 1) != '\\'
666
+ # Check if everything from offset 1 up to and including offset i is whitespace, report [false, false] if that's the case
667
+ return [false, false] if (1..i).all? { |j| peek_char(j).to_s.match(/\s/) }
668
+
669
+ if current_context?(:object_value)
670
+ return check_unmatched_in_object_value(index: i, lstring_delimiter:, rstring_delimiter:)
671
+ elsif current_context?(:array)
672
+ return check_unmatched_in_array(rstring_delimiter:)
673
+ elsif current_context?(:object_key)
674
+ return [true, false]
675
+ end
676
+ end
677
+
678
+ [false, false] # default: the caller does not consume the quote
679
+ end
680
+
681
+ def check_unmatched_in_object_value(index:, lstring_delimiter:, rstring_delimiter:) # Lookahead for a quote found inside an object value; index is the peek offset of the next delimiter found by the caller; returns [should_consume, set_unmatched]
682
+ index = skip_whitespaces_at(start_idx: index + 1)
683
+ if peek_char(index) == ','
684
+ # So we found a comma, this could be a case of a single quote like "va"lue",
685
+ # Search if it's followed by another key, starting with the first delimeter
686
+ index = skip_to_character(lstring_delimiter, start_idx: index + 1)
687
+ index += 1
688
+ index = skip_to_character(rstring_delimiter, start_idx: index + 1)
689
+ index += 1
690
+ index = skip_whitespaces_at(start_idx: index)
691
+ next_c = peek_char(index)
692
+ return [true, false] if next_c == ':'
693
+ end
694
+ # We found a delimiter and we need to check if this is a key
695
+ # so find a rstring_delimiter and a colon after
696
+ index = skip_to_character(rstring_delimiter, start_idx: index + 1)
697
+ index += 1
698
+ next_c = peek_char(index)
699
+ while next_c && next_c != ':' # scan until a colon, a value terminator, an unescaped delimiter, or the end of input
700
+ if TERMINATORS_VALUE.include?(next_c) || (
701
+ next_c == rstring_delimiter &&
702
+ peek_char(index - 1) != '\\'
703
+ )
704
+ break
705
+ end
706
+
707
+ index += 1
708
+ next_c = peek_char(index)
709
+ end
710
+
711
+ # Only if we fail to find a ':' then we know this is misplaced quote
712
+ return [true, true] if next_c != ':'
713
+
714
+ [false, false] # a ':' follows, so the quoted text ahead looks like a key
715
+ end
716
+
717
# Heuristic for a quote found while reading a string inside an array:
# decides whether it closes the string or belongs to its content.
#
# @param rstring_delimiter [String] closing delimiter of the string
# @return [Array] [false, false] when the quote should be treated as the
#   closing quote, [true, true] when it should be kept as content
def check_unmatched_in_array(rstring_delimiter:)
  # 1. Find the offset of the NEXT unescaped delimiter.
  quote_idx = 1
  quote_found = false
  loop do
    candidate = peek_char(quote_idx)
    break unless candidate

    if candidate == rstring_delimiter
      # Escaped only when preceded by an odd number of backslashes.
      backslashes = 0
      backslashes += 1 while quote_idx - 1 - backslashes >= 0 && peek_char(quote_idx - 1 - backslashes) == '\\'
      if backslashes.even?
        quote_found = true
        break
      end
    end
    quote_idx += 1
  end

  # 2. Conditions to STOP (treat the current quote as the closing quote):
  #    a) strictly whitespace between the quotes: ["a" "b"]
  only_whitespace = (1...quote_idx).all? { |offset| peek_char(offset).match?(/\s/) }

  #    b) the next quote is followed by a separator: ["val1" val2",]
  closes_item = false
  if quote_found
    after = quote_idx + 1
    after += 1 while peek_char(after)&.match?(/\s/)
    closes_item = TERMINATORS_VALUE.include?(peek_char(after))
  end

  (only_whitespace || closes_item) ? [false, false] : [true, true]
end
755
+
756
+ def check_rstring_delimiter_missing( # Called when ',' or '}' is met inside an object value; the returned flag is truthy when the caller should stop accumulating (closing delimiter judged missing)
757
+ string_parts:,
758
+ lstring_delimiter:,
759
+ rstring_delimiter:,
760
+ missing_quotes:
761
+ )
762
+ rstring_delimiter_missing = true # assume the delimiter is missing until the lookahead shows otherwise
763
+ # check if this is a case in which the closing comma is NOT missing instead
764
+ skip_whitespaces # NOTE: this moves the scanner; the rest of the method only looks ahead with peek_char
765
+ if peek_char(1) == '\\'
766
+ # Ok this is a quoted string, skip
767
+ rstring_delimiter_missing = false
768
+ end
769
+
770
+ i = skip_to_character(rstring_delimiter, start_idx: 1)
771
+ next_c = peek_char(i)
772
+
773
+ is_gap_clean = true
774
+ is_gap_clean = (1...i).all? { |k| peek_char(k)&.match?(/\s/) } if missing_quotes && next_c # for unquoted strings only whitespace may precede the delimiter found ahead
775
+ if next_c && is_gap_clean
776
+ i += 1
777
+ # found a delimiter, now we need to check that is followed strictly by a comma or brace
778
+ # or the string ended
779
+ i = skip_whitespaces_at(start_idx: i)
780
+ next_c = peek_char(i)
781
+ if next_c.nil? || TERMINATORS_OBJECT_VALUE.include?(next_c)
782
+ rstring_delimiter_missing = false
783
+ else
784
+ # OK but this could still be some garbage at the end of the string
785
+ # So we need to check if we find a new lstring_delimiter afterwards
786
+ # If we do, maybe this is a missing delimiter
787
+ i = skip_to_character(lstring_delimiter, start_idx: i)
788
+ next_c = peek_char(i)
789
+ if next_c.nil?
790
+ rstring_delimiter_missing = false
791
+ else
792
+ # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
793
+ # Check if we find a : afterwards (skipping space)
794
+ i = skip_whitespaces_at(start_idx: i + 1)
795
+ next_c = peek_char(i)
796
+ rstring_delimiter_missing = false if next_c && next_c != ':'
797
+ end
798
+ end
799
+ elsif next_c # a delimiter exists ahead but the gap before it is not clean
800
+ rstring_delimiter_missing = false
801
+ else
802
+ # There could be a case in which even the next key:value is missing delimeters
803
+ # because it might be a systemic issue with the output
804
+ # So let's check if we can find a : in the string instead
805
+ i = skip_to_character(':', start_idx: 1)
806
+ next_c = peek_char(i)
807
+ return true if next_c
808
+
809
+ # OK then this is a systemic issue with the output
810
+
811
+ # skip any whitespace first
812
+ i = skip_whitespaces_at(start_idx: 1)
813
+ # We couldn't find any rstring_delimeter before the end of the string
814
+ # check if this is the last string of an object and therefore we can keep going
815
+ # make an exception if this is the last char before the closing brace
816
+ j = skip_to_character('}', start_idx: i)
817
+ if j - i > 1
818
+ # Ok it's not right after the comma
819
+ # Let's ignore
820
+ rstring_delimiter_missing = false
821
+ elsif peek_char(j)
822
+ # Check for an unmatched opening brace in string_parts
823
+ string_parts.reverse_each do |c|
824
+ next unless c == '{'
825
+
826
+ # Ok then this is part of the string
827
+ rstring_delimiter_missing = false
828
+ break
829
+ end
830
+ end
831
+
832
+ end
833
+
834
+ rstring_delimiter_missing
835
+ end
836
+
837
+ def parse_escape_sequence(
838
+ string_parts:,
839
+ char:,
840
+ rstring_delimiter:
841
+ )
842
+ if !@scanner.eos? && string_parts.last == '\\'
843
+ # This is a special case, if people use real strings this might happen
844
+ if char == rstring_delimiter || ESCAPE_START_CHARS.include?(char)
845
+ string_parts.pop
846
+ string_parts << ESCAPE_MAPPING.fetch(char, char)
847
+
848
+ @scanner.getch # Consume the character
849
+ char = peek_char
850
+ while !@scanner.eos? && string_parts.last == '\\' && (char == rstring_delimiter || char == '\\')
851
+ # this is a bit of a special case, if I don't do this it will close the loop or create a train of \\
852
+ # I don't love it though
853
+ string_parts.pop
854
+ string_parts << char
855
+ @scanner.getch # Consume the character
856
+ char = peek_char
857
+ end
858
+ return [true, string_parts, char]
859
+ elsif HEX_ESCAPE_PREFIXES.include?(char)
860
+ entry_pos = @scanner.pos
861
+ @scanner.getch # consume 'u' or 'x'
862
+
863
+ num_chars = (char == 'u' ? 4 : 2)
864
+ hex_parts = []
865
+
866
+ # Use getch in loop to correctly extract chars (handling multibyte)
867
+ num_chars.times do
868
+ c = @scanner.getch
869
+ break unless c
870
+
871
+ hex_parts << c
872
+ end
873
+
874
+ # Validate valid hex digits
875
+ if hex_parts.length == num_chars && hex_parts.all? { |c| c.match?(/[0-9a-fA-F]/) }
876
+ string_parts.pop
877
+ string_parts << hex_parts.join.to_i(16).chr('UTF-8')
878
+
879
+ # Scanner is already advanced past digits
880
+ char = peek_char
881
+ return [true, string_parts, char]
882
+ else
883
+ # Not a valid escape sequence; backtrack so the main loop can treat 'u'/'x' as literal.
884
+ @scanner.pos = entry_pos
885
+ end
886
+ elsif STRING_DELIMITERS.include?(char) && char != rstring_delimiter
887
+ string_parts.pop
888
+ string_parts << char
889
+ @scanner.getch # Consume the character
890
+ char = peek_char
891
+ return [true, string_parts, char]
892
+ end
893
+ end
894
+
895
+ [false, string_parts, char]
896
+ end
897
+
898
# Called when a colon is met while reading a quoted object key: looks ahead
# to decide whether the key's closing quote is missing.
#
# @param lstring_delimiter [String] opening delimiter of the string
# @param rstring_delimiter [String] closing delimiter of the string
# @return [Boolean] true when no opening delimiter follows at all, or when a
#   delimited string follows that is itself followed by ',' or '}'
def handle_missing_quotes_termination(
  lstring_delimiter:,
  rstring_delimiter:
)
  opener_idx = skip_to_character(lstring_delimiter, start_idx: 1)
  return true unless peek_char(opener_idx)

  # Found the first delimiter, now look for the second one.
  closer_idx = skip_to_character(rstring_delimiter, start_idx: opener_idx + 1)
  return false unless peek_char(closer_idx)

  # Found a second delimiter: skip spaces and inspect what follows it.
  follower = peek_char(skip_whitespaces_at(start_idx: closer_idx + 1))
  TERMINATORS_OBJECT_VALUE.include?(follower)
end
921
+
922
# Turns the accumulated characters into the final string.
#
# @param string_parts [Array<String>] accumulated characters (may be mutated)
# @param char [String, nil] character under the scanner when accumulation stopped
# @param rstring_delimiter [String] closing delimiter of the string
# @param missing_quotes [Boolean] whether the string had no opening quote
# @return [String] the joined text, right-stripped when the quotes were
#   missing or when the text ends with a newline
def finalize_parsed_string(
  string_parts:,
  char:,
  rstring_delimiter:,
  missing_quotes:
)
  closed = char == rstring_delimiter
  # The scanner only advances when we really stopped on a closing quote.
  @scanner.getch if closed
  # An unquoted key that swallowed a trailing comma loses it again.
  string_parts.pop if !closed && missing_quotes && current_context?(:object_key) && string_parts.last == ','

  text = string_parts.join
  return text.rstrip if missing_quotes || text.end_with?("\n")

  text
end
941
+
942
# Tells whether `char` ends an unquoted string in the current context.
#
# Always false when the string has quotes. Inside an object key the string
# ends at ':', at any whitespace, or at one of TERMINATORS_ARRAY; inside an
# array it ends at one of TERMINATORS_ARRAY_ITEM.
#
# @param char [String] character under inspection
# @param missing_quotes [Boolean] whether the string is being read unquoted
# @return [Boolean]
def context_termination_reached?(
  char:,
  missing_quotes:
)
  return false unless missing_quotes

  if current_context?(:object_key)
    char == ':' || char.match?(/\s/) || TERMINATORS_ARRAY.include?(char)
  elsif current_context?(:array)
    TERMINATORS_ARRAY_ITEM.include?(char)
  else
    false
  end
end
953
+
954
# Leniently parses a number at the scanner position.
#
# The token is scanned with NUMBER_NO_COMMA_REGEX inside an array (so a comma
# separates items) and with NUMBER_REGEX everywhere else.
#
# - A trailing character listed in INVALID_NUMBER_TRAILERS is dropped from
#   the token; the scanner is not rewound for it.
# - Otherwise, if a letter follows the token, the scan is undone and the
#   input is re-read through parse_string.
# - A double quote directly after the token is consumed as garbage.
#
# Conversion: a token ending in '.' becomes a Float of the part before the
# dot; a token containing ',' has commas replaced by dots and is tried as a
# Float; a token containing '.', 'e' or 'E' is tried as a Float; anything
# else is tried as a base-10 Integer.
#
# @return [Integer, Float, String, nil] the number; the token text when it
#   cannot be converted; nil when nothing number-like is at the scanner
#   position; or whatever parse_string returns in the fallback case
def parse_number
  pattern = current_context?(:array) ? NUMBER_NO_COMMA_REGEX : NUMBER_REGEX
  token = @scanner.scan(pattern)
  return nil unless token

  if INVALID_NUMBER_TRAILERS.include?(token[-1])
    # Trailing garbage: keep the scanner where it is, just shorten the token.
    token = token.chop
  elsif peek_char&.match?(/\p{L}/)
    # Something like "123abc" is really a string: undo the scan and re-parse.
    @scanner.pos -= token.bytesize
    return parse_string
  end

  # A quote right after a number is noise.
  @scanner.skip(/"/)

  # Decide which text to convert and with which converter.
  if token.end_with?('.')
    # "1." is rejected by Float() on Ruby < 3.4, so convert the part before the dot.
    numeric_text = token[0...-1]
    as_float = true
  elsif token.include?(',')
    numeric_text = token.tr(',', '.')
    as_float = true
  else
    numeric_text = token
    as_float = token.match?(/[.eE]/)
  end

  begin
    as_float ? Float(numeric_text) : Integer(numeric_text, 10)
  rescue ArgumentError
    # Not convertible: hand back the scanned text unchanged.
    token
  end
end
999
+
1000
# Reads one of the JSON literals `true`, `false` or `null` at the scanner
# position, ignoring case, and consumes it.
#
# @return [true, false, nil, String] the literal's Ruby value, or an empty
#   string when none of the three literals is present (nothing is consumed)
def parse_literal
  return true if @scanner.skip(/true/i)
  return false if @scanner.skip(/false/i)
  return nil if @scanner.skip(/null/i)

  # Empty string is the "not a literal" marker, since nil is a valid result.
  ''
end
1015
+
1016
# Skips a code-style comment at the scanner position, then any whitespace
# after it. Supported forms:
# - `# line comment`
# - `// line comment`
# - `/* block comment */`
#
# A block comment without a closing `*/` swallows the rest of the input.
# A line comment ends at a newline or carriage return, and additionally at
# `]`, `}` or `:` when the context stack contains :array, :object_value or
# :object_key respectively; the terminator itself is not consumed. A line
# comment that reaches the end of the input without a terminator is consumed
# entirely, so its text is never handed back to the parser as content.
#
# If the current character does not start a comment, that single character
# is consumed so the caller cannot loop forever on it.
#
# @return [Integer, nil] the result of skip_whitespaces
def parse_comment
  if @scanner.scan(%r{/\*})
    # Block comment: jump past the first `*/`, or to the end if it never closes.
    @scanner.scan_until(%r{\*/}) || @scanner.terminate
  elsif @scanner.scan(%r{//|#})
    terminators = ["\n", "\r"]
    terminators << ']' if context_contain?(:array)
    terminators << '}' if context_contain?(:object_value)
    terminators << ':' if context_contain?(:object_key)

    # Zero-width lookahead: stop right before the terminator.
    comment_end = /(?=#{Regexp.union(terminators)})/

    # With no terminator left, the comment runs to the end of the input.
    @scanner.scan_until(comment_end) || @scanner.terminate
  else
    # Stray character (likely a lone '/'): consume it to guarantee progress.
    @scanner.getch
  end

  skip_whitespaces
end
1051
+
1052
# Non-destructive lookahead for the next unescaped occurrence of a delimiter.
#
# A match preceded by an odd number of backslashes counts as escaped and is
# skipped. The scanner position is always restored before returning.
#
# The pattern is always built with Regexp.union, so a String is matched
# literally and never reaches StringScanner#scan_until as a raw String
# (older strscan releases only accept a Regexp there).
#
# @param characters [String, Regexp, Array<String, Regexp>] delimiter(s)
# @param start_idx [Integer] number of characters to skip before searching
# @return [Integer] character offset of the match from the scanner position,
#   or the offset of the end of the input when nothing matches
def skip_to_character(characters, start_idx: 0)
  saved_pos = @scanner.pos
  pattern = Regexp.union(characters)

  start_idx.times { @scanner.getch }
  # Characters consumed so far, relative to the saved position.
  consumed = start_idx

  while (chunk = @scanner.scan_until(pattern))
    # `chunk` ends with the matched delimiter; everything before it is prefix.
    prefix_len = chunk.length - @scanner.matched.length

    # Count the backslashes sitting directly in front of the delimiter.
    backslashes = 0
    backslashes += 1 while backslashes < prefix_len && chunk[prefix_len - 1 - backslashes] == '\\'

    # An even count means the delimiter itself is not escaped.
    return consumed + prefix_len if backslashes.even?

    consumed += chunk.length
  end

  # No unescaped match: report the distance to the end of the input.
  consumed + @scanner.rest.length
ensure
  @scanner.pos = saved_pos if saved_pos
end
1100
+
1101
# Non-destructive lookahead: starting `start_idx` characters ahead of the
# scanner, finds the offset of the first non-whitespace character.
# The scanner position is restored before returning.
#
# @param start_idx [Integer] number of characters to skip before looking
# @return [Integer] offset (from the scanner position) of the first
#   non-whitespace character, or of the end of the input when only
#   whitespace remains
def skip_whitespaces_at(start_idx: 0)
  origin = @scanner.pos
  start_idx.times { @scanner.getch }

  # Length of the whitespace run at this point; an empty run still matches,
  # and a run reaching the end of the input covers the whole remainder.
  blank_run = @scanner.check(/\s*/).length

  @scanner.pos = origin
  blank_run + start_idx
end
1123
+
1124
# @param obj1 [Object]
# @param obj2 [Object]
# @return [Boolean] true only when both arguments are Hash instances
def both_hash?(obj1, obj2)
  [obj1, obj2].all? { |candidate| candidate.is_a?(Hash) }
end
1127
+
1128
# True only for an empty String, Array, Hash or Set. Any other value,
# including nil and numbers, is never considered empty.
#
# @param value [Object]
# @return [Boolean]
def strictly_empty?(value)
  container = value.is_a?(String) || value.is_a?(Array) || value.is_a?(Hash) || value.is_a?(Set)
  return value.empty? if container

  false
end
1137
+
1138
# Advances the scanner past a run of whitespace at the current position.
#
# @return [Integer, nil] length of the skipped run, or nil when the scanner
#   is not on whitespace
def skip_whitespaces
  whitespace_run = /\s+/
  @scanner.skip(whitespace_run)
end
1142
+
1143
# Returns the character `offset` characters ahead of the scanner without
# moving it.
#
# @param offset [Integer] 0 for the character under the scanner
# @return [String, nil] the character, or nil when the input ends before
#   that offset is reached
def peek_char(offset = 0)
  # Fast path: the `m` flag lets `.` match a newline as well.
  return @scanner.check(/./m) if offset.zero?

  origin = @scanner.pos
  steps_left = offset + 1
  found = nil

  # Walk forward one character at a time, remembering the last one read.
  while steps_left.positive?
    found = @scanner.getch
    break if found.nil?

    steps_left -= 1
  end

  @scanner.pos = origin
  found
end
1156
+
1157
# Checks whether the last entry of the context stack equals `value`.
#
# @param value [Symbol] context marker such as :array or :object_key
# @return [Boolean]
def current_context?(value)
  innermost = @context&.last
  innermost == value
end
1160
+
1161
# Checks whether `value` appears anywhere in the context stack.
#
# @param value [Symbol] context marker such as :array or :object_key
# @return [Boolean]
def context_contain?(value)
  @context.any? { |frame| frame == value }
end
1164
+
1165
# Tells whether `char` can begin a string or a bare literal: it is either
# one of STRING_DELIMITERS or a Unicode letter.
#
# @param char [String, nil]
# @return [Boolean, nil] nil when `char` is nil
def string_start?(char)
  return true if STRING_DELIMITERS.include?(char)

  char&.match?(/\p{L}/)
end
1169
+
1170
# Tells whether `char` can begin a number: a digit, a minus sign or a dot.
#
# @param char [String, nil]
# @return [Boolean] false when `char` is nil
def number_start?(char)
  case char
  when nil then false
  when '-', '.' then true
  else char.match?(/\d/)
  end
end
1174
+ end
1175
+ end