json-repair 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/json/repairer.rb CHANGED
@@ -25,17 +25,24 @@ module JSON
25
25
  't' => "\t"
26
26
  }.freeze
27
27
 
28
+ MARKDOWN_OPEN_BLOCKS = ['```', '[```', '{```'].freeze
29
+ MARKDOWN_CLOSE_BLOCKS = ['```', '```]', '```}'].freeze
30
+
28
31
  def initialize(json)
29
32
  @json = json
30
33
  @index = 0
31
- @output = ''
34
+ @output = +''
32
35
  end
33
36
 
34
37
  def repair
38
+ parse_markdown_code_block(MARKDOWN_OPEN_BLOCKS)
39
+
35
40
  processed = parse_value
36
41
 
37
42
  throw_unexpected_end unless processed
38
43
 
44
+ parse_markdown_code_block(MARKDOWN_CLOSE_BLOCKS)
45
+
39
46
  processed_comma = parse_character(COMMA)
40
47
  parse_whitespace_and_skip_comments if processed_comma
41
48
 
@@ -71,22 +78,45 @@ module JSON
71
78
 
72
79
  def parse_value
73
80
  parse_whitespace_and_skip_comments
74
- process = parse_object || parse_array || parse_string || parse_number || parse_keywords || parse_unquoted_string
81
+ process = parse_object ||
82
+ parse_array ||
83
+ parse_string ||
84
+ parse_number ||
85
+ parse_keywords ||
86
+ parse_unquoted_string(false) ||
87
+ parse_regex
75
88
  parse_whitespace_and_skip_comments
76
89
 
77
90
  process
78
91
  end
79
92
 
80
- def parse_whitespace
81
- whitespace = ''
82
- while @json[@index] && (whitespace?(@json[@index]) || special_whitespace?(@json[@index]))
83
- whitespace += whitespace?(@json[@index]) ? @json[@index] : ' '
93
+ def parse_whitespace_and_skip_comments(skip_newline: true)
94
+ start = @index
95
+
96
+ changed = parse_whitespace(skip_newline: skip_newline)
97
+ loop do
98
+ changed = parse_comment
99
+ changed = parse_whitespace(skip_newline: skip_newline) if changed
100
+ break unless changed
101
+ end
102
+
103
+ @index > start
104
+ end
105
+
106
+ def parse_whitespace(skip_newline: true)
107
+ whitespace = +''
108
+ while @json[@index] && (
109
+ (skip_newline ? whitespace?(@json[@index]) : whitespace_except_newline?(@json[@index])) ||
110
+ special_whitespace?(@json[@index])
111
+ )
112
+ ws = skip_newline ? whitespace?(@json[@index]) : whitespace_except_newline?(@json[@index])
113
+ whitespace << (ws ? @json[@index] : ' ')
84
114
 
85
115
  @index += 1
86
116
  end
87
117
 
88
118
  unless whitespace.empty?
89
- @output += whitespace
119
+ @output << whitespace
90
120
  return true
91
121
  end
92
122
 
@@ -110,11 +140,41 @@ module JSON
110
140
  end
111
141
  end
112
142
 
143
+ # Find and skip over a Markdown fenced code block:
144
+ # ``` ... ```
145
+ # or
146
+ # ```json ... ```
147
+ def parse_markdown_code_block(blocks)
148
+ return false unless skip_markdown_code_block(blocks)
149
+
150
+ if function_name_char_start?(@json[@index])
151
+ # strip the optional language specifier like "json"
152
+ @index += 1 while @index < @json.length && function_name_char?(@json[@index])
153
+ end
154
+
155
+ parse_whitespace_and_skip_comments
156
+
157
+ true
158
+ end
159
+
160
+ def skip_markdown_code_block(blocks)
161
+ parse_whitespace(skip_newline: true)
162
+
163
+ blocks.each do |block|
164
+ if @json[@index, block.length] == block
165
+ @index += block.length
166
+ return true
167
+ end
168
+ end
169
+
170
+ false
171
+ end
172
+
113
173
  # Parse an object like '{"key": "value"}'
114
174
  def parse_object
115
175
  return false unless @json[@index] == OPENING_BRACE
116
176
 
117
- @output += '{'
177
+ @output << '{'
118
178
  @index += 1
119
179
  parse_whitespace_and_skip_comments
120
180
 
@@ -137,7 +197,7 @@ module JSON
137
197
 
138
198
  skip_ellipsis
139
199
 
140
- processed_key = parse_string || parse_unquoted_string
200
+ processed_key = parse_string || parse_unquoted_string(true)
141
201
  unless processed_key
142
202
  if @json[@index] == CLOSING_BRACE || @json[@index] == OPENING_BRACE ||
143
203
  @json[@index] == CLOSING_BRACKET || @json[@index] == OPENING_BRACKET ||
@@ -166,7 +226,7 @@ module JSON
166
226
  unless processed_value
167
227
  if processed_colon || truncated_text
168
228
  # repair missing object value
169
- @output += 'null'
229
+ @output << 'null'
170
230
  else
171
231
  throw_colon_expected
172
232
  end
@@ -174,7 +234,7 @@ module JSON
174
234
  end
175
235
 
176
236
  if @json[@index] == CLOSING_BRACE
177
- @output += '}'
237
+ @output << '}'
178
238
  @index += 1
179
239
  else
180
240
  # repair missing end bracket
@@ -217,199 +277,273 @@ module JSON
217
277
  # - If it turns out that the string does not have a valid end quote followed
218
278
  # by a delimiter (which should be the case), the function runs again in a
219
279
  # more conservative way, stopping the string at the first next delimiter
220
- # and fixing the string by inserting a quote there.
221
- def parse_string(stop_at_delimiter: false)
222
- if @json[@index] == BACKSLASH
280
+ # and fixing the string by inserting a quote there, or stopping at a
281
+ # stop index detected in the first iteration.
282
+ def parse_string(stop_at_delimiter: false, stop_at_index: -1)
283
+ skip_escape_chars = @json[@index] == BACKSLASH
284
+ if skip_escape_chars
223
285
  # repair: remove the first escape character
224
286
  @index += 1
225
- skip_escape_chars = true
226
287
  end
227
288
 
228
- if quote?(@json[@index])
229
- # double quotes are correct JSON,
230
- # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
231
- # otherwise, we will match any double-quote-like start with a double-quote-like end,
232
- # or any single-quote-like start with a single-quote-like end
233
- is_end_quote = if double_quote?(@json[@index])
234
- method(:double_quote?)
235
- elsif single_quote?(@json[@index])
236
- method(:single_quote?)
237
- elsif single_quote_like?(@json[@index])
238
- method(:single_quote_like?)
239
- else
240
- method(:double_quote_like?)
241
- end
242
-
243
- i_before = @index
244
- o_before = @output.length
245
-
246
- str = '"'
247
- @index += 1
289
+ return false unless quote?(@json[@index])
290
+
291
+ # double quotes are correct JSON,
292
+ # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
293
+ # otherwise, we will match any double-quote-like start with a double-quote-like end,
294
+ # or any single-quote-like start with a single-quote-like end
295
+ is_end_quote = if double_quote?(@json[@index])
296
+ method(:double_quote?)
297
+ elsif single_quote?(@json[@index])
298
+ method(:single_quote?)
299
+ elsif single_quote_like?(@json[@index])
300
+ method(:single_quote_like?)
301
+ else
302
+ method(:double_quote_like?)
303
+ end
304
+
305
+ i_before = @index
306
+ o_before = @output.length
307
+
308
+ str = +'"'
309
+ @index += 1
248
310
 
249
- loop do
250
- if @index >= @json.length
251
- # end of text, we are missing an end quote
311
+ loop do
312
+ if @index >= @json.length
313
+ # end of text, we are missing an end quote
314
+
315
+ i_prev = prev_non_whitespace_index(@index - 1)
316
+ if !stop_at_delimiter && delimiter?(@json[i_prev])
317
+ # if the text ends with a delimiter, like ["hello],
318
+ # so the missing end quote should be inserted before this delimiter
319
+ # retry parsing the string, stopping at the first next delimiter
320
+ @index = i_before
321
+ @output = @output[0...o_before]
322
+
323
+ return parse_string(stop_at_delimiter: true)
324
+ end
252
325
 
253
- i_prev = prev_non_whitespace_index(@index - 1)
254
- if !stop_at_delimiter && delimiter?(@json[i_prev])
255
- # if the text ends with a delimiter, like ["hello],
256
- # so the missing end quote should be inserted before this delimiter
257
- # retry parsing the string, stopping at the first next delimiter
258
- @index = i_before
259
- @output = @output[0...o_before]
326
+ # repair missing quote
327
+ str = insert_before_last_whitespace(str, '"')
328
+ @output << str
260
329
 
261
- return parse_string(stop_at_delimiter: true)
262
- end
330
+ return true
331
+ end
263
332
 
264
- # repair missing quote
265
- str = insert_before_last_whitespace(str, '"')
266
- @output += str
333
+ if @index == stop_at_index
334
+ # use the stop index detected in the first iteration, and repair end quote
335
+ str = insert_before_last_whitespace(str, '"')
336
+ @output << str
267
337
 
268
- return true
269
- elsif is_end_quote.call(@json[@index])
270
- # end quote
271
- i_quote = @index
272
- o_quote = str.length
273
- str += '"'
274
- @index += 1
275
- @output += str
338
+ return true
339
+ end
276
340
 
277
- parse_whitespace_and_skip_comments
341
+ if is_end_quote.call(@json[@index])
342
+ # end quote
343
+ # let us check what is before and after the quote to verify whether this is a legit end quote
344
+ i_quote = @index
345
+ o_quote = str.length
346
+ str << '"'
347
+ @index += 1
348
+ @output << str
278
349
 
279
- if stop_at_delimiter ||
280
- @index >= @json.length ||
281
- delimiter?(@json[@index]) ||
282
- quote?(@json[@index]) ||
283
- digit?(@json[@index])
284
- # The quote is followed by the end of the text, a delimiter, or a next value
285
- parse_concatenated_string
350
+ parse_whitespace_and_skip_comments(skip_newline: false)
286
351
 
287
- return true
288
- end
352
+ if stop_at_delimiter ||
353
+ @index >= @json.length ||
354
+ delimiter?(@json[@index]) ||
355
+ quote?(@json[@index]) ||
356
+ digit?(@json[@index])
357
+ # The quote is followed by the end of the text, a delimiter, or a next value
358
+ parse_concatenated_string
289
359
 
290
- if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
291
- # This is not the right end quote: it is preceded by a delimiter,
292
- # and NOT followed by a delimiter. So, there is an end quote missing
293
- # parse the string again and then stop at the first next delimiter
294
- @index = i_before
295
- @output = @output[...o_before]
360
+ return true
361
+ end
296
362
 
297
- return parse_string(stop_at_delimiter: true)
298
- end
363
+ i_prev_char = prev_non_whitespace_index(i_quote - 1)
364
+ prev_char = @json[i_prev_char]
365
+
366
+ if prev_char == ','
367
+ # A comma followed by a quote, like '{"a":"b,c,"d":"e"}'.
368
+ # We assume that the quote is a start quote, and that the end quote
369
+ # should have been located right before the comma but is missing.
370
+ @index = i_before
371
+ @output = @output[0...o_before]
299
372
 
300
- # revert to right after the quote but before any whitespace, and continue parsing the string
373
+ return parse_string(stop_at_delimiter: false, stop_at_index: i_prev_char)
374
+ end
375
+
376
+ if delimiter?(prev_char)
377
+ # This is not the right end quote: it is preceded by a delimiter,
378
+ # and NOT followed by a delimiter. So, there is an end quote missing
379
+ # parse the string again and then stop at the first next delimiter
380
+ @index = i_before
301
381
  @output = @output[...o_before]
302
- @index = i_quote + 1
303
382
 
304
- # repair unescaped quote
305
- str = "#{str[...o_quote]}\\#{str[o_quote..]}"
306
- elsif stop_at_delimiter && delimiter?(@json[@index])
307
- # we're in the mode to stop the string at the first delimiter
308
- # because there is an end quote missing
383
+ return parse_string(stop_at_delimiter: true)
384
+ end
309
385
 
310
- # repair missing quote
311
- str = insert_before_last_whitespace(str, '"')
312
- @output += str
386
+ # revert to right after the quote but before any whitespace, and continue parsing the string
387
+ @output = @output[...o_before]
388
+ @index = i_quote + 1
389
+
390
+ # repair unescaped quote
391
+ str = "#{str[...o_quote]}\\#{str[o_quote..]}"
392
+ elsif stop_at_delimiter && unquoted_string_delimiter?(@json[@index])
393
+ # we're in the mode to stop the string at the first delimiter
394
+ # because there is an end quote missing
395
+
396
+ # test for the start of a URL like "https://..." (this would otherwise be parsed as a comment)
397
+ if @json[@index - 1] == ':' &&
398
+ REGEX_URL_START.match?(@json[(i_before + 1)..(@index + 1)] || '')
399
+ while @index < @json.length && REGEX_URL_CHAR.match?(@json[@index])
400
+ str << @json[@index]
401
+ @index += 1
402
+ end
403
+ end
313
404
 
314
- parse_concatenated_string
405
+ # repair missing quote
406
+ str = insert_before_last_whitespace(str, '"')
407
+ @output << str
315
408
 
316
- return true
317
- elsif @json[@index] == BACKSLASH
318
- # handle escaped content like \n or \u2605
319
- char = @json[@index + 1]
320
- escape_char = ESCAPE_CHARACTERS[char]
321
- if escape_char
322
- str += @json[@index, 2]
323
- @index += 2
324
- elsif char == 'u'
325
- j = 2
326
- j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
327
- if j == 6
328
- str += @json[@index, 6]
329
- @index += 6
330
- elsif @index + j >= @json.length
331
- # repair invalid or truncated unicode char at the end of the text
332
- # by removing the unicode char and ending the string here
333
- @index = @json.length
334
- else
335
- throw_invalid_unicode_character
336
- end
409
+ parse_concatenated_string
410
+
411
+ return true
412
+ elsif @json[@index] == BACKSLASH
413
+ # handle escaped content like \n or \u2605
414
+ char = @json[@index + 1]
415
+ escape_char = ESCAPE_CHARACTERS[char]
416
+ if escape_char
417
+ str << @json[@index, 2]
418
+ @index += 2
419
+ elsif char == 'u'
420
+ j = 2
421
+ j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
422
+ if j == 6
423
+ str << @json[@index, 6]
424
+ @index += 6
425
+ elsif @index + j >= @json.length
426
+ # repair invalid or truncated unicode char at the end of the text
427
+ # by removing the unicode char and ending the string here
428
+ @index = @json.length
337
429
  else
338
- # repair invalid escape character: remove it
339
- str += char
340
- @index += 2
430
+ throw_invalid_unicode_character
341
431
  end
432
+ elsif char == "\n"
433
+ # repair a backslash escaped newline (like in Bash scripts)
434
+ str << '\n'
435
+ @index += 2
342
436
  else
343
- # handle regular characters
344
- char = @json[@index]
345
-
346
- if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
347
- # repair unescaped double quote
348
- str += "\\#{char}"
349
- elsif control_character?(char)
350
- # unescaped control character
351
- str += CONTROL_CHARACTERS[char]
352
- else
353
- throw_invalid_character(char) unless valid_string_character?(char)
354
- str += char
355
- end
356
-
357
- @index += 1
437
+ # repair invalid escape character: remove it
438
+ str << char
439
+ @index += 2
358
440
  end
359
-
360
- if skip_escape_chars
361
- # repair: skipped escape character (nothing to do)
362
- skip_escape_character
441
+ else
442
+ # handle regular characters
443
+ char = @json[@index]
444
+
445
+ if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
446
+ # repair unescaped double quote
447
+ str << "\\#{char}"
448
+ elsif control_character?(char)
449
+ # unescaped control character
450
+ str << CONTROL_CHARACTERS[char]
451
+ else
452
+ throw_invalid_character(char) unless valid_string_character?(char)
453
+ str << char
363
454
  end
455
+ @index += 1
364
456
  end
365
- end
366
457
 
367
- false
458
+ if skip_escape_chars
459
+ # repair: skipped escape character (nothing to do)
460
+ skip_escape_character
461
+ end
462
+ end
368
463
  end
369
464
 
370
465
  # Repair an unquoted string by adding quotes around it
371
466
  # Repair a MongoDB function call like NumberLong("2")
372
467
  # Repair a JSONP function call like callback({...});
373
- def parse_unquoted_string
468
+ def parse_unquoted_string(is_key)
469
+ # NOTE: the symbol can end with whitespace: we stop at the next delimiter
470
+ # also, note that we allow strings to contain a slash / in order to support repairing regular expressions
374
471
  start = @index
375
- @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
376
- return if @index <= start
377
472
 
378
- if @json[@index] == '(' && function_name?(@json[start...@index].strip)
379
- # Repair a MongoDB function call like NumberLong("2")
380
- # Repair a JSONP function call like callback({...});
381
- @index += 1
473
+ if function_name_char_start?(@json[@index])
474
+ @index += 1 while @index < @json.length && function_name_char?(@json[@index])
382
475
 
383
- parse_value
476
+ j = @index
477
+ j += 1 while whitespace?(@json[j])
384
478
 
385
- if @json[@index] == ')'
386
- # Repair: skip close bracket of function call
387
- @index += 1
388
- # Repair: skip semicolon after JSONP call
389
- @index += 1 if @json[@index] == ';'
390
- end
391
- else
392
- # Repair unquoted string
393
- # Also, repair undefined into null
479
+ if @json[j] == '('
480
+ # repair a MongoDB function call like NumberLong("2")
481
+ # repair a JSONP function call like callback({...});
482
+ @index = j + 1
394
483
 
395
- # First, go back to prevent getting trailing whitespaces in the string
396
- @index -= 1 while whitespace?(@json[@index - 1]) && @index.positive?
484
+ parse_value
397
485
 
398
- symbol = @json[start...@index]
399
- @output += symbol == 'undefined' ? 'null' : symbol.inspect
486
+ if @json[@index] == ')'
487
+ # Repair: skip close bracket of function call
488
+ @index += 1
489
+ # Repair: skip semicolon after JSONP call
490
+ @index += 1 if @json[@index] == ';'
491
+ end
400
492
 
401
- if @json[@index] == '"'
402
- # We had a missing start quote, but now we encountered the end quote, so we can skip that one
403
- @index += 1
493
+ return true
404
494
  end
405
495
  end
406
496
 
497
+ while @index < @json.length &&
498
+ !unquoted_string_delimiter?(@json[@index]) &&
499
+ !quote?(@json[@index]) &&
500
+ (!is_key || @json[@index] != ':')
501
+ @index += 1
502
+ end
503
+
504
+ # test for the start of a URL like "https://..." (this would otherwise be parsed as a comment)
505
+ if @json[@index - 1] == ':' &&
506
+ REGEX_URL_START.match?(@json[start...(@index + 2)] || '')
507
+ @index += 1 while @index < @json.length && REGEX_URL_CHAR.match?(@json[@index])
508
+ end
509
+
510
+ return false if @index <= start
511
+
512
+ # Repair unquoted string
513
+ # Also, repair undefined into null
514
+
515
+ # First, go back to prevent getting trailing whitespaces in the string
516
+ @index -= 1 while @index.positive? && whitespace?(@json[@index - 1])
517
+
518
+ symbol = @json[start...@index]
519
+ @output << (symbol == 'undefined' ? 'null' : symbol.inspect)
520
+
521
+ if @json[@index] == '"'
522
+ # We had a missing start quote, but now we encountered the end quote, so we can skip that one
523
+ @index += 1
524
+ end
525
+
526
+ true
527
+ end
528
+
529
+ # Parse a regular expression literal like /foo/ or /foo\/bar/
530
+ def parse_regex
531
+ return false unless @json[@index] == '/'
532
+
533
+ start = @index
534
+ @index += 1
535
+
536
+ @index += 1 while @index < @json.length && (@json[@index] != '/' || @json[@index - 1] == BACKSLASH)
537
+ @index += 1
538
+
539
+ @output << @json[start...@index].inspect
540
+
407
541
  true
408
542
  end
409
543
 
410
544
  def parse_character(char)
411
545
  if @json[@index] == char
412
- @output += @json[@index]
546
+ @output << @json[@index]
413
547
  @index += 1
414
548
  true
415
549
  else
@@ -417,19 +551,6 @@ module JSON
417
551
  end
418
552
  end
419
553
 
420
- def parse_whitespace_and_skip_comments
421
- start = @index
422
-
423
- changed = parse_whitespace
424
- loop do
425
- changed = parse_comment
426
- changed = parse_whitespace if changed
427
- break unless changed
428
- end
429
-
430
- @index > start
431
- end
432
-
433
554
  # Parse a number like 2.4 or 2.4e6
434
555
  def parse_number
435
556
  start = @index
@@ -489,7 +610,7 @@ module JSON
489
610
  num = @json[start...@index]
490
611
  has_invalid_leading_zero = num.match?(/^0\d/)
491
612
 
492
- @output += has_invalid_leading_zero ? "\"#{num}\"" : num
613
+ @output << (has_invalid_leading_zero ? "\"#{num}\"" : num)
493
614
  return true
494
615
  end
495
616
 
@@ -503,7 +624,7 @@ module JSON
503
624
  # Parse an array like '["item1", "item2", ...]'
504
625
  def parse_array
505
626
  if @json[@index] == OPENING_BRACKET
506
- @output += '['
627
+ @output << '['
507
628
  @index += 1
508
629
  parse_whitespace_and_skip_comments
509
630
 
@@ -531,7 +652,7 @@ module JSON
531
652
  end
532
653
 
533
654
  if @json[@index] == CLOSING_BRACKET
534
- @output += ']'
655
+ @output << ']'
535
656
  @index += 1
536
657
  else
537
658
  # repair missing closing array bracket
@@ -580,7 +701,7 @@ module JSON
580
701
  # repair numbers cut off at the end
581
702
  # this will only be called when we end after a '.', '-', or 'e' and does not
582
703
  # change the number more than it needs to make it valid JSON
583
- @output += "#{@json[start...@index]}0"
704
+ @output << "#{@json[start...@index]}0"
584
705
  end
585
706
 
586
707
  # Parse and repair Newline Delimited JSON (NDJSON):
@@ -0,0 +1,16 @@
1
+ module JSON
2
+ module Repair
3
+ class CLI
4
+ # `::IO | ::StringIO` because `::StringIO` is not an `::IO` subclass; the
5
+ # specs inject `::StringIO` instances and any other duck-typed stream
6
+ # that implements `#read` / `#write` / `#puts` would work too.
7
+ type stream = ::IO | ::StringIO
8
+
9
+ def self.call: (::Array[::String] argv, ?stdin: stream, ?stdout: stream, ?stderr: stream) -> ::Integer
10
+
11
+ def initialize: (?stdin: stream, ?stdout: stream, ?stderr: stream) -> void
12
+
13
+ def call: (::Array[::String] argv) -> ::Integer
14
+ end
15
+ end
16
+ end