json-repair 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,647 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'string_utils'
4
-
5
- module JSON
6
- module Repair
7
- class Repairer
8
- include StringUtils
9
-
10
# Raw control characters that may show up unescaped inside a string, mapped to
# the two-character escape sequence (backslash + letter) written to the
# repaired output in their place.
CONTROL_CHARACTERS = {
  "\b" => "\\b",
  "\f" => "\\f",
  "\n" => "\\n",
  "\r" => "\\r",
  "\t" => "\\t"
}.freeze
17
-
18
# Characters that form a valid escape sequence when they follow a backslash
# inside a string, mapped to the character that sequence stands for.
# In the visible code the table is only used to test whether a character is a
# known escape (the mapped value itself is not written to the output).
ESCAPE_CHARACTERS = {
  "\"" => "\"",
  "\\" => "\\",
  "/" => "/",
  "b" => "\b",
  "f" => "\f",
  "n" => "\n",
  "r" => "\r",
  "t" => "\t"
}.freeze
28
-
29
# Prepare a repairer for one (possibly malformed) JSON document.
#
# @param json [String] the text that #repair will read
def initialize(json)
  @output = '' # repaired text produced so far
  @index = 0   # read position inside @json
  @json = json
end
34
-
35
# Repair the document given to the constructor and return the repaired text.
#
# One root value is parsed first. What follows it is then handled as:
# - another value after a comma or newline: the input is treated as newline
#   delimited JSON and wrapped in a root level array,
# - a lone trailing comma: removed from the output,
# - redundant closing braces or brackets: skipped.
#
# @return [String] the repaired text accumulated in @output
# @raise [JSONRepairError] via throw_unexpected_end when no root value could be
#   parsed, or via throw_unexpected_character when unparsable input remains
def repair
  throw_unexpected_end unless parse_value

  had_comma = parse_character(COMMA)
  parse_whitespace_and_skip_comments if had_comma

  if start_of_value?(@json[@index]) && ends_with_comma_or_newline?(@output)
    # A new value starts after the end of the root value: this looks like
    # newline delimited JSON, so turn it into a root level array.
    # Repair the separating comma first when it is missing.
    @output = insert_before_last_whitespace(@output, ',') unless had_comma
    parse_newline_delimited_json
  elsif had_comma
    # repair: remove trailing comma
    @output = strip_last_occurrence(@output, ',')
  end

  # repair: skip redundant closing braces and brackets
  closers = [CLOSING_BRACE, CLOSING_BRACKET]
  while closers.include?(@json[@index])
    @index += 1
    parse_whitespace_and_skip_comments
  end

  # anything left over at this point cannot be repaired
  throw_unexpected_character if @index < @json.length

  # reached the end of the document properly
  @output
end
70
-
71
- private
72
-
73
# Parse a single value of any kind, skipping whitespace and comments on both
# sides of it.
#
# The parsers are tried in a fixed order and the first one that succeeds wins.
# NOTE(review): parse_keywords is not defined in the visible part of this
# file; presumably it comes from elsewhere (e.g. the StringUtils mixin) -
# confirm.
#
# @return [Boolean, nil] the truthy result of the parser that matched, or the
#   falsy result of the last parser when none matched
def parse_value
  parse_whitespace_and_skip_comments

  matched = parse_object ||
            parse_array ||
            parse_string ||
            parse_number ||
            parse_keywords ||
            parse_unquoted_string

  matched.tap { parse_whitespace_and_skip_comments }
end
80
-
81
# Consume a run of whitespace at the cursor and copy it to @output.
#
# Characters accepted by whitespace? are copied as they are; characters
# accepted only by special_whitespace? are replaced with a plain space.
#
# @return [Boolean] true when at least one character was consumed
def parse_whitespace
  collected = +''

  loop do
    char = @json[@index]
    break if char.nil?

    if whitespace?(char)
      collected << char
    elsif special_whitespace?(char)
      collected << ' '
    else
      break
    end

    @index += 1
  end

  return false if collected.empty?

  @output += collected
  true
end
96
-
97
# Skip one comment starting at the cursor. Nothing is written to @output.
#
# - Block comment: the cursor ends up right after the closing "*/".
# - Line comment: the cursor ends up on the terminating newline (the newline
#   itself is not consumed) or at the end of the text.
#
# An unterminated block comment consumes the rest of the text. The cursor is
# then placed exactly at the end of the text, never beyond it, so that code
# indexing relative to @index keeps working.
#
# @return [Boolean] true when a comment was skipped, false when the cursor is
#   not at the start of a comment
def parse_comment
  return false unless @json[@index] == '/'

  case @json[@index + 1]
  when '*'
    # Block comment
    closing = @json.index('*/', @index + 2)
    @index = closing ? closing + 2 : @json.length
    true
  when '/'
    # Line comment
    @index = @json.index("\n", @index + 2) || @json.length
    true
  else
    false
  end
end
113
-
114
# Parse an object like '{"key": "value"}' and append its repaired form to
# @output.
#
# Repairs applied along the way: a leading comma, missing commas between
# members, ellipsis placeholders, unquoted keys, a trailing comma, a missing
# colon, a missing value (replaced by null) and a missing closing brace.
#
# @return [Boolean] true when an object was parsed, false when the cursor is
#   not on an opening brace
# @raise [JSONRepairError] via throw_object_key_expected or
#   throw_colon_expected when the input cannot be repaired
def parse_object
  return false if @json[@index] != OPENING_BRACE

  @output += '{'
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in {, message: "hi"}
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_member = true
  while @index < @json.length && @json[@index] != CLOSING_BRACE
    if first_member
      first_member = false
    else
      # repair missing comma between two members
      @output = insert_before_last_whitespace(@output, ',') unless parse_character(COMMA)
      parse_whitespace_and_skip_comments
    end

    skip_ellipsis

    unless parse_string || parse_unquoted_string
      current = @json[@index]
      at_boundary = current == CLOSING_BRACE || current == OPENING_BRACE ||
                    current == CLOSING_BRACKET || current == OPENING_BRACKET ||
                    current.nil?
      if at_boundary
        # repair trailing comma
        @output = strip_last_occurrence(@output, ',')
      else
        throw_object_key_expected
      end
      break
    end

    parse_whitespace_and_skip_comments
    has_colon = parse_character(COLON)
    # remember whether the text ended right after the key (and colon)
    truncated = @index >= @json.length
    unless has_colon
      if start_of_value?(@json[@index]) || truncated
        # repair missing colon
        @output = insert_before_last_whitespace(@output, ':')
      else
        throw_colon_expected
      end
    end

    next if parse_value

    if has_colon || truncated
      # repair missing object value
      @output += 'null'
    else
      throw_colon_expected
    end
  end

  if @json[@index] == CLOSING_BRACE
    @output += '}'
    @index += 1
  else
    # repair missing closing brace
    @output = insert_before_last_whitespace(@output, '}')
  end

  true
end
187
-
188
# Advance the cursor past +char+ when it is the current character. Unlike
# parse_character, nothing is copied to @output.
#
# @param char [String] the character to skip
# @return [Boolean] true when the character was skipped
def skip_character(char)
  return false unless @json[@index] == char

  @index += 1
  true
end
196
-
197
# Skip ellipsis like "[1,2,3,...]" or "[1,2,3,...,9]" or "[...,7,8,9]"
# or a similar construct in objects.
#
# Leading whitespace and comments are processed first. When three consecutive
# dots follow, they are dropped together with an optional comma after them.
#
# @return [Boolean, nil] the result of skipping the optional comma, or nil
#   when there was no ellipsis
def skip_ellipsis
  parse_whitespace_and_skip_comments

  return unless (0..2).all? { |offset| @json[@index + offset] == DOT }

  # repair: remove the ellipsis (three dots) and optionally a comma
  @index += 3
  parse_whitespace_and_skip_comments
  skip_character(COMMA)
end
211
-
212
# Parse a string enclosed by double quotes "...", which can contain escaped
# quotes, and append its repaired form to @output.
#
# Repairs applied:
# - strings enclosed in single quotes or special (quote-like) quotes,
# - an escaped string that starts with a backslash before the quote,
# - unescaped double quotes and control characters inside the string,
# - invalid escape sequences (the backslash is removed),
# - a truncated escape sequence at the end of the text (it is removed),
# - a missing end quote.
#
# The method can run in two stages:
# - First, it assumes the string has a valid end quote.
# - If it turns out that the string does not have a valid end quote followed
#   by a delimiter (which should be the case), the method runs again in a
#   more conservative way, stopping the string at the first next delimiter
#   and fixing the string by inserting a quote there.
#
# NOTE: when the cursor is on a backslash that is not followed by a quote,
# false is returned but the backslash has already been skipped.
#
# @param stop_at_delimiter [Boolean] true for the conservative second stage
# @return [Boolean] true when a string was parsed, false when the cursor is
#   not on a quote
# @raise [JSONRepairError] via throw_invalid_unicode_character or
#   throw_invalid_character when the content cannot be repaired
def parse_string(stop_at_delimiter: false)
  skip_escape_chars = false
  if @json[@index] == BACKSLASH
    # repair: remove the first escape character
    @index += 1
    skip_escape_chars = true
  end

  return false unless quote?(@json[@index])

  # Double quotes are correct JSON. Single quotes come from JavaScript for
  # example, we assume they have a correct single end quote too. Otherwise we
  # match any double-quote-like start with a double-quote-like end, or any
  # single-quote-like start with a single-quote-like end.
  is_end_quote = if double_quote?(@json[@index])
                   method(:double_quote?)
                 elsif single_quote?(@json[@index])
                   method(:single_quote?)
                 elsif single_quote_like?(@json[@index])
                   method(:single_quote_like?)
                 else
                   method(:double_quote_like?)
                 end

  # state to roll back to when the string has to be parsed again
  i_before = @index
  o_before = @output.length

  str = '"'
  @index += 1

  loop do
    if @index >= @json.length
      # end of text, we are missing an end quote

      i_prev = prev_non_whitespace_index(@index - 1)
      if !stop_at_delimiter && delimiter?(@json[i_prev])
        # The text ends with a delimiter, like ["hello], so the missing end
        # quote should be inserted before this delimiter: retry parsing the
        # string, stopping at the first next delimiter.
        @index = i_before
        @output = @output[...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      return true
    elsif is_end_quote.call(@json[@index])
      # end quote
      i_quote = @index
      o_quote = str.length
      str += '"'
      @index += 1
      @output += str

      parse_whitespace_and_skip_comments

      if stop_at_delimiter ||
         @index >= @json.length ||
         delimiter?(@json[@index]) ||
         quote?(@json[@index]) ||
         digit?(@json[@index])
        # The quote is followed by the end of the text, a delimiter, or a next value
        parse_concatenated_string

        return true
      end

      if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
        # This is not the right end quote: it is preceded by a delimiter,
        # and NOT followed by a delimiter. So, there is an end quote missing:
        # parse the string again and then stop at the first next delimiter.
        @index = i_before
        @output = @output[...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # revert to right after the quote but before any whitespace, and continue parsing the string
      @output = @output[...o_before]
      @index = i_quote + 1

      # repair unescaped quote
      str = "#{str[...o_quote]}\\#{str[o_quote..]}"
    elsif stop_at_delimiter && delimiter?(@json[@index])
      # we're in the mode to stop the string at the first delimiter
      # because there is an end quote missing

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      parse_concatenated_string

      return true
    elsif @json[@index] == BACKSLASH
      # handle escaped content like \n or \u2605
      char = @json[@index + 1]
      if char.nil?
        # repair: the text ends with a lone backslash, so there is nothing to
        # escape. Drop the backslash; the next iteration sees the end of the
        # text and inserts the missing end quote.
        @index = @json.length
      elsif ESCAPE_CHARACTERS.key?(char)
        str += @json[@index, 2]
        @index += 2
      elsif char == 'u'
        # count the hex digits following "\u" (j ends at 6 when all 4 are present)
        j = 2
        j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
        if j == 6
          str += @json[@index, 6]
          @index += 6
        elsif @index + j >= @json.length
          # repair invalid or truncated unicode char at the end of the text
          # by removing the unicode char and ending the string here
          @index = @json.length
        else
          throw_invalid_unicode_character
        end
      else
        # repair invalid escape character: remove the backslash
        str += char
        @index += 2
      end
    else
      # handle regular characters
      char = @json[@index]

      if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
        # repair unescaped double quote
        str += "\\#{char}"
      elsif control_character?(char)
        # unescaped control character
        str += CONTROL_CHARACTERS[char]
      else
        throw_invalid_character(char) unless valid_string_character?(char)
        str += char
      end

      @index += 1
    end

    # repair: the whole string was escaped, so skip the escape character that
    # may precede the next character as well
    skip_escape_character if skip_escape_chars
  end
end
370
-
371
# Repair an unquoted string by adding quotes around it.
# Repair a MongoDB function call like NumberLong("2").
# Repair a JSONP function call like callback({...});
#
# The unquoted text runs up to the first delimiter (slashes excluded) or
# quote. The literal text "undefined" is turned into null. Any other text is
# written as a JSON string: double quotes and backslashes are escaped, and
# control characters are written as JSON escape sequences. Ruby's
# String#inspect is deliberately not used for this, because it emits Ruby-only
# escapes (such as \#$, \e or \x7F) that are not valid JSON.
#
# @return [Boolean, nil] true when something was parsed, nil when the cursor
#   is directly on a delimiter, a quote or the end of the text
def parse_unquoted_string
  start = @index
  @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
  return if @index <= start

  if @json[@index] == '(' && function_name?(@json[start...@index].strip)
    # Repair a MongoDB function call like NumberLong("2")
    # Repair a JSONP function call like callback({...});
    @index += 1

    parse_value

    if @json[@index] == ')'
      # Repair: skip close bracket of function call
      @index += 1
      # Repair: skip semicolon after JSONP call
      @index += 1 if @json[@index] == ';'
    end
  else
    # Repair unquoted string
    # Also, repair undefined into null

    # First, go back to prevent getting trailing whitespaces in the string.
    # Never move back past the start of the unquoted text itself.
    @index -= 1 while @index > start && whitespace?(@json[@index - 1])

    symbol = @json[start...@index]
    if symbol == 'undefined'
      @output += 'null'
    else
      escaped = symbol.each_char.map do |char|
        if char == '"' || char == '\\'
          "\\#{char}"
        elsif char < ' '
          # control character: use the short escape when there is one,
          # otherwise the generic \uXXXX form
          CONTROL_CHARACTERS.fetch(char) { format('\\u%04x', char.ord) }
        else
          char
        end
      end
      @output += "\"#{escaped.join}\""
    end

    # We had a missing start quote, but now we encountered the end quote, so we can skip that one
    @index += 1 if @json[@index] == '"'
  end

  true
end
410
-
411
# Consume +char+ when it is the current character and copy it to @output.
#
# @param char [String] the character expected at the cursor
# @return [Boolean] true when the character was consumed
def parse_character(char)
  current = @json[@index]
  return false unless current == char

  @output += current
  @index += 1
  true
end
420
-
421
# Consume all whitespace and comments at the cursor.
#
# Whitespace is handled by parse_whitespace and comments by parse_comment.
# After every comment another round is attempted, so any mix is consumed in
# full, including comments that directly follow each other without whitespace
# in between, like /* a *//* b */.
#
# @return [Boolean] true when the cursor moved
def parse_whitespace_and_skip_comments
  start = @index

  parse_whitespace
  # a comment may be followed by whitespace and/or by another comment
  parse_whitespace while parse_comment

  @index > start
end
433
-
434
# Parse a number like 2.4 or 2.4e6 and append it to @output.
#
# Repairs applied:
# - a number that is cut off right after "-", "." or the exponent marker gets
#   a "0" appended (see repair_number_ending_with_numeric_symbol),
# - a number with leading zeros like "00789" or "-00789" is not valid JSON and
#   is written as a string instead, because the leading zeros can have meaning
#   and should not be dropped.
#
# When the text at the cursor turns out not to be a number, the cursor is
# restored so that the text can be parsed as another type.
#
# @return [Boolean] true when a number was parsed
def parse_number
  start = @index

  if @json[@index] == '-'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
  end

  # Note that in JSON leading zeros like "00789" are not allowed.
  # We allow all leading zeros here though, and check for them at the end of
  # this method to repair them when needed.
  @index += 1 while digit?(@json[@index])

  if @json[@index] == '.'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  if @json[@index]&.downcase == 'e'
    @index += 1
    @index += 1 if ['-', '+'].include?(@json[@index])
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  # if we're not at the end of the number by this point, allow this to be parsed as another type
  unless at_end_of_number?
    @index = start
    return false
  end

  return false unless @index > start

  # repair a number with leading zeros like "00789" or "-00789"
  num = @json[start...@index]
  has_invalid_leading_zero = num.match?(/\A-?0\d/)

  @output += has_invalid_leading_zero ? "\"#{num}\"" : num
  true
end
499
-
500
# Tell whether a number may end at the cursor: at the end of the text, or
# when the current character is a delimiter or whitespace.
#
# @return [Boolean]
def at_end_of_number?
  return true if @index >= @json.length

  current = @json[@index]
  delimiter?(current) || whitespace?(current)
end
503
-
504
# Parse an array like '["item1", "item2", ...]' and append its repaired form
# to @output.
#
# Repairs applied along the way: a leading comma, missing commas between
# items, ellipsis placeholders, a trailing comma and a missing closing
# bracket.
#
# @return [Boolean] true when an array was parsed, false when the cursor is
#   not on an opening bracket
def parse_array
  return false unless @json[@index] == OPENING_BRACKET

  @output += '['
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in [,1,2,3]
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_item = true
  while @index < @json.length && @json[@index] != CLOSING_BRACKET
    if first_item
      first_item = false
    elsif !parse_character(COMMA)
      # repair missing comma
      @output = insert_before_last_whitespace(@output, ',')
    end

    skip_ellipsis

    unless parse_value
      # no value after the comma: repair trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end
  end

  if @json[@index] == CLOSING_BRACKET
    @output += ']'
    @index += 1
  else
    # repair missing closing array bracket
    @output = insert_before_last_whitespace(@output, ']')
  end

  true
end
547
-
548
# Walk backwards from +start+ over whitespace in the input text.
#
# The walk never goes below index 0, so 0 is returned even when the first
# character is whitespace. A +start+ of 0 or less is returned unchanged.
#
# @param start [Integer] index to start looking at
# @return [Integer] index of the nearest non-whitespace character at or
#   before +start+ (see the boundary notes above)
def prev_non_whitespace_index(start)
  cursor = start
  cursor -= 1 until cursor <= 0 || !whitespace?(@json[cursor])
  cursor
end
553
-
554
# Repair concatenated strings like "hello" + "world", change this into "helloworld"
#
# Expects the first string to be the last thing written to @output. For every
# "+" that follows, the end quote of the string written so far is removed and
# the next string is parsed and appended without its start quote.
#
# @return [Boolean] true when at least one "+" was processed
def parse_concatenated_string
  found_plus = false

  parse_whitespace_and_skip_comments
  while @json[@index] == PLUS
    found_plus = true
    @index += 1
    parse_whitespace_and_skip_comments

    # repair: remove the end quote of the first string
    @output = strip_last_occurrence(@output, '"', strip_remaining_text: true)
    splice_at = @output.length

    @output = if parse_string
                # repair: remove the start quote of the second string
                remove_at_index(@output, splice_at, 1)
              else
                # repair: the '+' is not followed by a string, so drop it and
                # put back the end quote that was removed above
                insert_before_last_whitespace(@output, '"')
              end
  end

  found_plus
end
579
-
580
# Repair a number that is cut off at the end.
#
# parse_number only calls this when the number ends right after a '.', '-' or
# 'e'. Appending a single "0" does not change the number more than is needed
# to make it valid JSON.
#
# @param start [Integer] index in the input text where the number starts
# @return [String] the updated @output
def repair_number_ending_with_numeric_symbol(start)
  fragment = @json[start...@index]
  @output += [fragment, '0'].join
end
586
-
587
# Parse and repair Newline Delimited JSON (NDJSON):
# multiple JSON objects separated by a newline character.
#
# Values are parsed until no further value is found. From the second value
# on, each value is preceded by an optional comma, which is inserted when
# missing. Finally the whole output is wrapped in array brackets.
#
# @return [String] the updated @output
def parse_newline_delimited_json
  first_value = true

  loop do
    if first_value
      first_value = false
    elsif !parse_character(COMMA)
      # repair: add missing comma
      @output = insert_before_last_whitespace(@output, ',')
    end

    break unless parse_value
  end

  # the loop only ends after a value could not be parsed, which leaves a
  # comma behind. repair: remove trailing comma
  @output = strip_last_occurrence(@output, ',')

  # repair: wrap the output inside array brackets
  @output = "[\n#{@output}\n]"
end
616
-
617
# Advance the cursor past a backslash when it is the current character.
# Nothing is copied to @output.
#
# @return [Boolean] true when a backslash was skipped
def skip_escape_character
  return false unless @json[@index] == BACKSLASH

  @index += 1
  true
end
620
-
621
# Abort the repair because of a character that is not allowed in a string.
#
# @param char [String] the offending character
# @raise [JSONRepairError] always; the message holds the character and the
#   current index
def throw_invalid_character(char)
  message = format('Invalid character %s at index %s', char.inspect, @index)
  raise JSONRepairError, message
end
624
-
625
# Abort the repair because the character at the cursor cannot be handled.
#
# @raise [JSONRepairError] always; the message holds the character at the
#   cursor and the current index
def throw_unexpected_character
  message = format('Unexpected character %s at index %s', @json[@index].inspect, @index)
  raise JSONRepairError, message
end
628
-
629
# Abort the repair because the text ended before a value was found.
#
# @raise [JSONRepairError] always
def throw_unexpected_end
  message = 'Unexpected end of json string'
  raise JSONRepairError, message
end
632
-
633
# Abort the repair because an object key was expected at the cursor.
#
# @raise [JSONRepairError] always
def throw_object_key_expected
  message = 'Object key expected'
  raise JSONRepairError, message
end
636
-
637
# Abort the repair because a colon was expected between a key and its value.
#
# @raise [JSONRepairError] always
def throw_colon_expected
  message = 'Colon expected'
  raise JSONRepairError, message
end
640
-
641
# Abort the repair because of an invalid \uXXXX escape sequence.
#
# @raise [JSONRepairError] always; the message holds the (up to) six
#   characters starting at the cursor and the current index
def throw_invalid_unicode_character
  sequence = @json[@index, 6]
  message = format('Invalid unicode character %s at index %s', sequence.inspect, @index)
  raise JSONRepairError, message
end
645
- end
646
- end
647
- end