json-repair 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,647 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
require 'json'

require_relative 'string_utils'
4
-
5
- module JSON
6
- module Repair
7
- class Repairer
8
- include StringUtils
9
-
10
# Raw control characters that may not appear unescaped inside a JSON string,
# each mapped to the two-character escape sequence that replaces it.
CONTROL_CHARACTERS = [
  ["\b", '\b'],
  ["\f", '\f'],
  ["\n", '\n'],
  ["\r", '\r'],
  ["\t", '\t']
].to_h.freeze
17
-
18
# Characters that may legally follow a backslash inside a JSON string
# (besides `u`), mapped to the character the escape sequence stands for.
ESCAPE_CHARACTERS = ['"', '\\', '/'].to_h { |char| [char, char] }.merge(
  'b' => "\b",
  'f' => "\f",
  'n' => "\n",
  'r' => "\r",
  't' => "\t"
).freeze
28
-
29
# Set up the state for repairing one document.
#
# @param json [String] the (possibly malformed) JSON text to repair
def initialize(json)
  # Repaired text accumulated so far.
  @output = ''
  # Position of the next unread character in @json.
  @index = 0
  @json = json
end
34
-
35
# Repair the document given to the constructor.
#
# Parses one root value, then handles what may follow it: a trailing comma,
# further root values (newline delimited JSON, which is wrapped in an array)
# and redundant closing braces/brackets.
#
# @return [String] the repaired text
# @raise [JSONRepairError] when no value is found or unexpected text remains
def repair
  throw_unexpected_end unless parse_value

  had_comma = parse_character(COMMA)
  parse_whitespace_and_skip_comments if had_comma

  if start_of_value?(@json[@index]) && ends_with_comma_or_newline?(@output)
    # A new value starts right after the root value: this looks like newline
    # delimited JSON, so turn it into a root level array.
    # repair: missing comma between the first two values
    @output = insert_before_last_whitespace(@output, ',') unless had_comma

    parse_newline_delimited_json
  elsif had_comma
    # repair: remove trailing comma
    @output = strip_last_occurrence(@output, ',')
  end

  # repair: skip redundant closing braces and brackets
  while [CLOSING_BRACE, CLOSING_BRACKET].include?(@json[@index])
    @index += 1
    parse_whitespace_and_skip_comments
  end

  # Reached the end of the document properly.
  return @output if @index >= @json.length

  throw_unexpected_character
end
70
-
71
- private
72
-
73
# Parse one value of any kind, consuming the whitespace and comments around
# it. The parsers are tried in order; the first one that succeeds wins.
#
# @return [Object] truthy when a value was parsed, falsy otherwise
def parse_value
  parse_whitespace_and_skip_comments

  (parse_object || parse_array || parse_string ||
    parse_number || parse_keywords || parse_unquoted_string).tap do
    parse_whitespace_and_skip_comments
  end
end
80
-
81
# Consume a run of whitespace at the current position and copy it to the
# output. Characters matched by special_whitespace? are replaced by a
# regular space.
#
# @return [Boolean] true when at least one character was consumed
def parse_whitespace
  consumed = ''

  loop do
    char = @json[@index]
    break unless char

    if whitespace?(char)
      consumed += char
    elsif special_whitespace?(char)
      consumed += ' '
    else
      break
    end

    @index += 1
  end

  return false if consumed.empty?

  @output += consumed
  true
end
96
-
97
# Skip a block comment (/* ... */) or a line comment (// ...) starting at the
# current position. Nothing is written to the output.
#
# A line comment ends before the newline character. An unterminated block
# comment consumes the rest of the text; the index then ends up two positions
# past the end of the text.
#
# @return [Boolean] true when a comment was skipped
def parse_comment
  case @json[@index, 2]
  when '/*'
    closing = @json.index('*/', @index + 2)
    @index = (closing || @json.length) + 2
    true
  when '//'
    @index = @json.index("\n", @index + 2) || @json.length
    true
  else
    false
  end
end
113
-
114
# Parse an object like '{"key": "value"}'.
#
# Repairs applied along the way: leading and trailing commas, missing commas
# between members, ellipsis placeholders, unquoted keys, a missing colon, a
# missing value (replaced by null) and a missing closing brace.
#
# @return [Boolean] false when there is no object at the current position
# @raise [JSONRepairError] when a key or a colon is expected but not found
def parse_object
  return false unless @json[@index] == OPENING_BRACE

  @output += '{'
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in {, message: "hi"}
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_member = true
  until @index >= @json.length || @json[@index] == CLOSING_BRACE
    if first_member
      first_member = false
    else
      # repair: missing comma between two members
      @output = insert_before_last_whitespace(@output, ',') unless parse_character(COMMA)
      parse_whitespace_and_skip_comments
    end

    skip_ellipsis

    unless parse_string || parse_unquoted_string
      upcoming = @json[@index]
      at_boundary = upcoming.nil? ||
                    [CLOSING_BRACE, OPENING_BRACE, CLOSING_BRACKET, OPENING_BRACKET].include?(upcoming)
      throw_object_key_expected unless at_boundary

      # repair: trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end

    parse_whitespace_and_skip_comments
    has_colon = parse_character(COLON)
    truncated = @index >= @json.length
    unless has_colon
      throw_colon_expected unless start_of_value?(@json[@index]) || truncated

      # repair: missing colon
      @output = insert_before_last_whitespace(@output, ':')
    end

    next if parse_value

    throw_colon_expected unless has_colon || truncated

    # repair: missing object value
    @output += 'null'
  end

  if @json[@index] == CLOSING_BRACE
    @output += '}'
    @index += 1
  else
    # repair: missing closing brace
    @output = insert_before_last_whitespace(@output, '}')
  end

  true
end
187
-
188
# Step over the given character when it is the next one in the input,
# without copying it to the output.
#
# @param char [String] the character to skip
# @return [Boolean] true when the character was present and skipped
def skip_character(char)
  return false unless @json[@index] == char

  @index += 1
  true
end
196
-
197
# Skip an ellipsis like in "[1,2,3,...]", "[1,2,3,...,9]" or "[...,7,8,9]",
# or a similar construct in objects. Leading whitespace and comments are
# consumed first.
#
# @return [Boolean, nil] nil when no ellipsis was found, otherwise whether a
#   comma following the ellipsis was skipped as well
def skip_ellipsis
  parse_whitespace_and_skip_comments

  return unless (0..2).all? { |offset| @json[@index + offset] == DOT }

  # repair: remove the ellipsis (three dots) and optionally a comma
  @index += 3
  parse_whitespace_and_skip_comments
  skip_character(COMMA)
end
211
-
212
# Parse a string enclosed by double quotes "...". Can contain escaped quotes.
# Repairs strings enclosed in single quotes or special quotes, and escaped
# strings (a backslash in front of the start quote).
#
# The method can run in two stages:
# - First, it assumes the string has a valid end quote.
# - If it turns out that the string does not have a valid end quote followed
#   by a delimiter (which should be the case), the method runs again in a
#   more conservative way, stopping the string at the first next delimiter
#   and fixing the string by inserting a quote there.
#
# @param stop_at_delimiter [Boolean] run in the conservative second stage
# @return [Boolean] false when there is no string at the current position
# @raise [JSONRepairError] on an invalid character or an invalid \u escape
def parse_string(stop_at_delimiter: false)
  if @json[@index] == BACKSLASH
    # repair: remove the first escape character
    @index += 1
    skip_escape_chars = true
  end

  return false unless quote?(@json[@index])

  # Double quotes are correct JSON. Single quotes come from JavaScript for
  # example; we assume there is a matching single end quote too. Otherwise
  # any double-quote-like start is matched with a double-quote-like end, and
  # any single-quote-like start with a single-quote-like end.
  is_end_quote = if double_quote?(@json[@index])
                   method(:double_quote?)
                 elsif single_quote?(@json[@index])
                   method(:single_quote?)
                 elsif single_quote_like?(@json[@index])
                   method(:single_quote_like?)
                 else
                   method(:double_quote_like?)
                 end

  # Remember where we started so the second stage can start over.
  i_before = @index
  o_before = @output.length

  str = '"'
  @index += 1

  loop do
    if @index >= @json.length
      # end of text, we are missing an end quote

      i_prev = prev_non_whitespace_index(@index - 1)
      if !stop_at_delimiter && delimiter?(@json[i_prev])
        # The text ends with a delimiter, like ["hello], so the missing end
        # quote should be inserted before this delimiter: retry parsing the
        # string, stopping at the first next delimiter.
        @index = i_before
        @output = @output[...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      return true
    elsif is_end_quote.call(@json[@index])
      # end quote
      i_quote = @index
      o_quote = str.length
      str += '"'
      @index += 1
      @output += str

      parse_whitespace_and_skip_comments

      if stop_at_delimiter ||
         @index >= @json.length ||
         delimiter?(@json[@index]) ||
         quote?(@json[@index]) ||
         digit?(@json[@index])
        # The quote is followed by the end of the text, a delimiter, or a next value
        parse_concatenated_string

        return true
      end

      if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
        # This is not the right end quote: it is preceded by a delimiter and
        # NOT followed by a delimiter, so there is an end quote missing.
        # Parse the string again and stop at the first next delimiter.
        @index = i_before
        @output = @output[...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # revert to right after the quote but before any whitespace, and continue parsing the string
      @output = @output[...o_before]
      @index = i_quote + 1

      # repair unescaped quote
      str = "#{str[...o_quote]}\\#{str[o_quote..]}"
    elsif stop_at_delimiter && delimiter?(@json[@index])
      # we're in the mode to stop the string at the first delimiter
      # because there is an end quote missing

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      parse_concatenated_string

      return true
    elsif @json[@index] == BACKSLASH
      # handle escaped content like \n or \u2605
      char = @json[@index + 1]
      if char.nil?
        # repair: a dangling backslash is the last character of the text.
        # Drop it; the next iteration closes the string at the end of text.
        @index += 1
      elsif ESCAPE_CHARACTERS[char]
        str += @json[@index, 2]
        @index += 2
      elsif char == 'u'
        j = 2
        j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
        if j == 6
          str += @json[@index, 6]
          @index += 6
        elsif @index + j >= @json.length
          # repair invalid or truncated unicode char at the end of the text
          # by removing the unicode char and ending the string here
          @index = @json.length
        else
          throw_invalid_unicode_character
        end
      else
        # repair invalid escape character: remove the backslash
        str += char
        @index += 2
      end
    else
      # handle regular characters
      char = @json[@index]

      if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
        # repair unescaped double quote
        str += "\\#{char}"
      elsif control_character?(char)
        # unescaped control character
        str += CONTROL_CHARACTERS[char]
      else
        throw_invalid_character(char) unless valid_string_character?(char)
        str += char
      end

      @index += 1
    end

    # repair: the whole string was escaped, so skip the escape character
    # that precedes each following character
    skip_escape_character if skip_escape_chars
  end
end
370
-
371
# Repair an unquoted string by adding quotes around it.
# Repair a MongoDB function call like NumberLong("2").
# Repair a JSONP function call like callback({...});
#
# The unquoted text runs up to the next delimiter (other than a slash) or
# quote. The literal text `undefined` is turned into null.
#
# @return [Boolean, nil] true when something was consumed, nil otherwise
def parse_unquoted_string
  start = @index
  @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
  return if @index <= start

  if @json[@index] == '(' && function_name?(@json[start...@index].strip)
    # Repair a MongoDB function call like NumberLong("2")
    # Repair a JSONP function call like callback({...});
    # Only the argument is kept: the name and parentheses are dropped.
    @index += 1

    parse_value

    if @json[@index] == ')'
      # Repair: skip close bracket of function call
      @index += 1
      # Repair: skip semicolon after JSONP call
      @index += 1 if @json[@index] == ';'
    end
  else
    # Repair unquoted string
    # Also, repair undefined into null

    # First, go back to prevent getting trailing whitespaces in the string
    @index -= 1 while @index.positive? && whitespace?(@json[@index - 1])

    symbol = @json[start...@index]
    # JSON.generate emits a JSON string literal. String#inspect emits Ruby
    # literal syntax instead, which is not always valid JSON (for example it
    # escapes a '#' that precedes '$', '@' or '{').
    @output += symbol == 'undefined' ? 'null' : JSON.generate(symbol)

    if @json[@index] == '"'
      # We had a missing start quote, but now we encountered the end quote, so we can skip that one
      @index += 1
    end
  end

  true
end
410
-
411
# Copy the given character to the output when it is the next one in the
# input, and step over it.
#
# @param char [String] the character expected at the current position
# @return [Boolean] true when the character was present and copied
def parse_character(char)
  return false unless @json[@index] == char

  @output += @json[@index]
  @index += 1
  true
end
420
-
421
# Consume any mix of whitespace and comments at the current position.
# Whitespace is copied to the output (via parse_whitespace); comments are
# dropped (via parse_comment).
#
# Keeps going for as long as comments are found, so comments that directly
# follow each other without whitespace in between, like /*a*//*b*/, are all
# skipped.
#
# @return [Boolean] true when the position advanced
def parse_whitespace_and_skip_comments
  start = @index

  parse_whitespace
  parse_whitespace while parse_comment

  @index > start
end
433
-
434
# Parse a number like 2.4 or 2.4e6.
#
# Repairs applied:
# - a number cut off right after '-', '.' or the exponent marker gets a '0'
#   appended (see repair_number_ending_with_numeric_symbol);
# - a number with leading zeros like "00789" or "-0123" is not valid JSON
#   and is turned into a string, because the zeros can have meaning.
#
# When the text is not a number after all, the position is restored so that
# another parser can have a go.
#
# @return [Boolean] true when a number was parsed and written to the output
def parse_number
  start = @index
  if @json[@index] == '-'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
  end

  # Note that in JSON leading zeros like "00789" are not allowed.
  # All leading zeros are accepted here; they are checked and repaired at
  # the end of this method.
  @index += 1 while digit?(@json[@index])

  if @json[@index] == '.'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  if @json[@index]&.downcase == 'e'
    @index += 1
    @index += 1 if ['-', '+'].include?(@json[@index])
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  # if we're not at the end of the number by this point, allow this to be parsed as another type
  unless at_end_of_number?
    @index = start
    return false
  end

  return false unless @index > start

  num = @json[start...@index]
  # An optional minus sign followed by a zero and another digit: quote the
  # number, since emitting it as is would not be valid JSON.
  has_invalid_leading_zero = num.match?(/\A-?0\d/)

  @output += has_invalid_leading_zero ? "\"#{num}\"" : num
  true
end
499
-
500
# Tell whether a number can end at the current position: at the end of the
# text, or right before a delimiter or whitespace.
#
# @return [Boolean]
def at_end_of_number?
  return true if @index >= @json.length

  upcoming = @json[@index]
  delimiter?(upcoming) || whitespace?(upcoming)
end
503
-
504
# Parse an array like '["item1", "item2", ...]'.
#
# Repairs applied along the way: leading and trailing commas, missing commas
# between items, ellipsis placeholders and a missing closing bracket.
#
# @return [Boolean] false when there is no array at the current position
def parse_array
  return false unless @json[@index] == OPENING_BRACKET

  @output += '['
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in [,1,2,3]
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_item = true
  until @index >= @json.length || @json[@index] == CLOSING_BRACKET
    if first_item
      first_item = false
    elsif !parse_character(COMMA)
      # repair: missing comma between two items
      @output = insert_before_last_whitespace(@output, ',')
    end

    skip_ellipsis

    unless parse_value
      # repair: trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end
  end

  if @json[@index] == CLOSING_BRACKET
    @output += ']'
    @index += 1
  else
    # repair: missing closing array bracket
    @output = insert_before_last_whitespace(@output, ']')
  end

  true
end
547
-
548
# Walk backwards from the given position to the nearest character that is
# not whitespace. The walk never goes below index 0, and a start position at
# or below 0 is returned unchanged.
#
# @param start [Integer] position to start looking from
# @return [Integer] position of the character found
def prev_non_whitespace_index(start)
  cursor = start
  cursor -= 1 until cursor <= 0 || !whitespace?(@json[cursor])
  cursor
end
553
-
554
# Repair concatenated strings like "hello" + "world": change this into
# "helloworld". Handles any number of '+' operators in a row.
#
# @return [Boolean] true when at least one '+' was processed
def parse_concatenated_string
  joined_any = false

  parse_whitespace_and_skip_comments
  while skip_character(PLUS)
    joined_any = true
    parse_whitespace_and_skip_comments

    # repair: remove the end quote of the first string
    @output = strip_last_occurrence(@output, '"', strip_remaining_text: true)
    seam = @output.length

    if parse_string
      # repair: remove the start quote of the second string
      @output = remove_at_index(@output, seam, 1)
    else
      # repair: the '+' is not followed by a string, so put the end quote back
      @output = insert_before_last_whitespace(@output, '"')
    end
  end

  joined_any
end
579
-
580
# Repair a number that is cut off right after a '.', '-' or 'e': write the
# text consumed so far followed by a single '0', which changes the number no
# more than needed to make it valid JSON.
#
# @param start [Integer] position where the number starts in the input
# @return [String] the updated output
def repair_number_ending_with_numeric_symbol(start)
  cut_off_number = @json[start...@index]
  @output += "#{cut_off_number}0"
end
586
-
587
# Parse and repair Newline Delimited JSON (NDJSON): multiple JSON values
# separated by a newline character. Commas are inserted between the values
# where missing and the whole output is wrapped inside array brackets.
#
# @return [String] the updated output
def parse_newline_delimited_json
  first_document = true

  loop do
    if first_document
      first_document = false
    elsif !parse_character(COMMA)
      # repair: add missing comma
      @output = insert_before_last_whitespace(@output, ',')
    end

    break unless parse_value
  end

  # The loop only ends after a failed parse, which leaves a comma behind.
  # repair: remove trailing comma
  @output = strip_last_occurrence(@output, ',')

  # repair: wrap the output inside array brackets
  @output = "[\n#{@output}\n]"
end
616
-
617
# Step over a backslash at the current position, if there is one, without
# copying it to the output.
#
# @return [Boolean] true when a backslash was skipped
def skip_escape_character
  return false unless @json[@index] == BACKSLASH

  @index += 1
  true
end
620
-
621
# Abort the repair because of a character that is not allowed in a string.
#
# @param char [String] the offending character
# @raise [JSONRepairError] always
def throw_invalid_character(char)
  message = "Invalid character #{char.inspect} at index #{@index}"
  raise JSONRepairError, message
end
624
-
625
# Abort the repair because of unexpected text at the current position.
#
# @raise [JSONRepairError] always
def throw_unexpected_character
  offending = @json[@index]
  raise JSONRepairError, "Unexpected character #{offending.inspect} at index #{@index}"
end
628
-
629
# Abort the repair because the text ended where a value was expected.
#
# @raise [JSONRepairError] always
def throw_unexpected_end
  message = 'Unexpected end of json string'
  raise JSONRepairError, message
end
632
-
633
# Abort the repair because an object key was expected but not found.
#
# @raise [JSONRepairError] always
def throw_object_key_expected
  message = 'Object key expected'
  raise JSONRepairError, message
end
636
-
637
# Abort the repair because a colon was expected after an object key.
#
# @raise [JSONRepairError] always
def throw_colon_expected
  message = 'Colon expected'
  raise JSONRepairError, message
end
640
-
641
# Abort the repair because of a malformed \uXXXX escape sequence. The error
# message shows up to six characters starting at the current position.
#
# @raise [JSONRepairError] always
def throw_invalid_unicode_character
  raise JSONRepairError, "Invalid unicode character #{@json[@index, 6].inspect} at index #{@index}"
end
645
- end
646
- end
647
- end