json-repair 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,645 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'repair/string_utils'
4
+
5
+ module JSON
6
+ class Repairer
7
+ include Repair::StringUtils
8
+
9
# Raw control characters (keys) and the two-character escape sequence
# (values) that parse_string writes in their place when one of them appears
# unescaped inside a string.
CONTROL_CHARACTERS = {
  "\b" => "\\b",
  "\f" => "\\f",
  "\n" => "\\n",
  "\r" => "\\r",
  "\t" => "\\t"
}.freeze

# Characters that may follow a backslash in a string escape. parse_string
# only tests whether the lookup is truthy; the values are the characters
# the escapes stand for.
ESCAPE_CHARACTERS = {
  "\"" => "\"",
  "\\" => "\\",
  "/" => "/",
  "b" => "\b",
  "f" => "\f",
  "n" => "\n",
  "r" => "\r",
  "t" => "\t"
}.freeze
27
+
28
# Set up the repair state for one input text.
#
# @param json [String] the (possibly broken) JSON text to repair
def initialize(json)
  # @index is the read cursor into @json; @output accumulates the repaired text.
  @json, @index, @output = json, 0, ''
end
33
+
34
# Repair the text given to the constructor and return the repaired JSON.
#
# Parses one root value, then handles what may follow it: a trailing comma
# (removed), further root values (treated as newline delimited JSON and
# wrapped in an array) and redundant closing braces/brackets (skipped).
#
# @return [String] the repaired JSON text
# @raise [JSONRepairError] when no value can be parsed or when unexpected
#   characters remain after the root value
def repair
  throw_unexpected_end unless parse_value

  had_comma = parse_character(COMMA)
  parse_whitespace_and_skip_comments if had_comma

  if start_of_value?(@json[@index]) && ends_with_comma_or_newline?(@output)
    # A new value starts after the end of the root value: this looks like
    # newline delimited JSON, so turn everything into a root level array.
    # repair: add the comma that is missing between the first two values
    @output = insert_before_last_whitespace(@output, ',') unless had_comma
    parse_newline_delimited_json
  elsif had_comma
    # repair: remove trailing comma
    @output = strip_last_occurrence(@output, ',')
  end

  # repair: skip redundant closing braces and brackets
  while [CLOSING_BRACE, CLOSING_BRACKET].include?(@json[@index])
    @index += 1
    parse_whitespace_and_skip_comments
  end

  # Anything still unread at this point cannot be repaired.
  throw_unexpected_character if @index < @json.length

  @output
end
69
+
70
+ private
71
+
72
# Parse a single JSON value of any kind, including surrounding whitespace
# and comments.
#
# The individual parsers are tried in a fixed order and the first one that
# returns a truthy result wins.
#
# @return [Object] the (truthy) result of the parser that matched, or a
#   falsy value when none of them consumed anything
def parse_value
  parse_whitespace_and_skip_comments

  (
    parse_object || parse_array || parse_string ||
    parse_number || parse_keywords || parse_unquoted_string
  ).tap { parse_whitespace_and_skip_comments }
end
79
+
80
# Consume a run of whitespace at the cursor and copy it to the output.
#
# Characters accepted by whitespace? are copied as they are; characters only
# accepted by special_whitespace? are replaced by a regular space.
#
# @return [Boolean] true when at least one character was consumed
def parse_whitespace
  normalized = +''

  while (char = @json[@index]) && (whitespace?(char) || special_whitespace?(char))
    normalized << (whitespace?(char) ? char : ' ')
    @index += 1
  end

  return false if normalized.empty?

  @output += normalized
  true
end
95
+
96
# Skip one comment starting at the cursor. Nothing is written to the output.
#
# A block comment (/* ... */) is skipped up to and including its terminator;
# an unterminated block comment runs to the end of the text. A line comment
# (// ...) is skipped up to, but not including, the next newline.
#
# @return [Boolean] true when a comment was skipped, false when the cursor
#   is not at the start of a comment
def parse_comment
  case @json[@index, 2]
  when '/*'
    terminator = @json.index('*/', @index + 2)
    # Stop exactly at the end of the text when the comment is never closed,
    # so the cursor does not end up beyond the input.
    @index = terminator ? terminator + 2 : @json.length
    true
  when '//'
    @index = @json.index("\n", @index + 2) || @json.length
    true
  else
    false
  end
end
112
+
113
# Parse an object like '{"key": "value"}' and write it to the output.
#
# Repairs applied along the way: a leading comma, missing commas between
# members, ellipses, a trailing comma, a missing colon, a missing value
# (replaced by null) and a missing closing brace.
#
# @return [Boolean] true when an object was parsed, false when the cursor
#   is not at an opening brace
# @raise [JSONRepairError] when a key or a colon is expected but missing
def parse_object
  return false unless @json[@index] == OPENING_BRACE

  @output += '{'
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in {, message: "hi"}
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_member = true
  while @index < @json.length && @json[@index] != CLOSING_BRACE
    if first_member
      first_member = false
    else
      # repair missing comma
      @output = insert_before_last_whitespace(@output, ',') unless parse_character(COMMA)
      parse_whitespace_and_skip_comments
    end

    skip_ellipsis

    unless parse_string || parse_unquoted_string
      stopper = @json[@index]
      closes_or_opens = [CLOSING_BRACE, OPENING_BRACE, CLOSING_BRACKET, OPENING_BRACKET].include?(stopper)
      throw_object_key_expected unless closes_or_opens || stopper.nil?

      # repair trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end

    parse_whitespace_and_skip_comments
    colon_found = parse_character(COLON)
    input_exhausted = @index >= @json.length
    unless colon_found
      throw_colon_expected unless start_of_value?(@json[@index]) || input_exhausted

      # repair missing colon
      @output = insert_before_last_whitespace(@output, ':')
    end

    next if parse_value

    throw_colon_expected unless colon_found || input_exhausted

    # repair missing object value
    @output += 'null'
  end

  if @json[@index] == CLOSING_BRACE
    @output += '}'
    @index += 1
  else
    # repair missing closing brace
    @output = insert_before_last_whitespace(@output, '}')
  end

  true
end
186
+
187
# Advance past the character at the cursor when it equals +char+, without
# writing anything to the output.
#
# @param char [String] the character to skip
# @return [Boolean] true when the character was skipped
def skip_character(char)
  return false unless @json[@index] == char

  @index += 1
  true
end
195
+
196
# Skip an ellipsis like in "[1,2,3,...]", "[1,2,3,...,9]" or "[...,7,8,9]",
# or a similar construct in objects.
#
# Leading whitespace and comments are consumed first. When three dots
# follow, they are dropped together with one optional comma after them.
def skip_ellipsis
  parse_whitespace_and_skip_comments
  return unless (0..2).all? { |offset| @json[@index + offset] == DOT }

  # repair: remove the ellipsis (three dots) and optionally a comma
  @index += 3
  parse_whitespace_and_skip_comments
  skip_character(COMMA)
end
210
+
211
# Parse a string enclosed by double quotes "...", which can contain escaped
# quotes, and write it to the output as a double-quoted string.
# Repairs strings enclosed in single quotes or special quotes, unescaped
# quotes and control characters, invalid escapes and missing end quotes.
#
# The method can run in two stages:
# - First, it assumes the string has a valid end quote.
# - If it turns out that the string does not have a valid end quote followed
#   by a delimiter (which should be the case), the method runs again in a
#   more conservative way, stopping the string at the first next delimiter
#   and fixing the string by inserting a quote there.
#
# @param stop_at_delimiter [Boolean] run in the conservative second stage
# @return [Boolean] true when a string was parsed, false when the cursor is
#   not at a quote character
# @raise [JSONRepairError] on an invalid unicode escape in the middle of the
#   text or on a character rejected by valid_string_character?
def parse_string(stop_at_delimiter: false)
  skip_escape_chars = false
  if @json[@index] == BACKSLASH
    # repair: remove the first escape character
    @index += 1
    skip_escape_chars = true
  end

  return false unless quote?(@json[@index])

  # double quotes are correct JSON,
  # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
  # otherwise, we will match any double-quote-like start with a double-quote-like end,
  # or any single-quote-like start with a single-quote-like end
  is_end_quote = if double_quote?(@json[@index])
                   method(:double_quote?)
                 elsif single_quote?(@json[@index])
                   method(:single_quote?)
                 elsif single_quote_like?(@json[@index])
                   method(:single_quote_like?)
                 else
                   method(:double_quote_like?)
                 end

  # Remember where we started so the second stage can start over.
  i_before = @index
  o_before = @output.length

  str = '"'
  @index += 1

  loop do
    if @index >= @json.length
      # end of text, we are missing an end quote

      i_prev = prev_non_whitespace_index(@index - 1)
      if !stop_at_delimiter && delimiter?(@json[i_prev])
        # if the text ends with a delimiter, like ["hello],
        # the missing end quote should be inserted before this delimiter:
        # retry parsing the string, stopping at the first next delimiter
        @index = i_before
        @output = @output[0...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      return true
    elsif is_end_quote.call(@json[@index])
      # end quote
      i_quote = @index
      o_quote = str.length
      str += '"'
      @index += 1
      @output += str

      parse_whitespace_and_skip_comments

      if stop_at_delimiter ||
         @index >= @json.length ||
         delimiter?(@json[@index]) ||
         quote?(@json[@index]) ||
         digit?(@json[@index])
        # The quote is followed by the end of the text, a delimiter, or a next value
        parse_concatenated_string

        return true
      end

      if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
        # This is not the right end quote: it is preceded by a delimiter,
        # and NOT followed by a delimiter. So, there is an end quote missing:
        # parse the string again and then stop at the first next delimiter
        @index = i_before
        @output = @output[...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # revert to right after the quote but before any whitespace, and continue parsing the string
      @output = @output[...o_before]
      @index = i_quote + 1

      # repair unescaped quote
      str = "#{str[...o_quote]}\\#{str[o_quote..]}"
    elsif stop_at_delimiter && delimiter?(@json[@index])
      # we're in the mode to stop the string at the first delimiter
      # because there is an end quote missing

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      parse_concatenated_string

      return true
    elsif @json[@index] == BACKSLASH
      # handle escaped content like \n or \u2605
      char = @json[@index + 1]
      if ESCAPE_CHARACTERS[char]
        str += @json[@index, 2]
        @index += 2
      elsif char == 'u'
        j = 2
        j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
        if j == 6
          str += @json[@index, 6]
          @index += 6
        elsif @index + j >= @json.length
          # repair invalid or truncated unicode char at the end of the text
          # by removing the unicode char and ending the string here
          @index = @json.length
        else
          throw_invalid_unicode_character
        end
      else
        # repair invalid escape character: remove the backslash, keep the
        # character after it. When the text ends right after the backslash
        # there is no such character (char is nil): append nothing and stop
        # at the end of the text so the missing end quote gets repaired.
        str += char if char
        @index = [@index + 2, @json.length].min
      end
    else
      # handle regular characters
      char = @json[@index]

      if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
        # repair unescaped double quote
        str += "\\#{char}"
      elsif control_character?(char)
        # unescaped control character
        str += CONTROL_CHARACTERS[char]
      else
        throw_invalid_character(char) unless valid_string_character?(char)
        str += char
      end

      @index += 1
    end

    # repair: when the start quote was escaped, also skip an escape
    # character that precedes the next character
    skip_escape_character if skip_escape_chars
  end
end
369
+
370
# Repair an unquoted string by adding quotes around it.
# Repair a MongoDB function call like NumberLong("2").
# Repair a JSONP function call like callback({...});
# Repair undefined into null.
#
# @return [true, nil] true when something was consumed, nil when the cursor
#   is at a delimiter or quote so there is nothing to read
def parse_unquoted_string
  start = @index
  @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
  return if @index <= start

  if @json[@index] == '(' && function_name?(@json[start...@index].strip)
    # Function call: drop the function name and keep only its argument.
    @index += 1

    parse_value

    if @json[@index] == ')'
      # repair: skip close bracket of function call
      @index += 1
      # repair: skip semicolon after JSONP call
      @index += 1 if @json[@index] == ';'
    end
  else
    # First, go back to prevent getting trailing whitespaces in the string;
    # never move back beyond the start of the token.
    @index -= 1 while @index > start && whitespace?(@json[@index - 1])

    symbol = @json[start...@index]
    @output += symbol == 'undefined' ? 'null' : quote_unquoted_symbol(symbol)

    # We had a missing start quote, but now we encountered the end quote, so we can skip that one
    @index += 1 if @json[@index] == '"'
  end

  true
end

# Wrap +text+ in double quotes, escaping it as a JSON string.
#
# String#inspect is not used here because it produces Ruby literal syntax:
# it escapes "#" before "@", "$" and "{", writes control characters such as
# ESC as "\e", and its output depends on the default encoding. None of those
# forms are valid JSON.
#
# @param text [String] the raw text of the unquoted token
# @return [String] the JSON string literal, including the quotes
def quote_unquoted_symbol(text)
  escaped = text.gsub(/["\\\x00-\x1f]/) do |char|
    if ['"', '\\'].include?(char)
      "\\#{char}"
    else
      CONTROL_CHARACTERS.fetch(char) { format('\u%04x', char.ord) }
    end
  end

  "\"#{escaped}\""
end
409
+
410
# Consume the character at the cursor when it equals +char+ and copy it to
# the output.
#
# @param char [String] the expected character
# @return [Boolean] true when the character was consumed
def parse_character(char)
  current = @json[@index]
  return false unless current == char

  @output += current
  @index += 1
  true
end
419
+
420
# Consume whitespace and comments at the cursor: first one run of
# whitespace, then comments for as long as each comment is followed by
# whitespace.
#
# @return [Boolean] true when the cursor moved
def parse_whitespace_and_skip_comments
  origin = @index

  parse_whitespace
  # Keep going only while a comment was found AND whitespace followed it.
  loop { break unless parse_comment && parse_whitespace }

  @index > origin
end
432
+
433
# Parse a number like 2.4 or 2.4e6 and write it to the output.
#
# Repairs a number that is cut off after '-', '.' or the exponent marker,
# and turns a number with leading zeros like 00789 or -00789 into a string,
# because JSON does not allow leading zeros and they can carry meaning.
#
# @return [Boolean] true when a number was consumed; false when the text at
#   the cursor is not a number, in which case the cursor is restored so the
#   text can be parsed as another type
def parse_number
  start = @index
  if @json[@index] == '-'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
  end

  # Note that in JSON leading zeros like "00789" are not allowed.
  # We will allow all leading zeros here though and at the end of parse_number
  # check against leading zeros and repair that if needed.
  # Leading zeros can have meaning, so we should not clear them.
  @index += 1 while digit?(@json[@index])

  if @json[@index] == '.'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  if @json[@index] && @json[@index].downcase == 'e'
    @index += 1
    @index += 1 if ['-', '+'].include?(@json[@index])
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  # if we're not at the end of the number by this point, allow this to be parsed as another type
  unless at_end_of_number?
    @index = start
    return false
  end

  return false unless @index > start

  # repair a number with leading zeros like "00789"; the optional minus sign
  # is taken into account so "-00789" is repaired as well
  num = @json[start...@index]
  has_invalid_leading_zero = num.match?(/\A-?0\d/)

  @output += has_invalid_leading_zero ? "\"#{num}\"" : num
  true
end
498
+
499
# Tell whether a number may end at the cursor: at the end of the text, at a
# delimiter or at whitespace.
#
# @return [Boolean]
def at_end_of_number?
  return true if @index >= @json.length

  current = @json[@index]
  delimiter?(current) || whitespace?(current)
end
502
+
503
# Parse an array like '["item1", "item2", ...]' and write it to the output.
#
# Repairs applied along the way: a leading comma, missing commas between
# items, ellipses, a trailing comma and a missing closing bracket.
#
# @return [Boolean] true when an array was parsed, false when the cursor is
#   not at an opening bracket
def parse_array
  return false unless @json[@index] == OPENING_BRACKET

  @output += '['
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in [,1,2,3]
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_item = true
  while @index < @json.length && @json[@index] != CLOSING_BRACKET
    if first_item
      first_item = false
    elsif !parse_character(COMMA)
      # repair missing comma
      @output = insert_before_last_whitespace(@output, ',')
    end

    skip_ellipsis

    unless parse_value
      # repair trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end
  end

  if @json[@index] == CLOSING_BRACKET
    @output += ']'
    @index += 1
  else
    # repair missing closing array bracket
    @output = insert_before_last_whitespace(@output, ']')
  end

  true
end
546
+
547
# Walk backwards from +start+ to the nearest position that does not hold
# whitespace. The walk never goes below index 0, and index 0 itself is not
# inspected.
#
# @param start [Integer] the position to start from
# @return [Integer] the position found; +start+ itself when it is not positive
def prev_non_whitespace_index(start)
  return start unless start.positive?

  start.downto(1).find { |position| !whitespace?(@json[position]) } || 0
end
552
+
553
# Repair concatenated strings like "hello" + "world" by changing them into
# "helloworld".
#
# For every '+' at the cursor, the end quote of the string already in the
# output is removed and the next string is parsed and glued on.
#
# @return [Boolean] true when at least one '+' was processed
def parse_concatenated_string
  joined = false

  parse_whitespace_and_skip_comments
  while @json[@index] == PLUS
    joined = true
    @index += 1
    parse_whitespace_and_skip_comments

    # repair: remove the end quote of the first string
    @output = strip_last_occurrence(@output, '"', strip_remaining_text: true)
    seam = @output.length

    if parse_string
      # repair: remove the start quote of the second string
      @output = remove_at_index(@output, seam, 1)
    else
      # repair: the '+' is not followed by a string, so drop it and put the
      # end quote back
      @output = insert_before_last_whitespace(@output, '"')
    end
  end

  joined
end
578
+
579
# Repair a number that is cut off right after a numeric symbol.
#
# The callers use this when the number ends after a '-', a '.' or an
# exponent marker. The text read so far is written to the output followed
# by a single 0, which changes the number no more than needed to make it
# valid JSON.
#
# @param start [Integer] the position in the input where the number began
def repair_number_ending_with_numeric_symbol(start)
  truncated = @json[start...@index]
  @output += "#{truncated}0"
end
585
+
586
# Parse and repair Newline Delimited JSON (NDJSON): multiple JSON values
# separated by a newline character.
#
# Values are parsed until none is left. A missing comma between two values
# is inserted, the comma left over after the last value is removed and the
# whole output is wrapped inside array brackets.
#
# @return [String] the new output
def parse_newline_delimited_json
  first_value = true

  loop do
    if first_value
      first_value = false
    else
      # parse optional comma; repair: add it when it is missing
      @output = insert_before_last_whitespace(@output, ',') unless parse_character(COMMA)
    end

    break unless parse_value
  end

  # repair: remove trailing comma
  @output = strip_last_occurrence(@output, ',')

  # repair: wrap the output inside array brackets
  @output = "[\n#{@output}\n]"
end
615
+
616
# Advance past a backslash at the cursor, if there is one, without writing
# it to the output.
#
# @return [Boolean] true when a backslash was skipped
def skip_escape_character
  skip_character(BACKSLASH)
end
619
+
620
# Abort the repair because of a character that is not allowed at the cursor.
#
# @param char [String] the offending character
# @raise [JSONRepairError] always
def throw_invalid_character(char)
  message = "Invalid character #{char.inspect} at index #{@index}"
  raise JSONRepairError, message
end
623
+
624
# Abort the repair because the character at the cursor cannot be handled.
#
# @raise [JSONRepairError] always
def throw_unexpected_character
  offender = @json[@index]
  raise JSONRepairError, "Unexpected character #{offender.inspect} at index #{@index}"
end
627
+
628
# Abort the repair because the text ended where a value was required.
#
# @raise [JSONRepairError] always
def throw_unexpected_end
  message = 'Unexpected end of json string'
  raise JSONRepairError, message
end
631
+
632
# Abort the repair because an object key was required but not found.
#
# @raise [JSONRepairError] always
def throw_object_key_expected
  message = 'Object key expected'
  raise JSONRepairError, message
end
635
+
636
# Abort the repair because a colon between an object key and its value was
# required but not found.
#
# @raise [JSONRepairError] always
def throw_colon_expected
  message = 'Colon expected'
  raise JSONRepairError, message
end
639
+
640
# Abort the repair because of an invalid unicode escape at the cursor. The
# message quotes up to six characters starting at the cursor.
#
# @raise [JSONRepairError] always
def throw_invalid_unicode_character
  raise JSONRepairError, "Invalid unicode character #{@json[@index, 6].inspect} at index #{@index}"
end
644
+ end
645
+ end
data/sig/json/repair.rbs CHANGED
@@ -1,7 +1,7 @@
1
1
  module JSON
2
2
  module Repair
3
3
  VERSION: String
4
-
5
- def self.repair(String) -> ?String
6
4
  end
5
+
6
+ def self.repair(String) -> ?String
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: json-repair
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksandr Zykov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-24 00:00:00.000000000 Z
11
+ date: 2024-06-04 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: This is a simple gem that repairs broken JSON strings.
14
14
  email:
@@ -25,9 +25,9 @@ files:
25
25
  - README.md
26
26
  - Rakefile
27
27
  - lib/json/repair.rb
28
- - lib/json/repair/repairer.rb
29
28
  - lib/json/repair/string_utils.rb
30
29
  - lib/json/repair/version.rb
30
+ - lib/json/repairer.rb
31
31
  - sig/json/repair.rbs
32
32
  homepage: https://github.com/sashazykov/json-repair-rb
33
33
  licenses: