json-repair 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,645 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'repair/string_utils'
4
+
5
+ module JSON
6
+ class Repairer
7
+ include Repair::StringUtils
8
+
9
# Raw control characters that can occur unescaped inside a string, mapped to
# the two-character escape sequence that is written to the output instead.
CONTROL_CHARACTERS = {
  "\b" => '\b', "\f" => '\f', "\n" => '\n', "\r" => '\r', "\t" => '\t'
}.freeze

# Characters that are accepted directly after a backslash inside a string,
# mapped to the character the escape sequence stands for.
ESCAPE_CHARACTERS = {
  '"' => '"', '\\' => '\\', '/' => '/',
  'b' => "\b", 'f' => "\f", 'n' => "\n", 'r' => "\r", 't' => "\t"
}.freeze
27
+
28
# Set up the state for repairing a single document.
#
# @param json [String] the (possibly malformed) JSON text to repair
def initialize(json)
  @output = '' # repaired text produced so far
  @index = 0   # read position inside @json
  @json = json
end
33
+
34
# Repair the document passed to the constructor.
#
# Parses one root value and then deals with whatever follows it: further
# root-level values are treated as newline delimited JSON and wrapped in an
# array, a dangling comma is removed, and redundant closing braces/brackets
# are skipped.
#
# @return [String] the repaired text accumulated in @output
# @raise [JSONRepairError] via throw_unexpected_end when no value could be
#   parsed, or via throw_unexpected_character when unparsed text remains
def repair
  throw_unexpected_end unless parse_value

  had_comma = parse_character(COMMA)
  parse_whitespace_and_skip_comments if had_comma

  if start_of_value?(@json[@index]) && ends_with_comma_or_newline?(@output)
    # A new value starts after the end of the root value: this looks like
    # newline delimited JSON -> turn it into a root level array.
    # repair missing comma between the first two values
    @output = insert_before_last_whitespace(@output, ',') unless had_comma

    parse_newline_delimited_json
  elsif had_comma
    # repair: remove trailing comma
    @output = strip_last_occurrence(@output, ',')
  end

  # repair: skip redundant closing braces and brackets
  while [CLOSING_BRACE, CLOSING_BRACKET].include?(@json[@index])
    @index += 1
    parse_whitespace_and_skip_comments
  end

  # anything still left over could not be interpreted
  throw_unexpected_character if @index < @json.length

  # reached the end of the document properly
  @output
end
69
+
70
+ private
71
+
72
# Parse one value of any kind, including the whitespace and comments around
# it. The parsers are tried in a fixed order and the first one that succeeds
# wins.
#
# @return [Object] truthy when a value was consumed, falsy otherwise
def parse_value
  parse_whitespace_and_skip_comments

  found = parse_object ||
          parse_array ||
          parse_string ||
          parse_number ||
          parse_keywords ||
          parse_unquoted_string

  parse_whitespace_and_skip_comments
  found
end
79
+
80
# Consume a run of whitespace at the current position and append it to the
# output. Characters accepted by whitespace? are copied as they are;
# characters only accepted by special_whitespace? are written as a regular
# space.
#
# @return [Boolean] true when at least one character was consumed
def parse_whitespace
  consumed = ''

  loop do
    char = @json[@index]
    break unless char

    if whitespace?(char)
      consumed += char
    elsif special_whitespace?(char)
      consumed += ' '
    else
      break
    end

    @index += 1
  end

  return false if consumed.empty?

  @output += consumed
  true
end
95
+
96
# Skip a block comment (/* ... */) or a line comment (// ...) starting at the
# current position. Nothing is written to the output.
#
# A line comment ends right before the newline, which is left unconsumed. An
# unterminated block comment runs to the end of the text, and the read
# position then ends up past the end.
#
# @return [Boolean] true when a comment was skipped
def parse_comment
  return false unless @json[@index] == '/'

  case @json[@index + 1]
  when '*'
    # Block comment
    @index += 2
    @index += 1 until @json[@index].nil? || @json[@index, 2] == '*/'
    @index += 2
    true
  when '/'
    # Line comment
    @index += 2
    @index += 1 until [nil, "\n"].include?(@json[@index])
    true
  else
    false
  end
end
112
+
113
# Parse an object like '{"key": "value"}'.
#
# Repairs applied along the way: a leading comma is skipped, missing commas
# between entries are inserted, ellipses are dropped, a trailing comma is
# removed, a missing colon is inserted, a missing value becomes null and a
# missing closing brace is added.
#
# @return [Boolean] false when the current character is not an opening
#   brace, true once the object has been written to the output
# @raise [JSONRepairError] via throw_object_key_expected or
#   throw_colon_expected when the text cannot be repaired
def parse_object
  return false unless @json[@index] == OPENING_BRACE

  @output += '{'
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in {, message: "hi"}
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_entry = true
  while @index < @json.length && @json[@index] != CLOSING_BRACE
    if first_entry
      first_entry = false
    else
      # repair missing comma
      @output = insert_before_last_whitespace(@output, ',') unless parse_character(COMMA)
      parse_whitespace_and_skip_comments
    end

    skip_ellipsis

    unless parse_string || parse_unquoted_string
      # No key here: only acceptable when the object (or the text) ends.
      end_of_entries = [CLOSING_BRACE, OPENING_BRACE, CLOSING_BRACKET, OPENING_BRACKET, nil]
      throw_object_key_expected unless end_of_entries.include?(@json[@index])

      # repair trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end

    parse_whitespace_and_skip_comments
    has_colon = parse_character(COLON)
    truncated = @index >= @json.length
    unless has_colon
      throw_colon_expected unless start_of_value?(@json[@index]) || truncated

      # repair missing colon
      @output = insert_before_last_whitespace(@output, ':')
    end

    next if parse_value

    throw_colon_expected unless has_colon || truncated

    # repair missing object value
    @output += 'null'
  end

  if @json[@index] == CLOSING_BRACE
    @output += '}'
    @index += 1
  else
    # repair missing closing brace
    @output = insert_before_last_whitespace(@output, '}')
  end

  true
end
186
+
187
# Advance past the current character when it equals +char+, without writing
# anything to the output.
#
# @param char [String] the character to skip
# @return [Boolean] true when the character was skipped
def skip_character(char)
  return false unless @json[@index] == char

  @index += 1
  true
end
195
+
196
# Skip an ellipsis like in "[1,2,3,...]", "[1,2,3,...,9]" or "[...,7,8,9]",
# or a similar construct in objects. Leading whitespace and comments are
# consumed first; when three dots follow, they are dropped together with an
# optional comma after them.
def skip_ellipsis
  parse_whitespace_and_skip_comments

  return unless (0..2).all? { |offset| @json[@index + offset] == DOT }

  # repair: remove the ellipsis (three dots) and optionally a comma
  @index += 3
  parse_whitespace_and_skip_comments
  skip_character(COMMA)
end
210
+
211
# Parse a string enclosed by double quotes "...". Can contain escaped quotes.
# Repairs strings enclosed in single quotes or special quotes, and strings
# that start with a stray escape character.
#
# The method can run in two stages:
# - First, it assumes the string has a valid end quote.
# - If it turns out that the string does not have a valid end quote followed
#   by a delimiter (which should be the case), the method runs again in a
#   more conservative way, stopping the string at the first next delimiter
#   and fixing the string by inserting a quote there.
#
# @param stop_at_delimiter [Boolean] true for the conservative second stage
# @return [Boolean] true when a string was written to the output, false when
#   the current character does not start a string
# @raise [JSONRepairError] via throw_invalid_unicode_character or
#   throw_invalid_character when the content cannot be repaired
def parse_string(stop_at_delimiter: false)
  if @json[@index] == BACKSLASH
    # repair: remove the first escape character
    @index += 1
    skip_escape_chars = true
  end

  return false unless quote?(@json[@index])

  # double quotes are correct JSON,
  # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
  # otherwise, we will match any double-quote-like start with a double-quote-like end,
  # or any single-quote-like start with a single-quote-like end
  is_end_quote = if double_quote?(@json[@index])
                   method(:double_quote?)
                 elsif single_quote?(@json[@index])
                   method(:single_quote?)
                 elsif single_quote_like?(@json[@index])
                   method(:single_quote_like?)
                 else
                   method(:double_quote_like?)
                 end

  # remember where we started so the second stage can start over
  i_before = @index
  o_before = @output.length

  str = '"'
  @index += 1

  loop do
    if @index >= @json.length
      # end of text, we are missing an end quote

      i_prev = prev_non_whitespace_index(@index - 1)
      if !stop_at_delimiter && delimiter?(@json[i_prev])
        # if the text ends with a delimiter, like ["hello],
        # so the missing end quote should be inserted before this delimiter
        # retry parsing the string, stopping at the first next delimiter
        @index = i_before
        @output = @output[0...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      return true
    elsif is_end_quote.call(@json[@index])
      # end quote
      i_quote = @index
      o_quote = str.length
      str += '"'
      @index += 1
      @output += str

      parse_whitespace_and_skip_comments

      if stop_at_delimiter ||
         @index >= @json.length ||
         delimiter?(@json[@index]) ||
         quote?(@json[@index]) ||
         digit?(@json[@index])
        # The quote is followed by the end of the text, a delimiter, or a next value
        parse_concatenated_string

        return true
      end

      if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
        # This is not the right end quote: it is preceded by a delimiter,
        # and NOT followed by a delimiter. So, there is an end quote missing
        # parse the string again and then stop at the first next delimiter
        @index = i_before
        @output = @output[...o_before]

        return parse_string(stop_at_delimiter: true)
      end

      # revert to right after the quote but before any whitespace, and continue parsing the string
      @output = @output[...o_before]
      @index = i_quote + 1

      # repair unescaped quote
      str = "#{str[...o_quote]}\\#{str[o_quote..]}"
    elsif stop_at_delimiter && delimiter?(@json[@index])
      # we're in the mode to stop the string at the first delimiter
      # because there is an end quote missing

      # repair missing quote
      str = insert_before_last_whitespace(str, '"')
      @output += str

      parse_concatenated_string

      return true
    elsif @json[@index] == BACKSLASH
      # handle escaped content like \n or \u2605
      char = @json[@index + 1]
      if ESCAPE_CHARACTERS[char]
        str += @json[@index, 2]
        @index += 2
      elsif char == 'u'
        j = 2
        j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
        if j == 6
          str += @json[@index, 6]
          @index += 6
        elsif @index + j >= @json.length
          # repair invalid or truncated unicode char at the end of the text
          # by removing the unicode char and ending the string here
          @index = @json.length
        else
          throw_invalid_unicode_character
        end
      else
        # repair invalid escape character: drop the backslash and keep the
        # character after it. When the backslash is the very last character
        # of the text there is nothing after it (char is nil): only the
        # backslash is dropped and the read position is kept at the end of
        # the text, so the next iteration repairs the missing end quote.
        str += char if char
        @index = [@index + 2, @json.length].min
      end
    else
      # handle regular characters
      char = @json[@index]

      if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
        # repair unescaped double quote
        str += "\\#{char}"
      elsif control_character?(char)
        # unescaped control character
        str += CONTROL_CHARACTERS[char]
      else
        throw_invalid_character(char) unless valid_string_character?(char)
        str += char
      end

      @index += 1
    end

    # repair: when the string started with a stray escape character, also
    # skip an escape character found at this position
    skip_escape_character if skip_escape_chars
  end
end
369
+
370
# Repair an unquoted string by adding quotes around it.
# Repair a MongoDB function call like NumberLong("2").
# Repair a JSONP function call like callback({...});
#
# The text up to the next delimiter (slashes excluded) or quote is consumed.
# When it is a function name followed by '(', only the argument of the call
# is kept. Otherwise the text is written as a JSON string, except for the
# literal undefined, which becomes null.
#
# @return [Boolean] true when something was consumed, false otherwise
def parse_unquoted_string
  start = @index
  @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
  return false if @index <= start

  if @json[@index] == '(' && function_name?(@json[start...@index].strip)
    # Repair a MongoDB function call like NumberLong("2")
    # Repair a JSONP function call like callback({...});
    @index += 1

    parse_value

    if @json[@index] == ')'
      # Repair: skip close bracket of function call
      @index += 1
      # Repair: skip semicolon after JSONP call
      @index += 1 if @json[@index] == ';'
    end
  else
    # Repair unquoted string
    # Also, repair undefined into null

    # First, go back to prevent getting trailing whitespaces in the string,
    # but never further back than where this token started
    @index -= 1 while @index > start && whitespace?(@json[@index - 1])

    symbol = @json[start...@index]
    if symbol == 'undefined'
      @output += 'null'
    else
      # Quote the text using JSON escaping rules. String#inspect is not
      # suitable here: it produces Ruby-only escapes such as \# (before @, $
      # or {), \e or \x1F, none of which are valid JSON.
      escaped = symbol.each_char.map do |char|
        if char == '"' || char == '\\'
          "\\#{char}"
        elsif char.bytesize == 1 && char.getbyte(0) < 0x20
          CONTROL_CHARACTERS.fetch(char) { format('\u%04x', char.getbyte(0)) }
        else
          char
        end
      end.join
      @output += "\"#{escaped}\""
    end

    if @json[@index] == '"'
      # We had a missing start quote, but now we encountered the end quote, so we can skip that one
      @index += 1
    end
  end

  true
end
409
+
410
# Consume the current character when it equals +char+ and copy it to the
# output.
#
# @param char [String] the expected character
# @return [Boolean] true when the character was consumed
def parse_character(char)
  current = @json[@index]
  return false unless current == char

  @output += current
  @index += 1
  true
end
419
+
420
# Consume any mix of whitespace and comments at the current position.
# Whitespace is handled by parse_whitespace, comments by parse_comment.
#
# Comments are skipped for as long as one is found, whether or not
# whitespace separates them, so back-to-back comments like /*a*//*b*/ are
# consumed completely.
#
# @return [Boolean] true when the read position moved
def parse_whitespace_and_skip_comments
  start = @index

  parse_whitespace
  parse_whitespace while parse_comment

  @index > start
end
432
+
433
# Parse a number like 2.4 or 2.4e6.
#
# Repairs:
# - a number cut off right after '-', '.' or the exponent marker gets a
#   trailing 0 (see repair_number_ending_with_numeric_symbol)
# - a number with leading zeros like 00789 or -00789 is not valid JSON and
#   is written as a string, so the zeros are preserved
#
# When the text at the current position is not a number after all, the read
# position is restored so it can be parsed as another type.
#
# @return [Boolean] true when a number was written to the output
def parse_number
  start = @index
  if @json[@index] == '-'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
  end

  # Note that in JSON leading zeros like "00789" are not allowed.
  # We will allow all leading zeros here though and at the end of parse_number
  # check against leading zeros and repair that if needed.
  # Leading zeros can have meaning, so we should not clear them.
  @index += 1 while digit?(@json[@index])

  if @json[@index] == '.'
    @index += 1
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  if @json[@index]&.downcase == 'e'
    @index += 1
    @index += 1 if ['-', '+'].include?(@json[@index])
    if at_end_of_number?
      repair_number_ending_with_numeric_symbol(start)
      return true
    end
    unless digit?(@json[@index])
      @index = start
      return false
    end
    @index += 1 while digit?(@json[@index])
  end

  # if we're not at the end of the number by this point, allow this to be parsed as another type
  unless at_end_of_number?
    @index = start
    return false
  end

  return false unless @index > start

  # repair a number with leading zeros like "00789", also behind a minus sign
  num = @json[start...@index]
  has_invalid_leading_zero = num.match?(/\A-?0\d/)

  @output += has_invalid_leading_zero ? "\"#{num}\"" : num
  true
end
498
+
499
# Tell whether a number may end at the current position: at the end of the
# text, or in front of a delimiter or whitespace.
def at_end_of_number?
  return true if @index >= @json.length

  char = @json[@index]
  delimiter?(char) || whitespace?(char)
end
502
+
503
# Parse an array like '["item1", "item2", ...]'.
#
# Repairs applied along the way: a leading comma is skipped, missing commas
# between items are inserted, ellipses are dropped, a trailing comma is
# removed and a missing closing bracket is added.
#
# @return [Boolean] false when the current character is not an opening
#   bracket, true once the array has been written to the output
def parse_array
  return false unless @json[@index] == OPENING_BRACKET

  @output += '['
  @index += 1
  parse_whitespace_and_skip_comments

  # repair: skip leading comma like in [,1,2,3]
  parse_whitespace_and_skip_comments if skip_character(COMMA)

  first_item = true
  while @index < @json.length && @json[@index] != CLOSING_BRACKET
    if first_item
      first_item = false
    elsif !parse_character(COMMA)
      # repair missing comma
      @output = insert_before_last_whitespace(@output, ',')
    end

    skip_ellipsis

    unless parse_value
      # repair trailing comma
      @output = strip_last_occurrence(@output, ',')
      break
    end
  end

  if @json[@index] == CLOSING_BRACKET
    @output += ']'
    @index += 1
  else
    # repair missing closing array bracket
    @output = insert_before_last_whitespace(@output, ']')
  end

  true
end
546
+
547
# Walk backwards from +start+ to the nearest position that does not hold
# whitespace.
#
# @param start [Integer] position to start searching from
# @return [Integer] the position found; the walk stops at 0 without
#   inspecting the character there, and a non-positive +start+ is returned
#   unchanged
def prev_non_whitespace_index(start)
  start.downto(1) do |position|
    return position unless whitespace?(@json[position])
  end

  [start, 0].min
end
552
+
553
# Repair concatenated strings like "hello" + "world", change this into
# "helloworld". Called after a string has been written to the output; every
# following '+' joins the next string onto it.
#
# @return [Boolean] true when at least one '+' was processed
def parse_concatenated_string
  found_plus = false

  parse_whitespace_and_skip_comments
  while @json[@index] == PLUS
    found_plus = true
    @index += 1
    parse_whitespace_and_skip_comments

    # repair: remove the end quote of the first string
    @output = strip_last_occurrence(@output, '"', strip_remaining_text: true)
    join_position = @output.length

    appended = parse_string
    @output = if appended
                # repair: remove the start quote of the second string
                remove_at_index(@output, join_position, 1)
              else
                # repair: the '+' is not followed by a string, so drop it and
                # put the end quote back
                insert_before_last_whitespace(@output, '"')
              end
  end

  found_plus
end
578
+
579
# Repair a number that was cut off right after a '.', '-' or 'e': the text
# read since +start+ is written to the output followed by a 0, which changes
# the number no more than needed to make it valid JSON.
#
# @param start [Integer] position where the number started
def repair_number_ending_with_numeric_symbol(start)
  truncated = @json[start...@index]
  @output += "#{truncated}0"
end
585
+
586
# Parse and repair Newline Delimited JSON (NDJSON): multiple JSON values
# separated by a newline character. Values are parsed until none is left,
# commas are inserted between them where missing, and the whole output is
# wrapped in array brackets.
def parse_newline_delimited_json
  first_value = true

  loop do
    if first_value
      first_value = false
    elsif !parse_character(COMMA)
      # repair: add missing comma
      @output = insert_before_last_whitespace(@output, ',')
    end

    break unless parse_value
  end

  # The loop only ends when no further value could be parsed, so the comma
  # handled last has nothing after it.
  # repair: remove trailing comma
  @output = strip_last_occurrence(@output, ',')

  # repair: wrap the output inside array brackets
  @output = "[\n#{@output}\n]"
end
615
+
616
# Skip a backslash at the current position, if there is one.
#
# @return [Boolean] true when a backslash was skipped
def skip_escape_character
  skip_character(BACKSLASH)
end
619
+
620
# Abort the repair because of a character that is not allowed here.
#
# @param char [String] the offending character
# @raise [JSONRepairError] always
def throw_invalid_character(char)
  message = "Invalid character #{char.inspect} at index #{@index}"
  raise JSONRepairError, message
end
623
+
624
# Abort the repair because the character at the current position cannot be
# interpreted.
#
# @raise [JSONRepairError] always
def throw_unexpected_character
  unexpected = @json[@index]
  raise JSONRepairError, "Unexpected character #{unexpected.inspect} at index #{@index}"
end
627
+
628
# Abort the repair because the text ended where a value was expected.
#
# @raise [JSONRepairError] always
def throw_unexpected_end
  message = 'Unexpected end of json string'
  raise JSONRepairError, message
end
631
+
632
# Abort the repair because an object key was expected at the current
# position.
#
# @raise [JSONRepairError] always
def throw_object_key_expected
  message = 'Object key expected'
  raise JSONRepairError, message
end
635
+
636
# Abort the repair because a colon was expected between an object key and
# its value.
#
# @raise [JSONRepairError] always
def throw_colon_expected
  message = 'Colon expected'
  raise JSONRepairError, message
end
639
+
640
# Abort the repair because of a malformed \uXXXX escape. The message quotes
# up to six characters starting at the current position.
#
# @raise [JSONRepairError] always
def throw_invalid_unicode_character
  snippet = @json[@index, 6]
  message = "Invalid unicode character #{snippet.inspect} at index #{@index}"
  raise JSONRepairError, message
end
644
+ end
645
+ end
data/sig/json/repair.rbs CHANGED
@@ -1,7 +1,7 @@
1
1
  module JSON
2
2
  module Repair
3
3
  VERSION: String
4
-
5
- def self.repair(String) -> ?String
6
4
  end
5
+
6
+ def self.repair(String) -> ?String
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: json-repair
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksandr Zykov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-24 00:00:00.000000000 Z
11
+ date: 2024-06-04 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: This is a simple gem that repairs broken JSON strings.
14
14
  email:
@@ -25,9 +25,9 @@ files:
25
25
  - README.md
26
26
  - Rakefile
27
27
  - lib/json/repair.rb
28
- - lib/json/repair/repairer.rb
29
28
  - lib/json/repair/string_utils.rb
30
29
  - lib/json/repair/version.rb
30
+ - lib/json/repairer.rb
31
31
  - sig/json/repair.rbs
32
32
  homepage: https://github.com/sashazykov/json-repair-rb
33
33
  licenses: