json-repair 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b50bffd203f06c7b7d2fa875802b66dd6fc944d01fe7c7f6c349233b2dc73d60
4
- data.tar.gz: f018489c9572a61a72e9784af8f2b2fec335e215933ad0efea1265cff7d7be4e
3
+ metadata.gz: db2b6fb7849a2e75329405c1f85fa7de836b0fa2f079623032571f42d359514d
4
+ data.tar.gz: 1c845714c4c443bad3c9277a2ceae6cef8ff346125f52f89473aaa50b9ff2132
5
5
  SHA512:
6
- metadata.gz: 7b1047f154815fde7e587fac75c1316ccf799a3ce73c8d5da97ae94305cc8368ea745f7472972a552da4a9df61acb54730d39a2080eb5e61e9fc46d234bbcfd0
7
- data.tar.gz: 25d80f6b35509da21cbf0932a67187bd8f69ccac2f06c15e93dbc74a45c0ae4018145051f6d601df5b651aa33036d8cd4dcafefe800f5533110271a38500b4d7
6
+ metadata.gz: 53929154af31033e2f380ed89979430f4339c97c94c088b6f85da27ac251d658b98840e44085c4ba9b4972bab75c1bb0f8ad750beddd4bb79e439efb135e0386
7
+ data.tar.gz: b4b5150aee81c518eaee8847bb2f5d8d8131a15719bb93badce465a2d447ddc361888155b2d33125fdd69d2568424c08440772c61a7f7f5b35922a4d1270adf8
data/CHANGELOG.md CHANGED
Binary file
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # JSON::Repair [![Gem Version](https://badge.fury.io/rb/json-repair.svg)](https://badge.fury.io/rb/json-repair) [![Build Status](https://github.com/sashazykov/json-repair-rb/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/sashazykov/json-repair-rb/actions)
1
+ # JSON::Repair [![Gem Version](https://badge.fury.io/rb/json-repair.svg)](https://badge.fury.io/rb/json-repair) [![Build Status](https://github.com/sashazykov/json-repair-rb/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/sashazykov/json-repair-rb/actions) [![Stand With Ukraine](https://raw.githubusercontent.com/vshymanskyy/StandWithUkraine/main/badges/StandWithUkraine.svg)](https://stand-with-ukraine.pp.ua)
2
2
 
3
3
  This is a Ruby gem designed to repair broken JSON strings. Inspired by and based on the [jsonrepair js library](https://github.com/josdejong/jsonrepair/). It efficiently handles and corrects malformed JSON data, making it especially useful in scenarios where JSON output from LLMs might not strictly adhere to JSON standards. Whether it's missing quotes, misplaced commas, or unexpected characters, it ensures that the JSON data is valid and can be parsed correctly.
4
4
 
data/Steepfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ target :lib do
4
+ signature 'sig'
5
+ check 'lib'
6
+ end
@@ -35,21 +35,28 @@ module JSON
35
35
  LOWERCASE_E = 'e' # 0x65
36
36
  UPPERCASE_F = 'F' # 0x46
37
37
  LOWERCASE_F = 'f' # 0x66
38
- NON_BREAKING_SPACE = "\u00a0" # 0xa0
39
- EN_QUAD = "\u2000" # 0x2000
40
- HAIR_SPACE = "\u200a" # 0x200a
41
- NARROW_NO_BREAK_SPACE = "\u202f" # 0x202f
42
- MEDIUM_MATHEMATICAL_SPACE = "\u205f" # 0x205f
43
- IDEOGRAPHIC_SPACE = "\u3000" # 0x3000
44
- DOUBLE_QUOTE_LEFT = "\u201c" # 0x201c
45
- DOUBLE_QUOTE_RIGHT = "\u201d" # 0x201d
46
- QUOTE_LEFT = "\u2018" # 0x2018
47
- QUOTE_RIGHT = "\u2019" # 0x2019
38
+ NON_BREAKING_SPACE = ' ' # 0xa0
39
+ MONGOLIAN_VOWEL_SEPARATOR = '᠎' # 0x180e
40
+ EN_QUAD = ' ' # 0x2000
41
+ ZERO_WIDTH_SPACE = '​' # 0x200b
42
+ NARROW_NO_BREAK_SPACE = ' ' # 0x202f
43
+ MEDIUM_MATHEMATICAL_SPACE = ' ' # 0x205f
44
+ IDEOGRAPHIC_SPACE = ' ' # 0x3000
45
+ ZERO_WIDTH_NO_BREAK_SPACE = '' # 0xfeff
46
+ DOUBLE_QUOTE_LEFT = '“' # 0x201c
47
+ DOUBLE_QUOTE_RIGHT = '”' # 0x201d
48
+ QUOTE_LEFT = '‘' # 0x2018
49
+ QUOTE_RIGHT = '’' # 0x2019
48
50
  GRAVE_ACCENT = '`' # 0x0060
49
- ACUTE_ACCENT = "\u00b4" # 0x00b4
51
+ ACUTE_ACCENT = '´' # 0x00b4
50
52
 
51
53
  REGEX_DELIMITER = %r{^[,:\[\]/{}()\n+]+$}
54
+ REGEX_UNQUOTED_STRING_DELIMITER = %r{^[,\[\]/{}\n+]+$}
52
55
  REGEX_START_OF_VALUE = /^[\[{\w-]$/
56
+ # matches "https://" and other URL schemes
57
+ REGEX_URL_START = %r{^(http|https|ftp|mailto|file|data|irc)://$}
58
+ # matches all valid URL characters EXCEPT "[", "]", and "," (important JSON delimiters)
59
+ REGEX_URL_CHAR = %r{^[A-Za-z0-9\-._~:/?#@!$&'()*+;=]$}
53
60
 
54
61
  # Functions to check character chars
55
62
  def hex?(char)
@@ -70,8 +77,19 @@ module JSON
70
77
  REGEX_DELIMITER.match?(char)
71
78
  end
72
79
 
73
- def delimiter_except_slash?(char)
74
- delimiter?(char) && char != SLASH
80
+ def unquoted_string_delimiter?(char)
81
+ REGEX_UNQUOTED_STRING_DELIMITER.match?(char)
82
+ end
83
+
84
+ REGEX_FUNCTION_NAME_CHAR_START = /\A[a-zA-Z_$]\z/
85
+ REGEX_FUNCTION_NAME_CHAR = /\A[a-zA-Z0-9_$]\z/
86
+
87
+ def function_name_char_start?(char)
88
+ !char.nil? && REGEX_FUNCTION_NAME_CHAR_START.match?(char)
89
+ end
90
+
91
+ def function_name_char?(char)
92
+ !char.nil? && REGEX_FUNCTION_NAME_CHAR.match?(char)
75
93
  end
76
94
 
77
95
  def start_of_value?(char)
@@ -86,11 +104,22 @@ module JSON
86
104
  [SPACE, NEWLINE, TAB, RETURN].include?(char)
87
105
  end
88
106
 
107
+ def whitespace_except_newline?(char)
108
+ [SPACE, TAB, RETURN].include?(char)
109
+ end
110
+
89
111
  def special_whitespace?(char)
112
+ return false unless char
113
+
90
114
  [
91
- NON_BREAKING_SPACE, NARROW_NO_BREAK_SPACE, MEDIUM_MATHEMATICAL_SPACE, IDEOGRAPHIC_SPACE
115
+ NON_BREAKING_SPACE,
116
+ MONGOLIAN_VOWEL_SEPARATOR,
117
+ NARROW_NO_BREAK_SPACE,
118
+ MEDIUM_MATHEMATICAL_SPACE,
119
+ IDEOGRAPHIC_SPACE,
120
+ ZERO_WIDTH_NO_BREAK_SPACE
92
121
  ].include?(char) ||
93
- (char >= EN_QUAD && char <= HAIR_SPACE)
122
+ (char >= EN_QUAD && char <= ZERO_WIDTH_SPACE)
94
123
  end
95
124
 
96
125
  def quote?(char)
@@ -149,7 +178,7 @@ module JSON
149
178
 
150
179
  def parse_keyword(name, value)
151
180
  if @json[@index, name.length] == name
152
- @output += value
181
+ @output << value
153
182
  @index += name.length
154
183
  true
155
184
  else
@@ -161,10 +190,6 @@ module JSON
161
190
  text[0...start] + text[start + count..]
162
191
  end
163
192
 
164
- def function_name?(text)
165
- /^\w+$/.match?(text)
166
- end
167
-
168
193
  def ends_with_comma_or_newline?(text)
169
194
  /[,\n][ \t\r]*$/.match?(text)
170
195
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module JSON
4
4
  module Repair
5
- VERSION = '0.2.0'
5
+ VERSION = '0.3.0'
6
6
  end
7
7
  end
data/lib/json/repairer.rb CHANGED
@@ -25,17 +25,24 @@ module JSON
25
25
  't' => "\t"
26
26
  }.freeze
27
27
 
28
+ MARKDOWN_OPEN_BLOCKS = ['```', '[```', '{```'].freeze
29
+ MARKDOWN_CLOSE_BLOCKS = ['```', '```]', '```}'].freeze
30
+
28
31
  def initialize(json)
29
32
  @json = json
30
33
  @index = 0
31
- @output = ''
34
+ @output = +''
32
35
  end
33
36
 
34
37
  def repair
38
+ parse_markdown_code_block(MARKDOWN_OPEN_BLOCKS)
39
+
35
40
  processed = parse_value
36
41
 
37
42
  throw_unexpected_end unless processed
38
43
 
44
+ parse_markdown_code_block(MARKDOWN_CLOSE_BLOCKS)
45
+
39
46
  processed_comma = parse_character(COMMA)
40
47
  parse_whitespace_and_skip_comments if processed_comma
41
48
 
@@ -71,22 +78,45 @@ module JSON
71
78
 
72
79
  def parse_value
73
80
  parse_whitespace_and_skip_comments
74
- process = parse_object || parse_array || parse_string || parse_number || parse_keywords || parse_unquoted_string
81
+ process = parse_object ||
82
+ parse_array ||
83
+ parse_string ||
84
+ parse_number ||
85
+ parse_keywords ||
86
+ parse_unquoted_string(false) ||
87
+ parse_regex
75
88
  parse_whitespace_and_skip_comments
76
89
 
77
90
  process
78
91
  end
79
92
 
80
- def parse_whitespace
81
- whitespace = ''
82
- while @json[@index] && (whitespace?(@json[@index]) || special_whitespace?(@json[@index]))
83
- whitespace += whitespace?(@json[@index]) ? @json[@index] : ' '
93
+ def parse_whitespace_and_skip_comments(skip_newline: true)
94
+ start = @index
95
+
96
+ changed = parse_whitespace(skip_newline: skip_newline)
97
+ loop do
98
+ changed = parse_comment
99
+ changed = parse_whitespace(skip_newline: skip_newline) if changed
100
+ break unless changed
101
+ end
102
+
103
+ @index > start
104
+ end
105
+
106
+ def parse_whitespace(skip_newline: true)
107
+ whitespace = +''
108
+ while @json[@index] && (
109
+ (skip_newline ? whitespace?(@json[@index]) : whitespace_except_newline?(@json[@index])) ||
110
+ special_whitespace?(@json[@index])
111
+ )
112
+ ws = skip_newline ? whitespace?(@json[@index]) : whitespace_except_newline?(@json[@index])
113
+ whitespace << (ws ? @json[@index] : ' ')
84
114
 
85
115
  @index += 1
86
116
  end
87
117
 
88
118
  unless whitespace.empty?
89
- @output += whitespace
119
+ @output << whitespace
90
120
  return true
91
121
  end
92
122
 
@@ -110,11 +140,41 @@ module JSON
110
140
  end
111
141
  end
112
142
 
143
+ # Find and skip over a Markdown fenced code block:
144
+ # ``` ... ```
145
+ # or
146
+ # ```json ... ```
147
+ def parse_markdown_code_block(blocks)
148
+ return false unless skip_markdown_code_block(blocks)
149
+
150
+ if function_name_char_start?(@json[@index])
151
+ # strip the optional language specifier like "json"
152
+ @index += 1 while @index < @json.length && function_name_char?(@json[@index])
153
+ end
154
+
155
+ parse_whitespace_and_skip_comments
156
+
157
+ true
158
+ end
159
+
160
+ def skip_markdown_code_block(blocks)
161
+ parse_whitespace(skip_newline: true)
162
+
163
+ blocks.each do |block|
164
+ if @json[@index, block.length] == block
165
+ @index += block.length
166
+ return true
167
+ end
168
+ end
169
+
170
+ false
171
+ end
172
+
113
173
  # Parse an object like '{"key": "value"}'
114
174
  def parse_object
115
175
  return false unless @json[@index] == OPENING_BRACE
116
176
 
117
- @output += '{'
177
+ @output << '{'
118
178
  @index += 1
119
179
  parse_whitespace_and_skip_comments
120
180
 
@@ -137,7 +197,7 @@ module JSON
137
197
 
138
198
  skip_ellipsis
139
199
 
140
- processed_key = parse_string || parse_unquoted_string
200
+ processed_key = parse_string || parse_unquoted_string(true)
141
201
  unless processed_key
142
202
  if @json[@index] == CLOSING_BRACE || @json[@index] == OPENING_BRACE ||
143
203
  @json[@index] == CLOSING_BRACKET || @json[@index] == OPENING_BRACKET ||
@@ -166,7 +226,7 @@ module JSON
166
226
  unless processed_value
167
227
  if processed_colon || truncated_text
168
228
  # repair missing object value
169
- @output += 'null'
229
+ @output << 'null'
170
230
  else
171
231
  throw_colon_expected
172
232
  end
@@ -174,7 +234,7 @@ module JSON
174
234
  end
175
235
 
176
236
  if @json[@index] == CLOSING_BRACE
177
- @output += '}'
237
+ @output << '}'
178
238
  @index += 1
179
239
  else
180
240
  # repair missing end bracket
@@ -217,199 +277,273 @@ module JSON
217
277
  # - If it turns out that the string does not have a valid end quote followed
218
278
  # by a delimiter (which should be the case), the function runs again in a
219
279
  # more conservative way, stopping the string at the first next delimiter
220
- # and fixing the string by inserting a quote there.
221
- def parse_string(stop_at_delimiter: false)
222
- if @json[@index] == BACKSLASH
280
+ # and fixing the string by inserting a quote there, or stopping at a
281
+ # stop index detected in the first iteration.
282
+ def parse_string(stop_at_delimiter: false, stop_at_index: -1)
283
+ skip_escape_chars = @json[@index] == BACKSLASH
284
+ if skip_escape_chars
223
285
  # repair: remove the first escape character
224
286
  @index += 1
225
- skip_escape_chars = true
226
287
  end
227
288
 
228
- if quote?(@json[@index])
229
- # double quotes are correct JSON,
230
- # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
231
- # otherwise, we will match any double-quote-like start with a double-quote-like end,
232
- # or any single-quote-like start with a single-quote-like end
233
- is_end_quote = if double_quote?(@json[@index])
234
- method(:double_quote?)
235
- elsif single_quote?(@json[@index])
236
- method(:single_quote?)
237
- elsif single_quote_like?(@json[@index])
238
- method(:single_quote_like?)
239
- else
240
- method(:double_quote_like?)
241
- end
242
-
243
- i_before = @index
244
- o_before = @output.length
245
-
246
- str = '"'
247
- @index += 1
289
+ return false unless quote?(@json[@index])
290
+
291
+ # double quotes are correct JSON,
292
+ # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
293
+ # otherwise, we will match any double-quote-like start with a double-quote-like end,
294
+ # or any single-quote-like start with a single-quote-like end
295
+ is_end_quote = if double_quote?(@json[@index])
296
+ method(:double_quote?)
297
+ elsif single_quote?(@json[@index])
298
+ method(:single_quote?)
299
+ elsif single_quote_like?(@json[@index])
300
+ method(:single_quote_like?)
301
+ else
302
+ method(:double_quote_like?)
303
+ end
304
+
305
+ i_before = @index
306
+ o_before = @output.length
307
+
308
+ str = +'"'
309
+ @index += 1
248
310
 
249
- loop do
250
- if @index >= @json.length
251
- # end of text, we are missing an end quote
311
+ loop do
312
+ if @index >= @json.length
313
+ # end of text, we are missing an end quote
314
+
315
+ i_prev = prev_non_whitespace_index(@index - 1)
316
+ if !stop_at_delimiter && delimiter?(@json[i_prev])
317
+ # if the text ends with a delimiter, like ["hello],
318
+ # the missing end quote should be inserted before this delimiter:
319
+ # retry parsing the string, stopping at the first next delimiter
320
+ @index = i_before
321
+ @output = @output[0...o_before]
322
+
323
+ return parse_string(stop_at_delimiter: true)
324
+ end
252
325
 
253
- i_prev = prev_non_whitespace_index(@index - 1)
254
- if !stop_at_delimiter && delimiter?(@json[i_prev])
255
- # if the text ends with a delimiter, like ["hello],
256
- # so the missing end quote should be inserted before this delimiter
257
- # retry parsing the string, stopping at the first next delimiter
258
- @index = i_before
259
- @output = @output[0...o_before]
326
+ # repair missing quote
327
+ str = insert_before_last_whitespace(str, '"')
328
+ @output << str
260
329
 
261
- return parse_string(stop_at_delimiter: true)
262
- end
330
+ return true
331
+ end
263
332
 
264
- # repair missing quote
265
- str = insert_before_last_whitespace(str, '"')
266
- @output += str
333
+ if @index == stop_at_index
334
+ # use the stop index detected in the first iteration, and repair end quote
335
+ str = insert_before_last_whitespace(str, '"')
336
+ @output << str
267
337
 
268
- return true
269
- elsif is_end_quote.call(@json[@index])
270
- # end quote
271
- i_quote = @index
272
- o_quote = str.length
273
- str += '"'
274
- @index += 1
275
- @output += str
338
+ return true
339
+ end
276
340
 
277
- parse_whitespace_and_skip_comments
341
+ if is_end_quote.call(@json[@index])
342
+ # end quote
343
+ # let us check what is before and after the quote to verify whether this is a legit end quote
344
+ i_quote = @index
345
+ o_quote = str.length
346
+ str << '"'
347
+ @index += 1
348
+ @output << str
278
349
 
279
- if stop_at_delimiter ||
280
- @index >= @json.length ||
281
- delimiter?(@json[@index]) ||
282
- quote?(@json[@index]) ||
283
- digit?(@json[@index])
284
- # The quote is followed by the end of the text, a delimiter, or a next value
285
- parse_concatenated_string
350
+ parse_whitespace_and_skip_comments(skip_newline: false)
286
351
 
287
- return true
288
- end
352
+ if stop_at_delimiter ||
353
+ @index >= @json.length ||
354
+ delimiter?(@json[@index]) ||
355
+ quote?(@json[@index]) ||
356
+ digit?(@json[@index])
357
+ # The quote is followed by the end of the text, a delimiter, or a next value
358
+ parse_concatenated_string
289
359
 
290
- if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
291
- # This is not the right end quote: it is preceded by a delimiter,
292
- # and NOT followed by a delimiter. So, there is an end quote missing
293
- # parse the string again and then stop at the first next delimiter
294
- @index = i_before
295
- @output = @output[...o_before]
360
+ return true
361
+ end
296
362
 
297
- return parse_string(stop_at_delimiter: true)
298
- end
363
+ i_prev_char = prev_non_whitespace_index(i_quote - 1)
364
+ prev_char = @json[i_prev_char]
365
+
366
+ if prev_char == ','
367
+ # A comma followed by a quote, like '{"a":"b,c,"d":"e"}'.
368
+ # We assume that the quote is a start quote, and that the end quote
369
+ # should have been located right before the comma but is missing.
370
+ @index = i_before
371
+ @output = @output[0...o_before]
299
372
 
300
- # revert to right after the quote but before any whitespace, and continue parsing the string
373
+ return parse_string(stop_at_delimiter: false, stop_at_index: i_prev_char)
374
+ end
375
+
376
+ if delimiter?(prev_char)
377
+ # This is not the right end quote: it is preceded by a delimiter,
378
+ # and NOT followed by a delimiter. So, there is an end quote missing
379
+ # parse the string again and then stop at the first next delimiter
380
+ @index = i_before
301
381
  @output = @output[...o_before]
302
- @index = i_quote + 1
303
382
 
304
- # repair unescaped quote
305
- str = "#{str[...o_quote]}\\#{str[o_quote..]}"
306
- elsif stop_at_delimiter && delimiter?(@json[@index])
307
- # we're in the mode to stop the string at the first delimiter
308
- # because there is an end quote missing
383
+ return parse_string(stop_at_delimiter: true)
384
+ end
309
385
 
310
- # repair missing quote
311
- str = insert_before_last_whitespace(str, '"')
312
- @output += str
386
+ # revert to right after the quote but before any whitespace, and continue parsing the string
387
+ @output = @output[...o_before]
388
+ @index = i_quote + 1
389
+
390
+ # repair unescaped quote
391
+ str = "#{str[...o_quote]}\\#{str[o_quote..]}"
392
+ elsif stop_at_delimiter && unquoted_string_delimiter?(@json[@index])
393
+ # we're in the mode to stop the string at the first delimiter
394
+ # because there is an end quote missing
395
+
396
+ # test start of an url like "https://..." (this would be parsed as a comment)
397
+ if @json[@index - 1] == ':' &&
398
+ REGEX_URL_START.match?(@json[(i_before + 1)..(@index + 1)] || '')
399
+ while @index < @json.length && REGEX_URL_CHAR.match?(@json[@index])
400
+ str << @json[@index]
401
+ @index += 1
402
+ end
403
+ end
313
404
 
314
- parse_concatenated_string
405
+ # repair missing quote
406
+ str = insert_before_last_whitespace(str, '"')
407
+ @output << str
315
408
 
316
- return true
317
- elsif @json[@index] == BACKSLASH
318
- # handle escaped content like \n or \u2605
319
- char = @json[@index + 1]
320
- escape_char = ESCAPE_CHARACTERS[char]
321
- if escape_char
322
- str += @json[@index, 2]
323
- @index += 2
324
- elsif char == 'u'
325
- j = 2
326
- j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
327
- if j == 6
328
- str += @json[@index, 6]
329
- @index += 6
330
- elsif @index + j >= @json.length
331
- # repair invalid or truncated unicode char at the end of the text
332
- # by removing the unicode char and ending the string here
333
- @index = @json.length
334
- else
335
- throw_invalid_unicode_character
336
- end
409
+ parse_concatenated_string
410
+
411
+ return true
412
+ elsif @json[@index] == BACKSLASH
413
+ # handle escaped content like \n or \u2605
414
+ char = @json[@index + 1]
415
+ escape_char = ESCAPE_CHARACTERS[char]
416
+ if escape_char
417
+ str << @json[@index, 2]
418
+ @index += 2
419
+ elsif char == 'u'
420
+ j = 2
421
+ j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
422
+ if j == 6
423
+ str << @json[@index, 6]
424
+ @index += 6
425
+ elsif @index + j >= @json.length
426
+ # repair invalid or truncated unicode char at the end of the text
427
+ # by removing the unicode char and ending the string here
428
+ @index = @json.length
337
429
  else
338
- # repair invalid escape character: remove it
339
- str += char
340
- @index += 2
430
+ throw_invalid_unicode_character
341
431
  end
432
+ elsif char == "\n"
433
+ # repair a backslash escaped newline (like in Bash scripts)
434
+ str << '\n'
435
+ @index += 2
342
436
  else
343
- # handle regular characters
344
- char = @json[@index]
345
-
346
- if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
347
- # repair unescaped double quote
348
- str += "\\#{char}"
349
- elsif control_character?(char)
350
- # unescaped control character
351
- str += CONTROL_CHARACTERS[char]
352
- else
353
- throw_invalid_character(char) unless valid_string_character?(char)
354
- str += char
355
- end
356
-
357
- @index += 1
437
+ # repair invalid escape character: remove it
438
+ str << char
439
+ @index += 2
358
440
  end
359
-
360
- if skip_escape_chars
361
- # repair: skipped escape character (nothing to do)
362
- skip_escape_character
441
+ else
442
+ # handle regular characters
443
+ char = @json[@index]
444
+
445
+ if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
446
+ # repair unescaped double quote
447
+ str << "\\#{char}"
448
+ elsif control_character?(char)
449
+ # unescaped control character
450
+ str << CONTROL_CHARACTERS[char]
451
+ else
452
+ throw_invalid_character(char) unless valid_string_character?(char)
453
+ str << char
363
454
  end
455
+ @index += 1
364
456
  end
365
- end
366
457
 
367
- false
458
+ if skip_escape_chars
459
+ # repair: skipped escape character (nothing to do)
460
+ skip_escape_character
461
+ end
462
+ end
368
463
  end
369
464
 
370
465
  # Repair an unquoted string by adding quotes around it
371
466
  # Repair a MongoDB function call like NumberLong("2")
372
467
  # Repair a JSONP function call like callback({...});
373
- def parse_unquoted_string
468
+ def parse_unquoted_string(is_key)
469
+ # NOTE: the symbol can end with whitespace; we stop at the next delimiter
470
+ # also, note that we allow strings to contain a slash / in order to support repairing regular expressions
374
471
  start = @index
375
- @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
376
- return if @index <= start
377
472
 
378
- if @json[@index] == '(' && function_name?(@json[start...@index].strip)
379
- # Repair a MongoDB function call like NumberLong("2")
380
- # Repair a JSONP function call like callback({...});
381
- @index += 1
473
+ if function_name_char_start?(@json[@index])
474
+ @index += 1 while @index < @json.length && function_name_char?(@json[@index])
382
475
 
383
- parse_value
476
+ j = @index
477
+ j += 1 while whitespace?(@json[j])
384
478
 
385
- if @json[@index] == ')'
386
- # Repair: skip close bracket of function call
387
- @index += 1
388
- # Repair: skip semicolon after JSONP call
389
- @index += 1 if @json[@index] == ';'
390
- end
391
- else
392
- # Repair unquoted string
393
- # Also, repair undefined into null
479
+ if @json[j] == '('
480
+ # repair a MongoDB function call like NumberLong("2")
481
+ # repair a JSONP function call like callback({...});
482
+ @index = j + 1
394
483
 
395
- # First, go back to prevent getting trailing whitespaces in the string
396
- @index -= 1 while whitespace?(@json[@index - 1]) && @index.positive?
484
+ parse_value
397
485
 
398
- symbol = @json[start...@index]
399
- @output += symbol == 'undefined' ? 'null' : symbol.inspect
486
+ if @json[@index] == ')'
487
+ # Repair: skip close bracket of function call
488
+ @index += 1
489
+ # Repair: skip semicolon after JSONP call
490
+ @index += 1 if @json[@index] == ';'
491
+ end
400
492
 
401
- if @json[@index] == '"'
402
- # We had a missing start quote, but now we encountered the end quote, so we can skip that one
403
- @index += 1
493
+ return true
404
494
  end
405
495
  end
406
496
 
497
+ while @index < @json.length &&
498
+ !unquoted_string_delimiter?(@json[@index]) &&
499
+ !quote?(@json[@index]) &&
500
+ (!is_key || @json[@index] != ':')
501
+ @index += 1
502
+ end
503
+
504
+ # test start of an url like "https://..." (this would be parsed as a comment)
505
+ if @json[@index - 1] == ':' &&
506
+ REGEX_URL_START.match?(@json[start...(@index + 2)] || '')
507
+ @index += 1 while @index < @json.length && REGEX_URL_CHAR.match?(@json[@index])
508
+ end
509
+
510
+ return false if @index <= start
511
+
512
+ # Repair unquoted string
513
+ # Also, repair undefined into null
514
+
515
+ # First, go back to prevent getting trailing whitespaces in the string
516
+ @index -= 1 while @index.positive? && whitespace?(@json[@index - 1])
517
+
518
+ symbol = @json[start...@index]
519
+ @output << (symbol == 'undefined' ? 'null' : symbol.inspect)
520
+
521
+ if @json[@index] == '"'
522
+ # We had a missing start quote, but now we encountered the end quote, so we can skip that one
523
+ @index += 1
524
+ end
525
+
526
+ true
527
+ end
528
+
529
+ # Parse a regular expression literal like /foo/ or /foo\/bar/
530
+ def parse_regex
531
+ return false unless @json[@index] == '/'
532
+
533
+ start = @index
534
+ @index += 1
535
+
536
+ @index += 1 while @index < @json.length && (@json[@index] != '/' || @json[@index - 1] == BACKSLASH)
537
+ @index += 1
538
+
539
+ @output << @json[start...@index].inspect
540
+
407
541
  true
408
542
  end
409
543
 
410
544
  def parse_character(char)
411
545
  if @json[@index] == char
412
- @output += @json[@index]
546
+ @output << @json[@index]
413
547
  @index += 1
414
548
  true
415
549
  else
@@ -417,19 +551,6 @@ module JSON
417
551
  end
418
552
  end
419
553
 
420
- def parse_whitespace_and_skip_comments
421
- start = @index
422
-
423
- changed = parse_whitespace
424
- loop do
425
- changed = parse_comment
426
- changed = parse_whitespace if changed
427
- break unless changed
428
- end
429
-
430
- @index > start
431
- end
432
-
433
554
  # Parse a number like 2.4 or 2.4e6
434
555
  def parse_number
435
556
  start = @index
@@ -489,7 +610,7 @@ module JSON
489
610
  num = @json[start...@index]
490
611
  has_invalid_leading_zero = num.match?(/^0\d/)
491
612
 
492
- @output += has_invalid_leading_zero ? "\"#{num}\"" : num
613
+ @output << (has_invalid_leading_zero ? "\"#{num}\"" : num)
493
614
  return true
494
615
  end
495
616
 
@@ -503,7 +624,7 @@ module JSON
503
624
  # Parse an array like '["item1", "item2", ...]'
504
625
  def parse_array
505
626
  if @json[@index] == OPENING_BRACKET
506
- @output += '['
627
+ @output << '['
507
628
  @index += 1
508
629
  parse_whitespace_and_skip_comments
509
630
 
@@ -531,7 +652,7 @@ module JSON
531
652
  end
532
653
 
533
654
  if @json[@index] == CLOSING_BRACKET
534
- @output += ']'
655
+ @output << ']'
535
656
  @index += 1
536
657
  else
537
658
  # repair missing closing array bracket
@@ -580,7 +701,7 @@ module JSON
580
701
  # repair numbers cut off at the end
581
702
  # this will only be called when we end after a '.', '-', or 'e' and does not
582
703
  # change the number more than it needs to make it valid JSON
583
- @output += "#{@json[start...@index]}0"
704
+ @output << "#{@json[start...@index]}0"
584
705
  end
585
706
 
586
707
  # Parse and repair Newline Delimited JSON (NDJSON):
@@ -0,0 +1,165 @@
1
+ module JSON
2
+ module Repair
3
+ module StringUtils
4
+ @output: untyped
5
+
6
+ @index: untyped
7
+
8
+ # Constants for character chars
9
+ BACKSLASH: "\\"
10
+
11
+ SLASH: "/"
12
+
13
+ ASTERISK: "*"
14
+
15
+ OPENING_BRACE: "{"
16
+
17
+ CLOSING_BRACE: "}"
18
+
19
+ OPENING_BRACKET: "["
20
+
21
+ CLOSING_BRACKET: "]"
22
+
23
+ OPEN_PARENTHESIS: "("
24
+
25
+ CLOSE_PARENTHESIS: ")"
26
+
27
+ SPACE: " "
28
+
29
+ NEWLINE: "\n"
30
+
31
+ TAB: "\t"
32
+
33
+ RETURN: "\r"
34
+
35
+ BACKSPACE: "\b"
36
+
37
+ FORM_FEED: "\f"
38
+
39
+ DOUBLE_QUOTE: "\""
40
+
41
+ PLUS: "+"
42
+
43
+ MINUS: "-"
44
+
45
+ QUOTE: "'"
46
+
47
+ ZERO: "0"
48
+
49
+ NINE: "9"
50
+
51
+ COMMA: ","
52
+
53
+ DOT: "."
54
+
55
+ COLON: ":"
56
+
57
+ SEMICOLON: ";"
58
+
59
+ UPPERCASE_A: "A"
60
+
61
+ LOWERCASE_A: "a"
62
+
63
+ UPPERCASE_E: "E"
64
+
65
+ LOWERCASE_E: "e"
66
+
67
+ UPPERCASE_F: "F"
68
+
69
+ LOWERCASE_F: "f"
70
+
71
+ NON_BREAKING_SPACE: ::String
72
+
73
+ MONGOLIAN_VOWEL_SEPARATOR: ::String
74
+
75
+ EN_QUAD: ::String
76
+
77
+ ZERO_WIDTH_SPACE: ::String
78
+
79
+ NARROW_NO_BREAK_SPACE: ::String
80
+
81
+ MEDIUM_MATHEMATICAL_SPACE: ::String
82
+
83
+ IDEOGRAPHIC_SPACE: ::String
84
+
85
+ ZERO_WIDTH_NO_BREAK_SPACE: ::String
86
+
87
+ DOUBLE_QUOTE_LEFT: ::String
88
+
89
+ DOUBLE_QUOTE_RIGHT: ::String
90
+
91
+ QUOTE_LEFT: ::String
92
+
93
+ QUOTE_RIGHT: ::String
94
+
95
+ GRAVE_ACCENT: "`"
96
+
97
+ ACUTE_ACCENT: ::String
98
+
99
+ REGEX_DELIMITER: ::Regexp
100
+
101
+ REGEX_UNQUOTED_STRING_DELIMITER: ::Regexp
102
+
103
+ REGEX_START_OF_VALUE: ::Regexp
104
+
105
+ REGEX_URL_START: ::Regexp
106
+
107
+ REGEX_URL_CHAR: ::Regexp
108
+
109
+ REGEX_FUNCTION_NAME_CHAR_START: ::Regexp
110
+
111
+ REGEX_FUNCTION_NAME_CHAR: ::Regexp
112
+
113
+ # Functions to check character chars
114
+ def hex?: (untyped char) -> untyped
115
+
116
+ def digit?: (untyped char) -> untyped
117
+
118
+ def valid_string_character?: (untyped char) -> untyped
119
+
120
+ def delimiter?: (untyped char) -> untyped
121
+
122
+ def unquoted_string_delimiter?: (untyped char) -> untyped
123
+
124
+ def function_name_char_start?: (untyped char) -> untyped
125
+
126
+ def function_name_char?: (untyped char) -> untyped
127
+
128
+ def start_of_value?: (untyped char) -> untyped
129
+
130
+ def control_character?: (untyped char) -> untyped
131
+
132
+ def whitespace?: (untyped char) -> untyped
133
+
134
+ def whitespace_except_newline?: (untyped char) -> untyped
135
+
136
+ def special_whitespace?: (untyped char) -> untyped
137
+
138
+ def quote?: (untyped char) -> untyped
139
+
140
+ def double_quote?: (untyped char) -> untyped
141
+
142
+ def single_quote?: (untyped char) -> untyped
143
+
144
+ def double_quote_like?: (untyped char) -> untyped
145
+
146
+ def single_quote_like?: (untyped char) -> untyped
147
+
148
+ # Strip last occurrence of text_to_strip from text
149
+ def strip_last_occurrence: (untyped text, untyped text_to_strip, ?strip_remaining_text: bool) -> untyped
150
+
151
+ def insert_before_last_whitespace: (untyped text, untyped text_to_insert) -> untyped
152
+
153
+ # Parse keywords true, false, null
154
+ # Repair Python keywords True, False, None
155
+ # Repair Ruby keyword nil
156
+ def parse_keywords: () -> untyped
157
+
158
+ def parse_keyword: (untyped name, untyped value) -> (true | false)
159
+
160
+ def remove_at_index: (untyped text, untyped start, untyped count) -> untyped
161
+
162
+ def ends_with_comma_or_newline?: (untyped text) -> untyped
163
+ end
164
+ end
165
+ end
data/sig/json/repair.rbs CHANGED
@@ -1,7 +1,10 @@
1
1
  module JSON
2
+ class JSONRepairError < StandardError
3
+ end
4
+
2
5
  module Repair
3
- VERSION: String
6
+ VERSION: ::String
4
7
  end
5
8
 
6
- def self.repair(String) -> ?String
9
+ def self.repair: (::String json) -> ::String
7
10
  end
@@ -0,0 +1,103 @@
1
+ module JSON
2
+ class Repairer
3
+ @json: ::String
4
+
5
+ @index: Integer
6
+
7
+ @output: ::String
8
+
9
+ include Repair::StringUtils
10
+
11
+ CONTROL_CHARACTERS: ::Hash[::String, "\\b" | "\\f" | "\\n" | "\\r" | "\\t"]
12
+
13
+ ESCAPE_CHARACTERS: ::Hash[::String, "\"" | "\\" | "/" | "\b" | "\f" | "\n" | "\r" | "\t"]
14
+
15
+ MARKDOWN_OPEN_BLOCKS: ::Array[::String]
16
+
17
+ MARKDOWN_CLOSE_BLOCKS: ::Array[::String]
18
+
19
+ def initialize: (::String json) -> void
20
+
21
+ def repair: () -> ::String
22
+
23
+ private
24
+
25
+ def parse_value: () -> untyped
26
+
27
+ def parse_whitespace: (?skip_newline: bool) -> (true | false)
28
+
29
+ def parse_comment: () -> (true | false)
30
+
31
+ # Find and skip over a Markdown fenced code block
32
+ def parse_markdown_code_block: (::Array[::String] blocks) -> (true | false)
33
+
34
+ def skip_markdown_code_block: (::Array[::String] blocks) -> (true | false)
35
+
36
+ # Parse an object like '{"key": "value"}'
37
+ def parse_object: () -> (false | true)
38
+
39
+ def skip_character: (untyped char) -> (true | false)
40
+
41
+ # Skip ellipsis like "[1,2,3,...]" or "[1,2,3,...,9]" or "[...,7,8,9]"
42
+ # or a similar construct in objects.
43
+ def skip_ellipsis: () -> untyped
44
+
45
+ # Parse a string enclosed by double quotes "...". Can contain escaped quotes
46
+ # Repair strings enclosed in single quotes or special quotes
47
+ # Repair an escaped string
48
+ #
49
+ # The function can run in two stages:
50
+ # - First, it assumes the string has a valid end quote
51
+ # - If it turns out that the string does not have a valid end quote followed
52
+ # by a delimiter (as a well-formed string would), the function runs again in a
53
+ # more conservative way, stopping the string at the next delimiter
54
+ # and fixing the string by inserting a quote there, or stopping at the
55
+ # stop index detected in the first stage.
56
+ def parse_string: (?stop_at_delimiter: bool, ?stop_at_index: ::Integer) -> (untyped | true | false)
57
+
58
+ # Repair an unquoted string by adding quotes around it
59
+ # Repair a MongoDB function call like NumberLong("2")
60
+ # Repair a JSONP function call like callback({...});
61
+ def parse_unquoted_string: (bool is_key) -> (false | true)
62
+
63
+ # Parse a regular expression literal like /foo/ or /foo\/bar/
64
+ def parse_regex: () -> (false | true)
65
+
66
+ def parse_character: (untyped char) -> (true | false)
67
+
68
+ def parse_whitespace_and_skip_comments: (?skip_newline: bool) -> untyped
69
+
70
+ # Parse a number like 2.4 or 2.4e6
71
+ def parse_number: () -> (true | false)
72
+
73
+ def at_end_of_number?: () -> untyped
74
+
75
+ # Parse an array like '["item1", "item2", ...]'
76
+ def parse_array: () -> (true | false)
77
+
78
+ def prev_non_whitespace_index: (untyped start) -> untyped
79
+
80
+ # Repair concatenated strings like "hello" + "world" by merging them into "helloworld"
81
+ def parse_concatenated_string: () -> untyped
82
+
83
+ def repair_number_ending_with_numeric_symbol: (untyped start) -> untyped
84
+
85
+ # Parse and repair Newline Delimited JSON (NDJSON):
86
+ # multiple JSON objects separated by a newline character
87
+ def parse_newline_delimited_json: () -> untyped
88
+
89
+ def skip_escape_character: () -> untyped
90
+
91
+ def throw_invalid_character: (untyped char) -> untyped
92
+
93
+ def throw_unexpected_character: () -> untyped
94
+
95
+ def throw_unexpected_end: () -> untyped
96
+
97
+ def throw_object_key_expected: () -> untyped
98
+
99
+ def throw_colon_expected: () -> untyped
100
+
101
+ def throw_invalid_unicode_character: () -> untyped
102
+ end
103
+ end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: json-repair
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksandr Zykov
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-06-04 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies: []
13
12
  description: This is a simple gem that repairs broken JSON strings.
14
13
  email:
@@ -24,11 +23,14 @@ files:
24
23
  - LICENSE.txt
25
24
  - README.md
26
25
  - Rakefile
26
+ - Steepfile
27
27
  - lib/json/repair.rb
28
28
  - lib/json/repair/string_utils.rb
29
29
  - lib/json/repair/version.rb
30
30
  - lib/json/repairer.rb
31
31
  - sig/json/repair.rbs
32
+ - sig/json/repair/string_utils.rbs
33
+ - sig/json/repairer.rbs
32
34
  homepage: https://github.com/sashazykov/json-repair-rb
33
35
  licenses:
34
36
  - ISC
@@ -37,7 +39,6 @@ metadata:
37
39
  homepage_uri: https://github.com/sashazykov/json-repair-rb
38
40
  source_code_uri: https://github.com/sashazykov/json-repair-rb
39
41
  changelog_uri: https://github.com/sashazykov/json-repair-rb/blob/main/CHANGELOG.md
40
- post_install_message:
41
42
  rdoc_options: []
42
43
  require_paths:
43
44
  - lib
@@ -52,8 +53,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
53
  - !ruby/object:Gem::Version
53
54
  version: '0'
54
55
  requirements: []
55
- rubygems_version: 3.5.10
56
- signing_key:
56
+ rubygems_version: 3.6.9
57
57
  specification_version: 4
58
58
  summary: Repairs broken JSON strings.
59
59
  test_files: []