RubyGems - json-repair - Versions diffs - 0.2.0 → 0.3.0 - Mend

json-repair 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/CHANGELOG.md +0 -0
data/README.md +1 -1
data/Steepfile +6 -0
data/lib/json/repair/string_utils.rb +45 -20
data/lib/json/repair/version.rb +1 -1
data/lib/json/repairer.rb +295 -174
data/sig/json/repair/string_utils.rbs +165 -0
data/sig/json/repair.rbs +5 -2
data/sig/json/repairer.rbs +103 -0
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b50bffd203f06c7b7d2fa875802b66dd6fc944d01fe7c7f6c349233b2dc73d60
-  data.tar.gz: f018489c9572a61a72e9784af8f2b2fec335e215933ad0efea1265cff7d7be4e
+  metadata.gz: db2b6fb7849a2e75329405c1f85fa7de836b0fa2f079623032571f42d359514d
+  data.tar.gz: 1c845714c4c443bad3c9277a2ceae6cef8ff346125f52f89473aaa50b9ff2132
 SHA512:
-  metadata.gz: 7b1047f154815fde7e587fac75c1316ccf799a3ce73c8d5da97ae94305cc8368ea745f7472972a552da4a9df61acb54730d39a2080eb5e61e9fc46d234bbcfd0
-  data.tar.gz: 25d80f6b35509da21cbf0932a67187bd8f69ccac2f06c15e93dbc74a45c0ae4018145051f6d601df5b651aa33036d8cd4dcafefe800f5533110271a38500b4d7
+  metadata.gz: 53929154af31033e2f380ed89979430f4339c97c94c088b6f85da27ac251d658b98840e44085c4ba9b4972bab75c1bb0f8ad750beddd4bb79e439efb135e0386
+  data.tar.gz: b4b5150aee81c518eaee8847bb2f5d8d8131a15719bb93badce465a2d447ddc361888155b2d33125fdd69d2568424c08440772c61a7f7f5b35922a4d1270adf8

data/CHANGELOG.md CHANGED Viewed

Binary file

data/README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# JSON::Repair [![Gem Version](https://badge.fury.io/rb/json-repair.svg)](https://badge.fury.io/rb/json-repair) [![Build Status](https://github.com/sashazykov/json-repair-rb/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/sashazykov/json-repair-rb/actions)
+# JSON::Repair [![Gem Version](https://badge.fury.io/rb/json-repair.svg)](https://badge.fury.io/rb/json-repair) [![Build Status](https://github.com/sashazykov/json-repair-rb/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/sashazykov/json-repair-rb/actions) [![Stand With Ukraine](https://raw.githubusercontent.com/vshymanskyy/StandWithUkraine/main/badges/StandWithUkraine.svg)](https://stand-with-ukraine.pp.ua)
 This is a Ruby gem designed to repair broken JSON strings. Inspired by and based on the [jsonrepair js library](https://github.com/josdejong/jsonrepair/). It efficiently handles and corrects malformed JSON data, making it especially useful in scenarios where JSON output from LLMs might not strictly adhere to JSON standards. Whether it's missing quotes, misplaced commas, or unexpected characters, it ensures that the JSON data is valid and can be parsed correctly.

data/Steepfile ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+target :lib do
+  signature 'sig'
+  check 'lib'
+end

data/lib/json/repair/string_utils.rb CHANGED Viewed

@@ -35,21 +35,28 @@ module JSON
       LOWERCASE_E = 'e' # 0x65
       UPPERCASE_F = 'F' # 0x46
       LOWERCASE_F = 'f' # 0x66
-      NON_BREAKING_SPACE = "\u00a0" # 0xa0
-      EN_QUAD = "\u2000" # 0x2000
-      HAIR_SPACE = "\u200a" # 0x200a
-      NARROW_NO_BREAK_SPACE = "\u202f" # 0x202f
-      MEDIUM_MATHEMATICAL_SPACE = "\u205f" # 0x205f
-      IDEOGRAPHIC_SPACE = "\u3000" # 0x3000
-      DOUBLE_QUOTE_LEFT = "\u201c" # 0x201c
-      DOUBLE_QUOTE_RIGHT = "\u201d" # 0x201d
-      QUOTE_LEFT = "\u2018" # 0x2018
-      QUOTE_RIGHT = "\u2019" # 0x2019
+      NON_BREAKING_SPACE = ' ' # 0xa0
+      MONGOLIAN_VOWEL_SEPARATOR = '᠎' # 0x180e
+      EN_QUAD = ' ' # 0x2000
+      ZERO_WIDTH_SPACE = '' # 0x200b
+      NARROW_NO_BREAK_SPACE = ' ' # 0x202f
+      MEDIUM_MATHEMATICAL_SPACE = ' ' # 0x205f
+      IDEOGRAPHIC_SPACE = '　' # 0x3000
+      ZERO_WIDTH_NO_BREAK_SPACE = '' # 0xfeff
+      DOUBLE_QUOTE_LEFT = '“' # 0x201c
+      DOUBLE_QUOTE_RIGHT = '”' # 0x201d
+      QUOTE_LEFT = '‘' # 0x2018
+      QUOTE_RIGHT = '’' # 0x2019
       GRAVE_ACCENT = '`' # 0x0060
-      ACUTE_ACCENT = "\u00b4" # 0x00b4
+      ACUTE_ACCENT = '´' # 0x00b4
       REGEX_DELIMITER = %r{^[,:\[\]/{}()\n+]+$}
+      REGEX_UNQUOTED_STRING_DELIMITER = %r{^[,\[\]/{}\n+]+$}
       REGEX_START_OF_VALUE = /^[\[{\w-]$/
+      # matches "https://" and other schemas
+      REGEX_URL_START = %r{^(http|https|ftp|mailto|file|data|irc)://$}
+      # matches all valid URL characters EXCEPT "[", "]", and "," (important JSON delimiters)
+      REGEX_URL_CHAR = %r{^[A-Za-z0-9\-._~:/?#@!$&'()*+;=]$}
       # Functions to check character chars
       def hex?(char)
@@ -70,8 +77,19 @@ module JSON
         REGEX_DELIMITER.match?(char)
       end
-      def delimiter_except_slash?(char)
-        delimiter?(char) && char != SLASH
+      def unquoted_string_delimiter?(char)
+        REGEX_UNQUOTED_STRING_DELIMITER.match?(char)
+      end
+      REGEX_FUNCTION_NAME_CHAR_START = /\A[a-zA-Z_$]\z/
+      REGEX_FUNCTION_NAME_CHAR = /\A[a-zA-Z0-9_$]\z/
+      def function_name_char_start?(char)
+        !char.nil? && REGEX_FUNCTION_NAME_CHAR_START.match?(char)
+      end
+      def function_name_char?(char)
+        !char.nil? && REGEX_FUNCTION_NAME_CHAR.match?(char)
       end
       def start_of_value?(char)
@@ -86,11 +104,22 @@ module JSON
         [SPACE, NEWLINE, TAB, RETURN].include?(char)
       end
+      def whitespace_except_newline?(char)
+        [SPACE, TAB, RETURN].include?(char)
+      end
       def special_whitespace?(char)
+        return false unless char
         [
-          NON_BREAKING_SPACE, NARROW_NO_BREAK_SPACE, MEDIUM_MATHEMATICAL_SPACE, IDEOGRAPHIC_SPACE
+          NON_BREAKING_SPACE,
+          MONGOLIAN_VOWEL_SEPARATOR,
+          NARROW_NO_BREAK_SPACE,
+          MEDIUM_MATHEMATICAL_SPACE,
+          IDEOGRAPHIC_SPACE,
+          ZERO_WIDTH_NO_BREAK_SPACE
         ].include?(char) ||
-          (char >= EN_QUAD && char <= HAIR_SPACE)
+          (char >= EN_QUAD && char <= ZERO_WIDTH_SPACE)
       end
       def quote?(char)
@@ -149,7 +178,7 @@ module JSON
       def parse_keyword(name, value)
         if @json[@index, name.length] == name
-          @output += value
+          @output << value
           @index += name.length
           true
         else
@@ -161,10 +190,6 @@ module JSON
         text[0...start] + text[start + count..]
       end
-      def function_name?(text)
-        /^\w+$/.match?(text)
-      end
       def ends_with_comma_or_newline?(text)
         /[,\n][ \t\r]*$/.match?(text)
       end

data/lib/json/repair/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@
 module JSON
   module Repair
-    VERSION = '0.2.0'
+    VERSION = '0.3.0'
   end
 end

data/lib/json/repairer.rb CHANGED Viewed

@@ -25,17 +25,24 @@ module JSON
       't' => "\t"
     }.freeze
+    MARKDOWN_OPEN_BLOCKS = ['```', '[```', '{```'].freeze
+    MARKDOWN_CLOSE_BLOCKS = ['```', '```]', '```}'].freeze
     def initialize(json)
       @json = json
       @index = 0
-      @output = ''
+      @output = +''
     end
     def repair
+      parse_markdown_code_block(MARKDOWN_OPEN_BLOCKS)
       processed = parse_value
       throw_unexpected_end unless processed
+      parse_markdown_code_block(MARKDOWN_CLOSE_BLOCKS)
       processed_comma = parse_character(COMMA)
       parse_whitespace_and_skip_comments if processed_comma
@@ -71,22 +78,45 @@ module JSON
     def parse_value
       parse_whitespace_and_skip_comments
-      process = parse_object || parse_array || parse_string || parse_number || parse_keywords || parse_unquoted_string
+      process = parse_object ||
+                parse_array ||
+                parse_string ||
+                parse_number ||
+                parse_keywords ||
+                parse_unquoted_string(false) ||
+                parse_regex
       parse_whitespace_and_skip_comments
       process
     end
-    def parse_whitespace
-      whitespace = ''
-      while @json[@index] && (whitespace?(@json[@index]) || special_whitespace?(@json[@index]))
-        whitespace += whitespace?(@json[@index]) ? @json[@index] : ' '
+    def parse_whitespace_and_skip_comments(skip_newline: true)
+      start = @index
+      changed = parse_whitespace(skip_newline: skip_newline)
+      loop do
+        changed = parse_comment
+        changed = parse_whitespace(skip_newline: skip_newline) if changed
+        break unless changed
+      end
+      @index > start
+    end
+    def parse_whitespace(skip_newline: true)
+      whitespace = +''
+      while @json[@index] && (
+        (skip_newline ? whitespace?(@json[@index]) : whitespace_except_newline?(@json[@index])) ||
+        special_whitespace?(@json[@index])
+      )
+        ws = skip_newline ? whitespace?(@json[@index]) : whitespace_except_newline?(@json[@index])
+        whitespace << (ws ? @json[@index] : ' ')
         @index += 1
       end
       unless whitespace.empty?
-        @output += whitespace
+        @output << whitespace
         return true
       end
@@ -110,11 +140,41 @@ module JSON
       end
     end
+    # Find and skip over a Markdown fenced code block:
+    #     ``` ... ```
+    # or
+    #     ```json ... ```
+    def parse_markdown_code_block(blocks)
+      return false unless skip_markdown_code_block(blocks)
+      if function_name_char_start?(@json[@index])
+        # strip the optional language specifier like "json"
+        @index += 1 while @index < @json.length && function_name_char?(@json[@index])
+      end
+      parse_whitespace_and_skip_comments
+      true
+    end
+    def skip_markdown_code_block(blocks)
+      parse_whitespace(skip_newline: true)
+      blocks.each do |block|
+        if @json[@index, block.length] == block
+          @index += block.length
+          return true
+        end
+      end
+      false
+    end
     # Parse an object like '{"key": "value"}'
     def parse_object
       return false unless @json[@index] == OPENING_BRACE
-      @output += '{'
+      @output << '{'
       @index += 1
       parse_whitespace_and_skip_comments
@@ -137,7 +197,7 @@ module JSON
         skip_ellipsis
-        processed_key = parse_string || parse_unquoted_string
+        processed_key = parse_string || parse_unquoted_string(true)
         unless processed_key
           if @json[@index] == CLOSING_BRACE || @json[@index] == OPENING_BRACE ||
              @json[@index] == CLOSING_BRACKET || @json[@index] == OPENING_BRACKET ||
@@ -166,7 +226,7 @@ module JSON
         unless processed_value
           if processed_colon || truncated_text
             # repair missing object value
-            @output += 'null'
+            @output << 'null'
           else
             throw_colon_expected
           end
@@ -174,7 +234,7 @@ module JSON
       end
       if @json[@index] == CLOSING_BRACE
-        @output += '}'
+        @output << '}'
         @index += 1
       else
         # repair missing end bracket
@@ -217,199 +277,273 @@ module JSON
     # - If it turns out that the string does not have a valid end quote followed
     #   by a delimiter (which should be the case), the function runs again in a
     #   more conservative way, stopping the string at the first next delimiter
-    #   and fixing the string by inserting a quote there.
-    def parse_string(stop_at_delimiter: false)
-      if @json[@index] == BACKSLASH
+    #   and fixing the string by inserting a quote there, or stopping at a
+    #   stop index detected in the first iteration.
+    def parse_string(stop_at_delimiter: false, stop_at_index: -1)
+      skip_escape_chars = @json[@index] == BACKSLASH
+      if skip_escape_chars
         # repair: remove the first escape character
         @index += 1
-        skip_escape_chars = true
       end
-      if quote?(@json[@index])
-        # double quotes are correct JSON,
-        # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
-        # otherwise, we will match any double-quote-like start with a double-quote-like end,
-        # or any single-quote-like start with a single-quote-like end
-        is_end_quote = if double_quote?(@json[@index])
-                         method(:double_quote?)
-                       elsif single_quote?(@json[@index])
-                         method(:single_quote?)
-                       elsif single_quote_like?(@json[@index])
-                         method(:single_quote_like?)
-                       else
-                         method(:double_quote_like?)
-                       end
-        i_before = @index
-        o_before = @output.length
-        str = '"'
-        @index += 1
+      return false unless quote?(@json[@index])
+      # double quotes are correct JSON,
+      # single quotes come from JavaScript for example, we assume it will have a correct single end quote too
+      # otherwise, we will match any double-quote-like start with a double-quote-like end,
+      # or any single-quote-like start with a single-quote-like end
+      is_end_quote = if double_quote?(@json[@index])
+                       method(:double_quote?)
+                     elsif single_quote?(@json[@index])
+                       method(:single_quote?)
+                     elsif single_quote_like?(@json[@index])
+                       method(:single_quote_like?)
+                     else
+                       method(:double_quote_like?)
+                     end
+      i_before = @index
+      o_before = @output.length
+      str = +'"'
+      @index += 1
-        loop do
-          if @index >= @json.length
-            # end of text, we are missing an end quote
+      loop do
+        if @index >= @json.length
+          # end of text, we are missing an end quote
+          i_prev = prev_non_whitespace_index(@index - 1)
+          if !stop_at_delimiter && delimiter?(@json[i_prev])
+            # if the text ends with a delimiter, like ["hello],
+            # so the missing end quote should be inserted before this delimiter
+            # retry parsing the string, stopping at the first next delimiter
+            @index = i_before
+            @output = @output[0...o_before]
+            return parse_string(stop_at_delimiter: true)
+          end
-            i_prev = prev_non_whitespace_index(@index - 1)
-            if !stop_at_delimiter && delimiter?(@json[i_prev])
-              # if the text ends with a delimiter, like ["hello],
-              # so the missing end quote should be inserted before this delimiter
-              # retry parsing the string, stopping at the first next delimiter
-              @index = i_before
-              @output = @output[0...o_before]
+          # repair missing quote
+          str = insert_before_last_whitespace(str, '"')
+          @output << str
-              return parse_string(stop_at_delimiter: true)
-            end
+          return true
+        end
-            # repair missing quote
-            str = insert_before_last_whitespace(str, '"')
-            @output += str
+        if @index == stop_at_index
+          # use the stop index detected in the first iteration, and repair end quote
+          str = insert_before_last_whitespace(str, '"')
+          @output << str
-            return true
-          elsif is_end_quote.call(@json[@index])
-            # end quote
-            i_quote = @index
-            o_quote = str.length
-            str += '"'
-            @index += 1
-            @output += str
+          return true
+        end
-            parse_whitespace_and_skip_comments
+        if is_end_quote.call(@json[@index])
+          # end quote
+          # let us check what is before and after the quote to verify whether this is a legit end quote
+          i_quote = @index
+          o_quote = str.length
+          str << '"'
+          @index += 1
+          @output << str
-            if stop_at_delimiter ||
-               @index >= @json.length ||
-               delimiter?(@json[@index]) ||
-               quote?(@json[@index]) ||
-               digit?(@json[@index])
-              # The quote is followed by the end of the text, a delimiter, or a next value
-              parse_concatenated_string
+          parse_whitespace_and_skip_comments(skip_newline: false)
-              return true
-            end
+          if stop_at_delimiter ||
+             @index >= @json.length ||
+             delimiter?(@json[@index]) ||
+             quote?(@json[@index]) ||
+             digit?(@json[@index])
+            # The quote is followed by the end of the text, a delimiter, or a next value
+            parse_concatenated_string
-            if delimiter?(@json[prev_non_whitespace_index(i_quote - 1)])
-              # This is not the right end quote: it is preceded by a delimiter,
-              # and NOT followed by a delimiter. So, there is an end quote missing
-              # parse the string again and then stop at the first next delimiter
-              @index = i_before
-              @output = @output[...o_before]
+            return true
+          end
-              return parse_string(stop_at_delimiter: true)
-            end
+          i_prev_char = prev_non_whitespace_index(i_quote - 1)
+          prev_char = @json[i_prev_char]
+          if prev_char == ','
+            # A comma followed by a quote, like '{"a":"b,c,"d":"e"}'.
+            # We assume that the quote is a start quote, and that the end quote
+            # should have been located right before the comma but is missing.
+            @index = i_before
+            @output = @output[0...o_before]
-            # revert to right after the quote but before any whitespace, and continue parsing the string
+            return parse_string(stop_at_delimiter: false, stop_at_index: i_prev_char)
+          end
+          if delimiter?(prev_char)
+            # This is not the right end quote: it is preceded by a delimiter,
+            # and NOT followed by a delimiter. So, there is an end quote missing
+            # parse the string again and then stop at the first next delimiter
+            @index = i_before
             @output = @output[...o_before]
-            @index = i_quote + 1
-            # repair unescaped quote
-            str = "#{str[...o_quote]}\\#{str[o_quote..]}"
-          elsif stop_at_delimiter && delimiter?(@json[@index])
-            # we're in the mode to stop the string at the first delimiter
-            # because there is an end quote missing
+            return parse_string(stop_at_delimiter: true)
+          end
-            # repair missing quote
-            str = insert_before_last_whitespace(str, '"')
-            @output += str
+          # revert to right after the quote but before any whitespace, and continue parsing the string
+          @output = @output[...o_before]
+          @index = i_quote + 1
+          # repair unescaped quote
+          str = "#{str[...o_quote]}\\#{str[o_quote..]}"
+        elsif stop_at_delimiter && unquoted_string_delimiter?(@json[@index])
+          # we're in the mode to stop the string at the first delimiter
+          # because there is an end quote missing
+          # test start of an url like "https://..." (this would be parsed as a comment)
+          if @json[@index - 1] == ':' &&
+             REGEX_URL_START.match?(@json[(i_before + 1)..(@index + 1)] || '')
+            while @index < @json.length && REGEX_URL_CHAR.match?(@json[@index])
+              str << @json[@index]
+              @index += 1
+            end
+          end
-            parse_concatenated_string
+          # repair missing quote
+          str = insert_before_last_whitespace(str, '"')
+          @output << str
-            return true
-          elsif @json[@index] == BACKSLASH
-            # handle escaped content like \n or \u2605
-            char = @json[@index + 1]
-            escape_char = ESCAPE_CHARACTERS[char]
-            if escape_char
-              str += @json[@index, 2]
-              @index += 2
-            elsif char == 'u'
-              j = 2
-              j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
-              if j == 6
-                str += @json[@index, 6]
-                @index += 6
-              elsif @index + j >= @json.length
-                # repair invalid or truncated unicode char at the end of the text
-                # by removing the unicode char and ending the string here
-                @index = @json.length
-              else
-                throw_invalid_unicode_character
-              end
+          parse_concatenated_string
+          return true
+        elsif @json[@index] == BACKSLASH
+          # handle escaped content like \n or ★
+          char = @json[@index + 1]
+          escape_char = ESCAPE_CHARACTERS[char]
+          if escape_char
+            str << @json[@index, 2]
+            @index += 2
+          elsif char == 'u'
+            j = 2
+            j += 1 while j < 6 && @json[@index + j] && hex?(@json[@index + j])
+            if j == 6
+              str << @json[@index, 6]
+              @index += 6
+            elsif @index + j >= @json.length
+              # repair invalid or truncated unicode char at the end of the text
+              # by removing the unicode char and ending the string here
+              @index = @json.length
             else
-              # repair invalid escape character: remove it
-              str += char
-              @index += 2
+              throw_invalid_unicode_character
             end
+          elsif char == "\n"
+            # repair a backslash escaped newline (like in Bash scripts)
+            str << '\n'
+            @index += 2
           else
-            # handle regular characters
-            char = @json[@index]
-            if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
-              # repair unescaped double quote
-              str += "\\#{char}"
-            elsif control_character?(char)
-              # unescaped control character
-              str += CONTROL_CHARACTERS[char]
-            else
-              throw_invalid_character(char) unless valid_string_character?(char)
-              str += char
-            end
-            @index += 1
+            # repair invalid escape character: remove it
+            str << char
+            @index += 2
           end
-          if skip_escape_chars
-            # repair: skipped escape character (nothing to do)
-            skip_escape_character
+        else
+          # handle regular characters
+          char = @json[@index]
+          if char == DOUBLE_QUOTE && @json[@index - 1] != BACKSLASH
+            # repair unescaped double quote
+            str << "\\#{char}"
+          elsif control_character?(char)
+            # unescaped control character
+            str << CONTROL_CHARACTERS[char]
+          else
+            throw_invalid_character(char) unless valid_string_character?(char)
+            str << char
           end
+          @index += 1
         end
-      end
-      false
+        if skip_escape_chars
+          # repair: skipped escape character (nothing to do)
+          skip_escape_character
+        end
+      end
     end
     # Repair an unquoted string by adding quotes around it
     # Repair a MongoDB function call like NumberLong("2")
     # Repair a JSONP function call like callback({...});
-    def parse_unquoted_string
+    def parse_unquoted_string(is_key)
+      # NOTE: that the symbol can end with whitespaces: we stop at the next delimiter
+      # also, note that we allow strings to contain a slash / in order to support repairing regular expressions
       start = @index
-      @index += 1 while @index < @json.length && !delimiter_except_slash?(@json[@index]) && !quote?(@json[@index])
-      return if @index <= start
-      if @json[@index] == '(' && function_name?(@json[start...@index].strip)
-        # Repair a MongoDB function call like NumberLong("2")
-        # Repair a JSONP function call like callback({...});
-        @index += 1
+      if function_name_char_start?(@json[@index])
+        @index += 1 while @index < @json.length && function_name_char?(@json[@index])
-        parse_value
+        j = @index
+        j += 1 while whitespace?(@json[j])
-        if @json[@index] == ')'
-          # Repair: skip close bracket of function call
-          @index += 1
-          # Repair: skip semicolon after JSONP call
-          @index += 1 if @json[@index] == ';'
-        end
-      else
-        # Repair unquoted string
-        # Also, repair undefined into null
+        if @json[j] == '('
+          # repair a MongoDB function call like NumberLong("2")
+          # repair a JSONP function call like callback({...});
+          @index = j + 1
-        # First, go back to prevent getting trailing whitespaces in the string
-        @index -= 1 while whitespace?(@json[@index - 1]) && @index.positive?
+          parse_value
-        symbol = @json[start...@index]
-        @output += symbol == 'undefined' ? 'null' : symbol.inspect
+          if @json[@index] == ')'
+            # Repair: skip close bracket of function call
+            @index += 1
+            # Repair: skip semicolon after JSONP call
+            @index += 1 if @json[@index] == ';'
+          end
-        if @json[@index] == '"'
-          # We had a missing start quote, but now we encountered the end quote, so we can skip that one
-          @index += 1
+          return true
         end
       end
+      while @index < @json.length &&
+            !unquoted_string_delimiter?(@json[@index]) &&
+            !quote?(@json[@index]) &&
+            (!is_key || @json[@index] != ':')
+        @index += 1
+      end
+      # test start of an url like "https://..." (this would be parsed as a comment)
+      if @json[@index - 1] == ':' &&
+         REGEX_URL_START.match?(@json[start...(@index + 2)] || '')
+        @index += 1 while @index < @json.length && REGEX_URL_CHAR.match?(@json[@index])
+      end
+      return false if @index <= start
+      # Repair unquoted string
+      # Also, repair undefined into null
+      # First, go back to prevent getting trailing whitespaces in the string
+      @index -= 1 while @index.positive? && whitespace?(@json[@index - 1])
+      symbol = @json[start...@index]
+      @output << (symbol == 'undefined' ? 'null' : symbol.inspect)
+      if @json[@index] == '"'
+        # We had a missing start quote, but now we encountered the end quote, so we can skip that one
+        @index += 1
+      end
+      true
+    end
+    # Parse a regular expression literal like /foo/ or /foo\/bar/
+    def parse_regex
+      return false unless @json[@index] == '/'
+      start = @index
+      @index += 1
+      @index += 1 while @index < @json.length && (@json[@index] != '/' || @json[@index - 1] == BACKSLASH)
+      @index += 1
+      @output << @json[start...@index].inspect
       true
     end
     def parse_character(char)
       if @json[@index] == char
-        @output += @json[@index]
+        @output << @json[@index]
         @index += 1
         true
       else
@@ -417,19 +551,6 @@ module JSON
       end
     end
-    def parse_whitespace_and_skip_comments
-      start = @index
-      changed = parse_whitespace
-      loop do
-        changed = parse_comment
-        changed = parse_whitespace if changed
-        break unless changed
-      end
-      @index > start
-    end
     # Parse a number like 2.4 or 2.4e6
     def parse_number
       start = @index
@@ -489,7 +610,7 @@ module JSON
         num = @json[start...@index]
         has_invalid_leading_zero = num.match?(/^0\d/)
-        @output += has_invalid_leading_zero ? "\"#{num}\"" : num
+        @output << (has_invalid_leading_zero ? "\"#{num}\"" : num)
         return true
       end
@@ -503,7 +624,7 @@ module JSON
     # Parse an array like '["item1", "item2", ...]'
     def parse_array
       if @json[@index] == OPENING_BRACKET
-        @output += '['
+        @output << '['
         @index += 1
         parse_whitespace_and_skip_comments
@@ -531,7 +652,7 @@ module JSON
         end
         if @json[@index] == CLOSING_BRACKET
-          @output += ']'
+          @output << ']'
           @index += 1
         else
           # repair missing closing array bracket
@@ -580,7 +701,7 @@ module JSON
       # repair numbers cut off at the end
       # this will only be called when we end after a '.', '-', or 'e' and does not
       # change the number more than it needs to make it valid JSON
-      @output += "#{@json[start...@index]}0"
+      @output << "#{@json[start...@index]}0"
     end
     # Parse and repair Newline Delimited JSON (NDJSON):

data/sig/json/repair/string_utils.rbs ADDED Viewed

@@ -0,0 +1,165 @@
+module JSON
+  module Repair
+    module StringUtils
+      @output: untyped
+      @index: untyped
+      # Constants for character chars
+      BACKSLASH: "\\"
+      SLASH: "/"
+      ASTERISK: "*"
+      OPENING_BRACE: "{"
+      CLOSING_BRACE: "}"
+      OPENING_BRACKET: "["
+      CLOSING_BRACKET: "]"
+      OPEN_PARENTHESIS: "("
+      CLOSE_PARENTHESIS: ")"
+      SPACE: " "
+      NEWLINE: "\n"
+      TAB: "\t"
+      RETURN: "\r"
+      BACKSPACE: "\b"
+      FORM_FEED: "\f"
+      DOUBLE_QUOTE: "\""
+      PLUS: "+"
+      MINUS: "-"
+      QUOTE: "'"
+      ZERO: "0"
+      NINE: "9"
+      COMMA: ","
+      DOT: "."
+      COLON: ":"
+      SEMICOLON: ";"
+      UPPERCASE_A: "A"
+      LOWERCASE_A: "a"
+      UPPERCASE_E: "E"
+      LOWERCASE_E: "e"
+      UPPERCASE_F: "F"
+      LOWERCASE_F: "f"
+      NON_BREAKING_SPACE: ::String
+      MONGOLIAN_VOWEL_SEPARATOR: ::String
+      EN_QUAD: ::String
+      ZERO_WIDTH_SPACE: ::String
+      NARROW_NO_BREAK_SPACE: ::String
+      MEDIUM_MATHEMATICAL_SPACE: ::String
+      IDEOGRAPHIC_SPACE: ::String
+      ZERO_WIDTH_NO_BREAK_SPACE: ::String
+      DOUBLE_QUOTE_LEFT: ::String
+      DOUBLE_QUOTE_RIGHT: ::String
+      QUOTE_LEFT: ::String
+      QUOTE_RIGHT: ::String
+      GRAVE_ACCENT: "`"
+      ACUTE_ACCENT: ::String
+      REGEX_DELIMITER: ::Regexp
+      REGEX_UNQUOTED_STRING_DELIMITER: ::Regexp
+      REGEX_START_OF_VALUE: ::Regexp
+      REGEX_URL_START: ::Regexp
+      REGEX_URL_CHAR: ::Regexp
+      REGEX_FUNCTION_NAME_CHAR_START: ::Regexp
+      REGEX_FUNCTION_NAME_CHAR: ::Regexp
+      # Functions to check character chars
+      def hex?: (untyped char) -> untyped
+      def digit?: (untyped char) -> untyped
+      def valid_string_character?: (untyped char) -> untyped
+      def delimiter?: (untyped char) -> untyped
+      def unquoted_string_delimiter?: (untyped char) -> untyped
+      def function_name_char_start?: (untyped char) -> untyped
+      def function_name_char?: (untyped char) -> untyped
+      def start_of_value?: (untyped char) -> untyped
+      def control_character?: (untyped char) -> untyped
+      def whitespace?: (untyped char) -> untyped
+      def whitespace_except_newline?: (untyped char) -> untyped
+      def special_whitespace?: (untyped char) -> untyped
+      def quote?: (untyped char) -> untyped
+      def double_quote?: (untyped char) -> untyped
+      def single_quote?: (untyped char) -> untyped
+      def double_quote_like?: (untyped char) -> untyped
+      def single_quote_like?: (untyped char) -> untyped
+      # Strip last occurrence of text_to_strip from text
+      def strip_last_occurrence: (untyped text, untyped text_to_strip, ?strip_remaining_text: bool) -> untyped
+      def insert_before_last_whitespace: (untyped text, untyped text_to_insert) -> untyped
+      # Parse keywords true, false, null
+      # Repair Python keywords True, False, None
+      # Repair Ruby keyword nil
+      def parse_keywords: () -> untyped
+      def parse_keyword: (untyped name, untyped value) -> (true | false)
+      def remove_at_index: (untyped text, untyped start, untyped count) -> untyped
+      def ends_with_comma_or_newline?: (untyped text) -> untyped
+    end
+  end
+end

data/sig/json/repair.rbs CHANGED Viewed

@@ -1,7 +1,10 @@
 module JSON
+  # Error class declared as a subclass of StandardError.
+  # NOTE(review): presumably raised when the input cannot be repaired --
+  # confirm in lib/json/repairer.rb.
+  class JSONRepairError < StandardError
+  end
   module Repair
-    VERSION: String
+    VERSION: ::String
   end
-  def self.repair(String) -> ?String
+  # Module-level entry point: takes the JSON text as a String and returns a String.
+  def self.repair: (::String json) -> ::String
 end

data/sig/json/repairer.rbs ADDED Viewed

@@ -0,0 +1,103 @@
+module JSON
+  # Type signatures for JSON::Repairer.
+  # Public surface: `initialize` (takes the JSON text as a String) and `repair`
+  # (returns a String). Every method declared after `private` is private.
+  # Note: `(true | false)` in the return types below is what RBS calls `bool`.
+  class Repairer
+    # Instance state: @json and @output are Strings, @index is an Integer.
+    # NOTE(review): presumably the input text, the repaired text built so far,
+    # and the current read position -- confirm in lib/json/repairer.rb.
+    @json: ::String
+    @index: Integer
+    @output: ::String
+    # Mixes in the helper methods declared for Repair::StringUtils.
+    include Repair::StringUtils
+    # Lookup tables with String keys; values are restricted to the literal
+    # string types listed.
+    CONTROL_CHARACTERS: ::Hash[::String, "\\b" | "\\f" | "\\n" | "\\r" | "\\t"]
+    ESCAPE_CHARACTERS: ::Hash[::String, "\"" | "\\" | "/" | "\b" | "\f" | "\n" | "\r" | "\t"]
+    # Arrays of Strings; these have the same type as the `blocks` argument of
+    # the Markdown code block methods below.
+    MARKDOWN_OPEN_BLOCKS: ::Array[::String]
+    MARKDOWN_CLOSE_BLOCKS: ::Array[::String]
+    def initialize: (::String json) -> void
+    def repair: () -> ::String
+    private
+    def parse_value: () -> untyped
+    def parse_whitespace: (?skip_newline: bool) -> (true | false)
+    def parse_comment: () -> (true | false)
+    # Find and skip over a Markdown fenced code block
+    def parse_markdown_code_block: (::Array[::String] blocks) -> (true | false)
+    def skip_markdown_code_block: (::Array[::String] blocks) -> (true | false)
+    # Parse an object like '{"key": "value"}'
+    def parse_object: () -> (false | true)
+    def skip_character: (untyped char) -> (true | false)
+    # Skip ellipsis like "[1,2,3,...]" or "[1,2,3,...,9]" or "[...,7,8,9]"
+    # or a similar construct in objects.
+    def skip_ellipsis: () -> untyped
+    # Parse a string enclosed by double quotes "...". Can contain escaped quotes
+    # Repair strings enclosed in single quotes or special quotes
+    # Repair an escaped string
+    #
+    # The function can run in two stages:
+    # - First, it assumes the string has a valid end quote
+    # - If it turns out that the string does not have a valid end quote followed
+    #   by a delimiter (which should be the case), the function runs again in a
+    #   more conservative way, stopping the string at the first next delimiter
+    #   and fixing the string by inserting a quote there, or stopping at a
+    #   stop index detected in the first iteration.
+    def parse_string: (?stop_at_delimiter: bool, ?stop_at_index: ::Integer) -> (untyped | true | false)
+    # Repair an unquoted string by adding quotes around it
+    # Repair a MongoDB function call like NumberLong("2")
+    # Repair a JSONP function call like callback({...});
+    def parse_unquoted_string: (bool is_key) -> (false | true)
+    # Parse a regular expression literal like /foo/ or /foo\/bar/
+    def parse_regex: () -> (false | true)
+    def parse_character: (untyped char) -> (true | false)
+    def parse_whitespace_and_skip_comments: (?skip_newline: bool) -> untyped
+    # Parse a number like 2.4 or 2.4e6
+    def parse_number: () -> (true | false)
+    def at_end_of_number?: () -> untyped
+    # Parse an array like '["item1", "item2", ...]'
+    def parse_array: () -> (true | false)
+    def prev_non_whitespace_index: (untyped start) -> untyped
+    # Repair concatenated strings like "hello" + "world", change this into "helloworld"
+    def parse_concatenated_string: () -> untyped
+    def repair_number_ending_with_numeric_symbol: (untyped start) -> untyped
+    # Parse and repair Newline Delimited JSON (NDJSON):
+    # multiple JSON objects separated by a newline character
+    def parse_newline_delimited_json: () -> untyped
+    def skip_escape_character: () -> untyped
+    # NOTE(review): the throw_* helpers below are all typed `-> untyped`;
+    # presumably each raises an error instead of returning -- confirm in
+    # lib/json/repairer.rb.
+    def throw_invalid_character: (untyped char) -> untyped
+    def throw_unexpected_character: () -> untyped
+    def throw_unexpected_end: () -> untyped
+    def throw_object_key_expected: () -> untyped
+    def throw_colon_expected: () -> untyped
+    def throw_invalid_unicode_character: () -> untyped
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: json-repair
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Aleksandr Zykov
-autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-06-04 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies: []
 description: This is a simple gem that repairs broken JSON strings.
 email:
@@ -24,11 +23,14 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- Steepfile
 - lib/json/repair.rb
 - lib/json/repair/string_utils.rb
 - lib/json/repair/version.rb
 - lib/json/repairer.rb
 - sig/json/repair.rbs
+- sig/json/repair/string_utils.rbs
+- sig/json/repairer.rbs
 homepage: https://github.com/sashazykov/json-repair-rb
 licenses:
 - ISC
@@ -37,7 +39,6 @@ metadata:
   homepage_uri: https://github.com/sashazykov/json-repair-rb
   source_code_uri: https://github.com/sashazykov/json-repair-rb
   changelog_uri: https://github.com/sashazykov/json-repair-rb/blob/main/CHANGELOG.md
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -52,8 +53,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.10
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: Repairs broken JSON strings.
 test_files: []