RubyGems - json_mend - Versions diffs - 0.1.7 → 0.2.1 - Mend

json_mend 0.1.7 → 0.2.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/.rubocop.yml +6 -3
data/.tool-versions +1 -1
data/README.md +13 -1
data/lib/json_mend/parser.rb +186 -98
data/lib/json_mend/version.rb +1 -1
data/lib/json_mend.rb +10 -4
metadata +4 -4
/data/sig/{manifest.yaml → manifest.yml} +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e52539860bdf2bf56f8a621ccb622547649cb178a3ff61deb3b345cc3704fb73
-  data.tar.gz: 4575c06664dfb59e559a883d0a9bac575250c074e9fde3a7a767a63d0f4a649c
+  metadata.gz: 9a3bfc54ceae164d2837fbcdf3751e79359397b735693ec990954fee19bfa60e
+  data.tar.gz: 2dfea1b0a6ada799891385ec4c35a83153005f5b344e23b7a9921c726a94c220
 SHA512:
-  metadata.gz: f88a854314da8e06e38dc85656a006d31c92daa08e016cf6f2a04ba1f560b2b182e15fdc7d786a2970d2a41ed70ef3275a40dc7770501db994ec645429272083
-  data.tar.gz: 6ea1af49216aebe97b7947b775d792f82b2de2feae72e08d5435bee797e108d060df6cb23f5d1939c0375e10ffc497d703c58979ca431cd06dc3bd201ff56309
+  metadata.gz: '0592fea3e3859aaafbc6b2508b03de3a54eefd17e4520375ce1163e929ed24be10ad16b2936db2175ec79d9f9f4dd7acbd82ba72b50c3df3ea347ffe8436ded6'
+  data.tar.gz: 6708504ef9b2c1f68f0bdc1da889c254580a169066d37db99993468f78bcb9c3f8ca351dfaafecbc62780d319c12e51a4536b6da70a589d450ead3f407ed8ea7

data/.rubocop.yml CHANGED Viewed

@@ -7,10 +7,10 @@ AllCops:
   SuggestExtensions: false
 Metrics/AbcSize:
-  Max: 65
+  Max: 70
 Metrics/ClassLength:
-  Max: 820
+  Max: 900
 Metrics/CyclomaticComplexity:
   Max: 35
@@ -18,11 +18,14 @@ Metrics/CyclomaticComplexity:
 Metrics/MethodLength:
   Max: 80
+Metrics/BlockLength:
+  Max: 40
 Metrics/PerceivedComplexity:
   Max: 35
 Metrics/BlockNesting:
-  Max: 5
+  Max: 8
 Naming/PredicateMethod:
   Enabled: false

data/.tool-versions CHANGED Viewed

@@ -1 +1 @@
-ruby 3.4.8
+ruby 4.0.2

data/README.md CHANGED Viewed

@@ -1,6 +1,18 @@
 # JsonMend [![Ruby Checks](https://github.com/le0pard/json_mend/actions/workflows/main.yml/badge.svg)](https://github.com/le0pard/json_mend/actions/workflows/main.yml)
-`JsonMend` is a robust Ruby gem designed to repair broken or malformed JSON strings. It is specifically optimized to handle common errors found in JSON generated by Large Language Models (LLMs), such as missing quotes, trailing commas, unescaped characters, and stray comments.
+`JsonMend` is a robust Ruby gem designed to repair broken or malformed JSON strings. It is specifically optimized to handle common errors found in JSON generated by Large Language Models (LLMs), such as missing quotes, trailing commas, unescaped characters, and stray comments
+# Why?
+Integrating Large Language Models (LLMs) into software workflows often requires structured data output. While prompting an LLM to "return JSON" is a common pattern, models are probabilistic text generators, not strict serialization engines. They frequently treat JSON syntax as a loose suggestion rather than a rigid standard.
+Standard `JSON.parse` is fragile when facing the chaotic output of an LLM. Common failure modes include:
+- **Hallucinated Syntax**: LLMs often include trailing commas, code comments (`//` or `#`), single quotes, or Python-style literals (`True`, `False`) that break standard JSON parsers
+- **"Chatty" Wrappers**: Models frequently wrap JSON in Markdown code blocks (`json ...`) or include conversational preambles (`Here is the data you requested: ...`), turning valid data into invalid syntax errors
+- **Truncation**: JSON is verbose. Output limits often cut off the response mid-stream, leaving unclosed brackets and braces
+`JsonMend` acts as a middleware layer between the messy text output of an LLM and your Ruby application. It aggressively parses, cleans, and repairs the raw string—handling truncation, stripping garbage text, and normalizing syntax—to ensure you get usable structured data instead of a `JSON::ParserError`
 ## Features

data/lib/json_mend/parser.rb CHANGED Viewed

@@ -1,15 +1,23 @@
 # frozen_string_literal: true
 require 'strscan'
-require 'set'
 # Root module
 module JsonMend
   # The core parser that does the heavy lifting of fixing the JSON
   class Parser
+    MAX_ALLOWED_DEPTH = 100
     COMMENT_DELIMETERS = ['#', '/'].freeze
     NUMBER_CHARS = Set.new('0123456789-.eE/,_'.chars).freeze
     STRING_DELIMITERS = ['"', "'", '“', '”'].freeze
+    SKIP_CHARS_REGEX_CACHE = {
+      '"' => /"/,
+      "'" => /'/,
+      '“' => /“/,
+      '”' => /”/,
+      ':' => /:/,
+      '}' => /\}/
+    }.freeze
     ESCAPE_MAPPING = {
       't' => "\t",
       'n' => "\n",
@@ -40,6 +48,7 @@ module JsonMend
     def initialize(json_string)
       @scanner = StringScanner.new(json_string)
       @context = []
+      @depth = 0
     end
     # Kicks off the parsing process. This is a direct port of the robust Python logic
@@ -63,8 +72,11 @@ module JsonMend
             # Ignore strings that look like closing braces garbage (e.g. "}", " ] ")
             next if new_json.is_a?(String) && new_json.strip.match?(/^[}\]]+$/)
-            json.pop if both_hash?(json.last, new_json)
-            json << new_json
+            if both_hash?(json.last, new_json)
+              deep_merge_hashes!(json.last, new_json)
+            else
+              json << new_json
+            end
           end
         end
@@ -76,6 +88,33 @@ module JsonMend
     private
+    def with_depth_check
+      @depth += 1
+      raise JSON::NestingError, "nesting of #{@depth} is too deep" if @depth > MAX_ALLOWED_DEPTH
+      yield
+    ensure
+      @depth -= 1
+    end
+    def deep_merge_hashes!(target, source)
+      source.each do |key, new_val|
+        if target.key?(key)
+          old_val = target[key]
+          if old_val.is_a?(Hash) && new_val.is_a?(Hash)
+            deep_merge_hashes!(old_val, new_val)
+          elsif old_val.is_a?(Array) && new_val.is_a?(Array)
+            target[key] = old_val + new_val
+          else
+            target[key] = new_val
+          end
+        else
+          target[key] = new_val
+        end
+      end
+      target
+    end
     def parse_json
       until @scanner.eos?
         char = peek_char
@@ -123,51 +162,53 @@ module JsonMend
     # Parses a JSON object.
     def parse_object
-      object = {}
+      with_depth_check do
+        object = {}
-      loop do
-        skip_whitespaces
+        loop do
+          skip_whitespaces
-        # Explicitly consume comments to ensure they don't hide separators (like commas)
-        # or get parsed as part of the next key.
-        if COMMENT_DELIMETERS.include?(peek_char)
-          parse_comment
-          next
-        end
+          # Explicitly consume comments to ensure they don't hide separators (like commas)
+          # or get parsed as part of the next key.
+          if COMMENT_DELIMETERS.include?(peek_char)
+            parse_comment
+            next
+          end
-        # >> PRIMARY EXIT: End of object or end of string.
-        break if @scanner.eos? || @scanner.scan('}') || peek_char == ']'
+          # >> PRIMARY EXIT: End of object or end of string.
+          break if @scanner.eos? || @scanner.scan('}') || peek_char == ']'
-        # Leniently consume any leading junk characters (like stray commas or colons)
-        # that might appear before a key.
-        @scanner.skip(/[,\s]+/)
+          # Leniently consume any leading junk characters (like stray commas or colons)
+          # that might appear before a key.
+          @scanner.skip(/[,\s]+/)
-        # --- Delegate to a helper to parse the next Key-Value pair ---
-        key, value, colon_found = parse_object_pair(object)
-        next if SKIPPED_KEYS.include?(key)
+          # --- Delegate to a helper to parse the next Key-Value pair ---
+          key, value, colon_found = parse_object_pair(object)
+          next if SKIPPED_KEYS.include?(key)
-        # If the helper returns nil for the key, it signals that we should
-        # stop parsing this object (e.g. a duplicate key was found,
-        # indicating the start of a new object).
-        if key.nil?
-          @scanner.scan('}')
-          break
-        end
+          # If the helper returns nil for the key, it signals that we should
+          # stop parsing this object (e.g. a duplicate key was found,
+          # indicating the start of a new object).
+          if key.nil?
+            @scanner.scan('}')
+            break
+          end
-        # Assign the parsed pair to our object, avoiding empty keys.
-        # But only if we didn't firmly establish the key with a colon already.
-        skip_whitespaces
-        if peek_char == ':' && !colon_found
-          key = value.to_s
-          @scanner.getch # consume ':'
-          value = parse_object_value
+          # Assign the parsed pair to our object, avoiding empty keys.
+          # But only if we didn't firmly establish the key with a colon already.
+          skip_whitespaces
+          if peek_char == ':' && !colon_found
+            key = value.to_s
+            @scanner.getch # consume ':'
+            value = parse_object_value
+          end
+          # Assign the parsed pair to our object.
+          object[key] = value
         end
-        # Assign the parsed pair to our object.
-        object[key] = value
+        object
       end
-      object
     end
     # Attempts to parse a single key-value pair.
@@ -294,60 +335,62 @@ module JsonMend
     # Assumes the opening '[' has already been consumed by the caller.
     # This is a lenient parser designed to handle malformed JSON.
     def parse_array
-      arr = []
-      @context.push(:array)
-      char = peek_char
-      # Stop when you find the closing bracket or an invalid character like '}'
-      while !@scanner.eos? && !TERMINATORS_ARRAY.include?(char)
-        skip_whitespaces
+      with_depth_check do
+        arr = []
+        @context.push(:array)
         char = peek_char
-        # Check for comments explicitly inside array to avoid recursion or garbage consumption issues
-        if COMMENT_DELIMETERS.include?(char)
-          parse_comment
+        # Stop when you find the closing bracket or an invalid character like '}'
+        while !@scanner.eos? && !TERMINATORS_ARRAY.include?(char)
+          skip_whitespaces
           char = peek_char
-          next
-        end
-        value = ''
-        if STRING_DELIMITERS.include?(char)
-          # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
-          # So we are going to check if this string is followed by a : or not
-          # And either parse the string or parse the object
-          i = 1
-          i = skip_to_character(char, start_idx: i)
-          i = skip_whitespaces_at(start_idx: i + 1)
-          value = (peek_char(i) == ':' ? parse_object : parse_string)
-        else
-          value = parse_json
-        end
+          # Check for comments explicitly inside array to avoid recursion or garbage consumption issues
+          if COMMENT_DELIMETERS.include?(char)
+            parse_comment
+            char = peek_char
+            next
+          end
-        # Handle JSON_STOP_TOKEN from parse_json (EOS or consumed terminator)
-        if value == JSON_STOP_TOKEN
-          # Do nothing, just skipped garbage
-        elsif strictly_empty?(value)
-          # Only consume if we didn't just hit a terminator that parse_json successfully respected
-          @scanner.getch unless value.nil? && TERMINATORS_ARRAY.include?(peek_char)
-        elsif value == '...' && @scanner.string.getbyte(@scanner.pos - 1) == 46
-          # just skip if the previous byte was a dot (46)
-        else
-          arr << value
-        end
+          value = ''
+          if STRING_DELIMITERS.include?(char)
+            # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
+            # So we are going to check if this string is followed by a : or not
+            # And either parse the string or parse the object
+            i = 1
+            i = skip_to_character(char, start_idx: i)
+            i = skip_whitespaces_at(start_idx: i + 1)
+            value = (peek_char(i) == ':' ? parse_object : parse_string)
+          else
+            value = parse_json
+          end
+          # Handle JSON_STOP_TOKEN from parse_json (EOS or consumed terminator)
+          if value == JSON_STOP_TOKEN
+            # Do nothing, just skipped garbage
+          elsif strictly_empty?(value)
+            # Only consume if we didn't just hit a terminator that parse_json successfully respected
+            @scanner.getch unless value.nil? && TERMINATORS_ARRAY.include?(peek_char)
+          elsif value == '...' && @scanner.string.getbyte(@scanner.pos - 1) == 46
+            # just skip if the previous byte was a dot (46)
+          else
+            arr << value
+          end
-        char = peek_char
-        while char && char != ']' && (char.match?(/\s/) || char == ',')
-          @scanner.getch
           char = peek_char
+          while char && char != ']' && (char.match?(/\s/) || char == ',')
+            @scanner.getch
+            char = peek_char
+          end
         end
-      end
-      # Handle a potentially missing closing bracket, a common LLM error.
-      unless @scanner.scan(']')
-        @scanner.scan('}') # Consume } if it was the closer
-      end
-      @context.pop
+        # Handle a potentially missing closing bracket, a common LLM error.
+        unless @scanner.scan(']')
+          @scanner.scan('}') # Consume } if it was the closer
+        end
+        @context.pop
-      arr
+        arr
+      end
     end
     # Parses a JSON string. This is a very lenient parser designed to handle
@@ -744,7 +787,9 @@ module JsonMend
         bk = 1
         slashes = 0
         # Look back in the string buffer directly for speed
-        while (char_code = @scanner.string.getbyte(@scanner.pos - 1 - bk)) && char_code == 92 # 92 is backslash
+        while (@scanner.pos - 1 - bk >= 0) &&
+              (char_code = @scanner.string.getbyte(@scanner.pos - 1 - bk)) &&
+              char_code == 92 # 92 is backslash
           slashes += 1
           bk += 1
         end
@@ -902,7 +947,37 @@ module JsonMend
           # Validate valid hex digits
           if hex_parts.length == num_chars && hex_parts.all? { |c| c.match?(/[0-9a-fA-F]/) }
             string_parts.pop
-            string_parts << hex_parts.join.to_i(16).chr('UTF-8')
+            hex_val = hex_parts.join.to_i(16)
+            if char == 'u' && hex_val.between?(0xD800, 0xDBFF)
+              # Handle high surrogate pair
+              saved_pos = @scanner.pos
+              if @scanner.scan(/\\u([0-9a-fA-F]{4})/)
+                low_surrogate = @scanner[1].to_i(16)
+                if low_surrogate.between?(0xDC00, 0xDFFF)
+                  # Combine surrogates into a valid UTF-8 character
+                  code_point = 0x10000 + ((hex_val - 0xD800) * 0x400) + (low_surrogate - 0xDC00)
+                  string_parts << code_point.chr('UTF-8')
+                else
+                  # Invalid low surrogate: backtrack and use replacement char
+                  @scanner.pos = saved_pos
+                  string_parts << "\uFFFD"
+                end
+              else
+                # Missing low surrogate
+                string_parts << "\uFFFD"
+              end
+            elsif char == 'u' && hex_val.between?(0xDC00, 0xDFFF)
+              # Unpaired low surrogate
+              string_parts << "\uFFFD"
+            else
+              # Regular code point or hex escape
+              begin
+                string_parts << hex_val.chr('UTF-8')
+              rescue RangeError
+                string_parts << "\uFFFD"
+              end
+            end
             # Scanner is already advanced past digits
             char = peek_char
@@ -1014,7 +1089,18 @@ module JsonMend
         if scanned_str.end_with?('.')
           Float(scanned_str[0...-1])
         elsif scanned_str.include?(',')
-          Float(scanned_str.tr(',', '.'))
+          # Check if commas are being used as thousands separators (e.g., 1,234 or 1,234,567.89)
+          if scanned_str.count(',') > 1 || scanned_str.match?(/,\d{3}(?:\.\d+)?$/)
+            cleaned = scanned_str.delete(',')
+            if cleaned.match?(/[.eE]/)
+              Float(cleaned)
+            else
+              Integer(cleaned, 10)
+            end
+          else
+            # Treat single comma as a decimal point (European style, e.g., 1,5 -> 1.5)
+            Float(scanned_str.tr(',', '.'))
+          end
         elsif scanned_str.match?(/[.eE]/)
           Float(scanned_str)
         else
@@ -1060,20 +1146,24 @@ module JsonMend
         if context_contain?(:object_key)
           # If parsing a key, we must stop at ':' and structural closers
-          @scanner.scan_until(/(?=[\n\r:}\]])/)
+          @scanner.scan_until(/(?=[\n\r:}\]]|\\n|\\r)/) || @scanner.terminate
         elsif in_array && in_object
           # Nested ambiguity, stop at any closer
-          @scanner.scan_until(/(?=[\n\r}\]])/)
+          @scanner.scan_until(/(?=[\n\r}\]]|\\n|\\r)/) || @scanner.terminate
         elsif in_array
           # Inside array, stop at ']'
-          @scanner.scan_until(/(?=[\n\r\]])/)
+          @scanner.scan_until(/(?=[\n\r\]]|\\n|\\r)/) || @scanner.terminate
         elsif in_object
           # Inside object value, stop at '}'
-          @scanner.scan_until(/(?=[\n\r}])/)
+          @scanner.scan_until(/(?=[\n\r}]|\\n|\\r)/) || @scanner.terminate
         else
           # Top level or neutral, stop at newline
-          @scanner.scan_until(/(?=[\n\r])/)
+          @scanner.scan_until(/(?=[\n\r]|\\n|\\r)/) || @scanner.terminate
         end
+        # Consume literal escaped newlines so they don't break subsequent parsing.
+        # (Real newlines will be left alone here and consumed normally by skip_whitespaces).
+        @scanner.skip(/\\n|\\r/)
       else
         # The character at the current position (likely '/') is not the start of a
         # valid comment. To prevent an infinite loop in the calling parser, we must
@@ -1088,13 +1178,11 @@ module JsonMend
     # It quickly iterates to find a character, handling escaped characters, and
     # returns the index (offset) from the scanner
     def skip_to_character(characters, start_idx: 0)
-      pattern = if characters.is_a?(Regexp)
-                  characters
-                else
-                  # Escape if it's a string, join if it's an array
-                  chars = Array(characters).map { |c| Regexp.escape(c.to_s) }
-                  Regexp.new(chars.join('|'))
-                end
+      pattern = SKIP_CHARS_REGEX_CACHE.fetch(characters, nil)
+      if pattern.nil?
+        chars = Array(characters).map { |c| Regexp.escape(c.to_s) }
+        pattern = Regexp.new(chars.join('|'))
+      end
       saved_pos = @scanner.pos
       # Skip start_idx

data/lib/json_mend/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module JsonMend
-  VERSION = '0.1.7'
+  VERSION = '0.2.1'
 end

data/lib/json_mend.rb CHANGED Viewed

@@ -16,12 +16,18 @@ module JsonMend
     def repair(json_string, return_objects: false)
       # First, attempt to parse the string with the standard library.
       repaired_json = begin
-        JSON.parse(
+        parsed = JSON.parse(
           json_string,
           allow_trailing_comma: true,
           allow_control_characters: true
         )
-      rescue JSON::ParserError
+        # Verify the native parser didn't produce invalid UTF-8 (like unpaired surrogates)
+        # by ensuring it can safely dump its own output.
+        JSON.dump(parsed)
+        parsed
+      rescue JSON::ParserError, JSON::GeneratorError
         parser = Parser.new(json_string)
         parser.parse
       end
@@ -29,8 +35,8 @@ module JsonMend
       # Avoids returning `null` for empty results, returns the object directly
       return repaired_json if return_objects
-      # For string output, ensure we don't just return the string "null" for an empty input
-      repaired_json.nil? ? '' : JSON.dump(repaired_json)
+      # Always return a valid JSON string. For unparseable input, `nil` dumps to "null".
+      JSON.dump(repaired_json)
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: json_mend
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.2.1
 platform: ruby
 authors:
 - Oleksii Vasyliev
@@ -59,7 +59,7 @@ files:
 - lib/json_mend/parser.rb
 - lib/json_mend/version.rb
 - sig/json_mend.rbs
-- sig/manifest.yaml
+- sig/manifest.yml
 homepage: https://github.com/le0pard/json_mend
 licenses:
 - MIT
@@ -77,14 +77,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 3.1.0
+      version: 3.2.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.9
+rubygems_version: 4.0.6
 specification_version: 4
 summary: Repair broken JSON
 test_files: []

/data/sig/{manifest.yaml → manifest.yml} RENAMED Viewed

File without changes