PyPI - json-repair - Versions diffs - 0.16.3__tar.gz → 0.17.0__tar.gz - Mend

json-repair 0.16.3__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

{json_repair-0.16.3/src/json_repair.egg-info → json_repair-0.17.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: json_repair
-Version: 0.16.3
+Version: 0.17.0
 Summary: A package to repair broken json strings
 Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
 License: MIT License

{json_repair-0.16.3 → json_repair-0.17.0}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "json_repair"
-version = "0.16.3"
+version = "0.17.0"
 license = {file = "LICENSE"}
 authors = [
   { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },

{json_repair-0.16.3 → json_repair-0.17.0}/src/json_repair/json_repair.py RENAMED Viewed

@@ -11,7 +11,7 @@ This module will parse the JSON file following the BNF definition:
     <container> ::= <object> | <array>
     <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
-    <object> ::= '{' [ <string> *(', ' <member>) ] '}' ; A sequence of 'members'
+    <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
     <member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
 If something is wrong (a missing parantheses or quotes for example) it will use a few simple heuristics to fix the JSON string:
@@ -27,9 +27,11 @@ from typing import Any, Dict, List, Union, TextIO
 class JSONParser:
-    def __init__(self, json_str: str, logging: bool = False) -> None:
+    def __init__(self, json_str: str, json_fd: TextIO, logging: bool = False) -> None:
         # The string to parse
         self.json_str = json_str
+        # Alternatively, the file description with a json file in it
+        self.json_fd = json_fd
         # Index is our iterator that will keep track of which character we are looking at right now
         self.index = 0
         # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -56,48 +58,28 @@ class JSONParser:
             return ""
         # <object> starts with '{'
         # but an object key must be a string
-        elif self.get_context() != "object_key" and char == "{":
+        elif char == "{":
             self.index += 1
             return self.parse_object()
         # <array> starts with '['
         # but an object key must be a string
-        elif self.get_context() != "object_key" and char == "[":
+        elif char == "[":
             self.index += 1
             return self.parse_array()
         # there can be an edge case in which a key is empty and at the end of an object
         # like "key": }. We return an empty string here to close the object properly
-        elif self.get_context() != "object_key" and char == "}":
+        elif char == "}":
             self.log(
                 "At the end of an object we found a key with missing value, skipping",
                 "info",
             )
             return ""
-        # <string> starts with '"'
-        elif char == '"':
+        # <string> starts with a quote
+        elif char in ['"', "'", "“"] or char.isalpha():
             return self.parse_string()
-        elif char == "'":
-            return self.parse_string(string_quotes="'")
-        elif char == "“":
-            return self.parse_string(string_quotes=["“", "”"])
         # <number> starts with [0-9] or minus
-        elif (
-            self.get_context() != ""
-            and self.get_context() != "object_key"
-            and char.isdigit()
-            or char == "-"
-            or char == "."
-        ):
+        elif char.isdigit() or char == "-" or char == ".":
             return self.parse_number()
-        # <boolean> could be (T)rue or (F)alse or (N)ull
-        elif (
-            self.get_context() != ""
-            and self.get_context() != "object_key"
-            and char.lower() in ["t", "f", "n"]
-        ):
-            return self.parse_boolean_or_null()
-        # This might be a <string> that is missing the starting '"'
-        elif self.get_context() != "" and char.isalpha():
-            return self.parse_string()
         # If everything else fails, we just ignore and move on
         else:
             self.index += 1
@@ -117,11 +99,9 @@ class JSONParser:
             # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
             if (self.get_char_at() or "") == ":":
                 self.log(
-                    "While parsing an object we found a : before a key, replacing with ,",
+                    "While parsing an object we found a : before a key, ignoring",
                     "info",
                 )
-                self.remove_char_at()
-                self.insert_char_at(",")
                 self.index += 1
             # We are now searching for they string key
@@ -133,7 +113,7 @@ class JSONParser:
             # <member> starts with a <string>
             key = ""
             while key == "" and self.get_char_at():
-                key = self.parse_json()
+                key = self.parse_string()
                 # This can happen sometimes like { "": "value" }
                 if key == "" and self.get_char_at() == ":":
@@ -153,10 +133,10 @@ class JSONParser:
             # An extreme case of missing ":" after a key
             if (self.get_char_at() or "") != ":":
                 self.log(
-                    "While parsing an object we missed a : after a key, adding it back",
+                    "While parsing an object we missed a : after a key",
                     "info",
                 )
-                self.insert_char_at(":")
             self.index += 1
             self.reset_context()
             self.set_context("object_value")
@@ -176,10 +156,10 @@ class JSONParser:
         # Especially at the end of an LLM generated json you might miss the last "}"
         if (self.get_char_at() or "}") != "}":
             self.log(
-                "While parsing an object, we couldn't find the closing }, adding it back",
+                "While parsing an object, we couldn't find the closing }, ignoring",
                 "info",
             )
-            self.insert_char_at("}")
         self.index += 1
         return obj
@@ -205,6 +185,10 @@ class JSONParser:
                 char = self.get_char_at()
             # If this is the right value of an object and we are closing the object, it means the array is over
             if self.get_context() == "object_value" and char == "}":
+                self.log(
+                    "While parsing an array inside an object, we got to the end without finding a ]. Stopped parsing",
+                    "info",
+                )
                 break
         # Especially at the end of an LLM generated json you might miss the last "]"
@@ -217,35 +201,68 @@ class JSONParser:
             if char == ",":
                 # Remove trailing "," before adding the "]"
                 self.log(
-                    "While parsing an array, remove a trailing , before adding ]",
+                    "While parsing an array, found a trailing , before adding ]",
                     "info",
                 )
-                self.remove_char_at()
-            self.insert_char_at("]")
             self.index -= 1
         self.index += 1
         self.reset_context()
         return arr
-    def parse_string(self, string_quotes=False) -> str:
+    def parse_string(self) -> str:
         # <string> is a string of valid characters enclosed in quotes
         # i.e. { name: "John" }
         # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
         # Flag to manage corner cases related to missing starting quote
-        fixed_quotes = False
+        missing_quotes = False
         doubled_quotes = False
         lstring_delimiter = rstring_delimiter = '"'
-        if isinstance(string_quotes, list):
-            lstring_delimiter = string_quotes[0]
-            rstring_delimiter = string_quotes[1]
-        elif isinstance(string_quotes, str):
-            lstring_delimiter = rstring_delimiter = string_quotes
+        char = self.get_char_at()
+        # A valid string can only start with a valid quote or, in our case, with a literal
+        while char and char not in ['"', "'", "“"] and not char.isalpha():
+            self.index += 1
+            char = self.get_char_at()
+        # Ensuring we use the right delimiter
+        if char == "'":
+            lstring_delimiter = rstring_delimiter = "'"
+        elif char == "“":
+            lstring_delimiter = "“"
+            rstring_delimiter = "”"
+        elif char.isalpha():
+            # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
+            if char.lower() in ["t", "f", "n"]:
+                value = self.parse_boolean_or_null()
+                if value != "":
+                    return value
+            self.log(
+                "While parsing a string, we found a literal instead of a quote",
+                "info",
+            )
+            if self.get_context() == "":
+                # A string literal in the wild isn't a valid json and not something we can fix
+                self.log(
+                    "While parsing a string, we found a literal outside of context, ignoring it",
+                    "info",
+                )
+                self.index += 1
+                return self.parse_json()
+            self.log(
+                "While parsing a string, we found no starting quote, ignoring", "info"
+            )
+            missing_quotes = True
+        if not missing_quotes:
+            self.index += 1
         # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
-        if self.get_char_at(1) == lstring_delimiter:
+        if self.get_char_at() == lstring_delimiter:
             # This is a valid exception only if it's closed by a double delimiter again
-            i = 2
+            i = 1
             next_c = self.get_char_at(i)
             while next_c and next_c != rstring_delimiter:
                 i += 1
@@ -259,18 +276,9 @@ class JSONParser:
                 )
                 doubled_quotes = True
                 self.index += 1
-        char = self.get_char_at()
-        if char != lstring_delimiter:
-            self.log(
-                "While parsing a string, we found no starting quote, adding it", "info"
-            )
-            self.insert_char_at(lstring_delimiter)
-            fixed_quotes = True
-        else:
-            self.index += 1
-        # Start position of the string (to use later in the return value)
-        start = self.index
+        # Initialize our return value
+        string_acc = ""
         # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
         # In that case we need to use the ":|,|}" characters as terminators of the string
@@ -280,22 +288,25 @@ class JSONParser:
         # * If we are fixing missing quotes in an object, when it finds the special terminators
         char = self.get_char_at()
         while char and char != rstring_delimiter:
-            if fixed_quotes:
+            if missing_quotes:
                 if self.get_context() == "object_key" and (
                     char == ":" or char.isspace()
                 ):
                     break
                 elif self.get_context() == "object_value" and char in [",", "}"]:
                     break
+            string_acc += char
             self.index += 1
             char = self.get_char_at()
             # If the string contains an escaped character we should respect that or remove the escape
             if self.get_char_at(-1) == "\\":
                 if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
+                    string_acc += char
                     self.index += 1
                     char = self.get_char_at()
                 else:
-                    self.remove_char_at(-1)
+                    # Remove this character from the final output
+                    string_acc = string_acc[:-2] + string_acc[-1:]
                     self.index -= 1
             # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
             if char == rstring_delimiter:
@@ -305,8 +316,6 @@ class JSONParser:
                         "While parsing a string, we found a doubled quote, ignoring it",
                         "info",
                     )
-                    # self destruct this character
-                    self.remove_char_at()
                 else:
                     # Check if eventually there is a rstring delimiter, otherwise we bail
                     i = 1
@@ -343,12 +352,13 @@ class JSONParser:
                                 "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                 "info",
                             )
+                            string_acc += char
                             self.index += 1
                             char = self.get_char_at()
         if (
             char
-            and fixed_quotes
+            and missing_quotes
             and self.get_context() == "object_key"
             and char.isspace()
         ):
@@ -360,19 +370,16 @@ class JSONParser:
             if self.get_char_at() not in [":", ","]:
                 return ""
-        end = self.index
         # A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
         if char != rstring_delimiter:
             self.log(
-                "While parsing a string, we missed the closing quote, adding it back",
+                "While parsing a string, we missed the closing quote, ignoring",
                 "info",
             )
-            self.insert_char_at(rstring_delimiter)
         else:
             self.index += 1
-        return self.json_str[start:end].rstrip()
+        return string_acc.rstrip()
     def parse_number(self) -> Union[float, int, str]:
         # <number> is a valid real number expressed in one of a number of given formats
@@ -395,51 +402,57 @@ class JSONParser:
             except ValueError:
                 return number_str
         else:
-            # This is a string then
-            return self.parse_string()
+            # If nothing works, let's skip and keep parsing
+            return self.parse_json()
     def parse_boolean_or_null(self) -> Union[bool, str, None]:
         # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
-        boolean_map = {"true": (True, 4), "false": (False, 5), "null": (None, 4)}
-        for key, (value, length) in boolean_map.items():
-            if self.json_str.lower().startswith(key, self.index):
-                self.index += length
-                return value
+        starting_index = self.index
+        value = ""
+        char = self.get_char_at().lower()
+        if char == "t":
+            value = ("true", True)
+        elif char == "f":
+            value = ("false", False)
+        elif char == "n":
+            value = ("null", None)
+        if len(value):
+            i = 0
+            while char and i < len(value[0]) and char == value[0][i]:
+                i += 1
+                self.index += 1
+                char = self.get_char_at().lower()
+            if i == len(value[0]):
+                return value[1]
-        # This is a string then
-        return self.parse_string()
-    def insert_char_at(self, char: str) -> None:
-        self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
-        self.index += 1
+        # If nothing works reset the index before returning
+        self.index = starting_index
+        return ""
     def get_char_at(self, count: int = 0) -> Union[str, bool]:
-        # Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
-        try:
-            return self.json_str[self.index + count]
-        except IndexError:
-            return False
-    def remove_char_at(self, count: int = 0) -> None:
-        self.json_str = (
-            self.json_str[: self.index + count]
-            + self.json_str[self.index + count + 1 :]
-        )
+        if self.json_fd:
+            self.json_fd.seek(self.index + count)
+            char = self.json_fd.read(1)
+            if char == "":
+                return False
+            return char
+        else:
+            # Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
+            try:
+                return self.json_str[self.index + count]
+            except IndexError:
+                return False
     def skip_whitespaces_at(self) -> None:
-        # Remove trailing spaces
-        # I'd rather not do this BUT this method is called so many times that it makes sense to expand get_char_at
-        # At least this is what the profiler said and I believe in our lord and savior the profiler
-        try:
-            char = self.json_str[self.index]
-        except IndexError:
-            return
+        """
+        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
+        """
+        char = self.get_char_at()
         while char and char.isspace():
             self.index += 1
-            try:
-                char = self.json_str[self.index]
-            except IndexError:
-                return
+            char = self.get_char_at()
     def set_context(self, value: str) -> None:
         # If a value is provided update the context variable and save in stack
@@ -460,23 +473,31 @@ class JSONParser:
     def log(self, text: str, level: str) -> None:
         if level == self.logger["log_level"]:
+            context = ""
+            if self.json_fd:
+                self.json_fd.seek(self.index - self.logger["window"])
+                context = self.json_fd.read(self.logger["window"] * 2)
+                self.json_fd.seek(self.index)
+            else:
+                context = self.json_str[
+                    self.index
+                    - self.logger["window"] : self.index
+                    + self.logger["window"]
+                ]
             self.logger["log"].append(
                 {
                     "text": text,
-                    "context": self.json_str[
-                        self.index
-                        - self.logger["window"] : self.index
-                        + self.logger["window"]
-                    ],
+                    "context": context,
                 }
             )
 def repair_json(
-    json_str: str,
+    json_str: str = "",
     return_objects: bool = False,
     skip_json_loads: bool = False,
     logging: bool = False,
+    json_fd: TextIO = None,
 ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
     """
     Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -485,13 +506,15 @@ def repair_json(
     When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
     When `logging=True` is passed, it will return an tuple with the repaired json and a log of all repair actions
     """
-    json_str = json_str.strip().lstrip("```json")
-    parser = JSONParser(json_str, logging)
+    parser = JSONParser(json_str, json_fd, logging)
     if skip_json_loads:
         parsed_json = parser.parse()
     else:
         try:
-            parsed_json = json.loads(json_str)
+            if json_fd:
+                parsed_json = json.load(json_fd)
+            else:
+                parsed_json = json.loads(json_str)
         except json.JSONDecodeError:
             parsed_json = parser.parse()
     # It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
@@ -507,18 +530,30 @@ def loads(
     This function works like `json.loads()` except that it will fix your JSON in the process.
     It is a wrapper around the `repair_json()` function with `return_objects=True`.
     """
-    return repair_json(json_str, True, skip_json_loads, logging)
+    return repair_json(
+        json_str=json_str,
+        return_objects=True,
+        skip_json_loads=skip_json_loads,
+        logging=logging,
+    )
 def load(
-    fp: TextIO, skip_json_loads: bool = False, logging: bool = False
+    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
 ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
-    return loads(fp.read(), skip_json_loads, logging)
+    """
+    This function works like `json.load()` except that it will fix your JSON in the process.
+    It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
+    """
+    return repair_json(json_fd=fd, skip_json_loads=skip_json_loads, logging=logging)
 def from_file(
     filename: str, skip_json_loads: bool = False, logging: bool = False
 ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
+    """
+    This function is a wrapper around `load()` so you can pass the filename as string
+    """
     fd = open(filename)
     jsonobj = load(fd, skip_json_loads, logging)
     fd.close()

{json_repair-0.16.3 → json_repair-0.17.0/src/json_repair.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: json_repair
-Version: 0.16.3
+Version: 0.17.0
 Summary: A package to repair broken json strings
 Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
 License: MIT License

json-repair 0.16.3__tar.gz → 0.17.0__tar.gz

json-repair 0.16.3__tar.gz → 0.17.0__tar.gz